From: Carsten Stollmaier <stollmc@amazon.com>
To: Sean Christopherson <seanjc@google.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>, <x86@kernel.org>,
"H. Peter Anvin" <hpa@zytor.com>
Cc: <nh-open-source@amazon.com>,
Carsten Stollmaier <stollmc@amazon.com>,
David Woodhouse <dwmw2@infradead.org>,
Peter Xu <peterx@redhat.com>,
Sebastian Biemueller <sbiemue@amazon.de>, <kvm@vger.kernel.org>,
<linux-kernel@vger.kernel.org>
Subject: [PATCH] KVM: x86: Use gfn_to_pfn_cache for steal_time
Date: Fri, 2 Aug 2024 11:44:01 +0000 [thread overview]
Message-ID: <20240802114402.96669-1-stollmc@amazon.com> (raw)
On vcpu_run, before entering the guest, the update of the steal time
information causes a page-fault if the page is not present. In our
scenario, this gets handled by do_user_addr_fault and subsequently by
handle_userfault, since we have the region registered with userfaultfd.
handle_userfault uses TASK_INTERRUPTIBLE, so it is interruptible by
signals. do_user_addr_fault then busy-retries the fault if the pending
signal is non-fatal. This leads to contention on the mmap_lock.
Replace the use of gfn_to_hva_cache with gfn_to_pfn_cache, as
gfn_to_pfn_cache ensures that the page is present for the memory
access, preventing the contention on the mmap_lock.
Signed-off-by: Carsten Stollmaier <stollmc@amazon.com>
CC: David Woodhouse <dwmw2@infradead.org>
CC: Sean Christopherson <seanjc@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Peter Xu <peterx@redhat.com>
CC: Sebastian Biemueller <sbiemue@amazon.de>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/x86.c | 115 +++++++++++++++-----------------
2 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..63d0c0cd7a8e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -898,7 +898,7 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_hva_cache cache;
+ struct gfn_to_pfn_cache cache;
} st;
u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af6c8cf6a37a..2b8adbadfc50 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3652,10 +3652,8 @@ EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
u64 steal;
u32 version;
@@ -3670,42 +3668,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ read_lock(&gpc->lock);
+ while (!kvm_gpc_check(gpc, sizeof(*st))) {
/* We rely on the fact that it fits in a single page. */
BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ read_unlock(&gpc->lock);
+
+ if (kvm_gpc_refresh(gpc, sizeof(*st)))
return;
+
+ read_lock(&gpc->lock);
}
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
- u8 st_preempted = 0;
- int err = -EFAULT;
-
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- asm volatile("1: xchgb %0, %2\n"
- "xor %1, %1\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "+q" (st_preempted),
- "+&r" (err),
- "+m" (st->preempted));
- if (err)
- goto out;
-
- user_access_end();
+ u8 st_preempted = xchg(&st->preempted, 0);
vcpu->arch.st.preempted = 0;
@@ -3713,39 +3695,32 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
-
- if (!user_access_begin(st, sizeof(*st)))
- goto dirty;
} else {
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- unsafe_put_user(0, &st->preempted, out);
+ st->preempted = 0;
vcpu->arch.st.preempted = 0;
}
- unsafe_get_user(version, &st->version, out);
+ version = st->version;
if (version & 1)
version += 1; /* first time write, random junk */
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
smp_wmb();
- unsafe_get_user(steal, &st->steal, out);
+ steal = st->steal;
steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- unsafe_put_user(steal, &st->steal, out);
+ st->steal = steal;
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
- out:
- user_access_end();
- dirty:
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ read_unlock(&gpc->lock);
}
static bool kvm_is_msr_to_save(u32 msr_index)
@@ -4020,8 +3995,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.st.msr_val = data;
- if (!(data & KVM_MSR_ENABLED))
- break;
+ if (data & KVM_MSR_ENABLED) {
+ kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED,
+ sizeof(struct kvm_steal_time));
+ } else {
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ }
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
@@ -5051,11 +5030,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
static const u8 preempted = KVM_VCPU_PREEMPTED;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ unsigned long flags;
/*
* The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5080,20 +5058,28 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (unlikely(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(*st)))
+ goto out_unlock_gpc;
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
- vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ st->preempted = preempted;
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ kvm_gpc_mark_dirty_in_slot(gpc);
+
+out_unlock_gpc:
+ read_unlock_irqrestore(&gpc->lock, flags);
+}
+
+static void kvm_steal_time_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ vcpu->arch.st.preempted = 0;
+ vcpu->arch.st.msr_val = 0;
+ vcpu->arch.st.last_steal = 0;
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -12219,6 +12205,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.st.cache, vcpu->kvm);
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -12331,6 +12319,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_steal_time_reset(vcpu);
+
kvmclock_reset(vcpu);
kvm_x86_call(vcpu_free)(vcpu);
@@ -12401,7 +12391,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_en_val = 0;
vcpu->arch.apf.msr_int_val = 0;
- vcpu->arch.st.msr_val = 0;
+
+ kvm_steal_time_reset(vcpu);
kvmclock_reset(vcpu);
--
2.40.1
Amazon Web Services Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
next reply other threads:[~2024-08-02 11:44 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-02 11:44 Carsten Stollmaier [this message]
2024-08-02 12:03 ` [PATCH] KVM: x86: Use gfn_to_pfn_cache for steal_time David Woodhouse
2024-08-02 12:38 ` Matthew Wilcox
2024-08-02 12:53 ` David Woodhouse
2024-08-02 12:56 ` David Woodhouse
2024-08-02 16:06 ` David Woodhouse
2024-08-02 22:40 ` Peter Xu
2024-08-03 8:35 ` David Woodhouse
2024-08-04 13:31 ` Peter Xu
2024-08-17 0:22 ` Sean Christopherson
2024-08-20 10:11 ` David Woodhouse
2025-07-29 10:28 ` David Woodhouse
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240802114402.96669-1-stollmc@amazon.com \
--to=stollmc@amazon.com \
--cc=bp@alien8.de \
--cc=dave.hansen@linux.intel.com \
--cc=dwmw2@infradead.org \
--cc=hpa@zytor.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=nh-open-source@amazon.com \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=sbiemue@amazon.de \
--cc=seanjc@google.com \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox