* [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
@ 2024-10-18 10:08 Bernhard Kauer
2024-10-21 7:43 ` Chao Gao
2024-10-22 17:32 ` Sean Christopherson
0 siblings, 2 replies; 10+ messages in thread
From: Bernhard Kauer @ 2024-10-18 10:08 UTC (permalink / raw)
To: kvm; +Cc: Bernhard Kauer
It used a static key to avoid loading the lapic pointer from
the vcpu->arch structure. However, in the common case the load
is from a hot cacheline and the CPU should be able to perfectly
predict it. Thus there is no upside of this premature optimization.
The downside is that code patching including an IPI to all CPUs
is required whenever the first VM without an lapic is created or
the last is destroyed.
Signed-off-by: Bernhard Kauer <bk@alpico.io>
---
arch/x86/kvm/lapic.c | 4 ----
arch/x86/kvm/lapic.h | 6 +-----
arch/x86/kvm/x86.c | 8 --------
3 files changed, 1 insertion(+), 17 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2098dc689088..0c75b3cc01f2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -135,8 +135,6 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
@@ -2518,7 +2516,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
if (!vcpu->arch.apic) {
- static_branch_dec(&kvm_has_noapic_vcpu);
return;
}
@@ -2864,7 +2861,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
ASSERT(vcpu != NULL);
if (!irqchip_in_kernel(vcpu->kvm)) {
- static_branch_inc(&kvm_has_noapic_vcpu);
return 0;
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1b8ef9856422..157af18c9fc8 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -179,13 +179,9 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
return __kvm_lapic_get_reg(apic->regs, reg_off);
}
-DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-
static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
- if (static_branch_unlikely(&kvm_has_noapic_vcpu))
- return vcpu->arch.apic;
- return true;
+ return vcpu->arch.apic;
}
extern struct static_key_false_deferred apic_hw_disabled;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146f..ca73cae31f53 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9828,8 +9828,6 @@ void kvm_x86_vendor_exit(void)
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
#endif
- kvm_lapic_exit();
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
@@ -14015,9 +14013,3 @@ static int __init kvm_x86_init(void)
return 0;
}
module_init(kvm_x86_init);
-
-static void __exit kvm_x86_exit(void)
-{
- WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
-}
-module_exit(kvm_x86_exit);
--
2.45.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-18 10:08 [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization Bernhard Kauer
@ 2024-10-21 7:43 ` Chao Gao
2024-10-22 17:32 ` Sean Christopherson
1 sibling, 0 replies; 10+ messages in thread
From: Chao Gao @ 2024-10-21 7:43 UTC (permalink / raw)
To: Bernhard Kauer; +Cc: kvm
The shortlog should be:
KVM: x86: Drop the kvm_has_noapic_vcpu optimization
see https://docs.kernel.org/process/maintainer-kvm-x86.html#shortlog
On Fri, Oct 18, 2024 at 12:08:45PM +0200, Bernhard Kauer wrote:
>It used a static key to avoid loading the lapic pointer from
>the vcpu->arch structure. However, in the common case the load
>is from a hot cacheline and the CPU should be able to perfectly
>predict it. Thus there is no upside of this premature optimization.
>
>The downside is that code patching including an IPI to all CPUs
>is required whenever the first VM without an lapic is created or
>the last is destroyed.
>
>Signed-off-by: Bernhard Kauer <bk@alpico.io>
>---
> arch/x86/kvm/lapic.c | 4 ----
> arch/x86/kvm/lapic.h | 6 +-----
> arch/x86/kvm/x86.c | 8 --------
> 3 files changed, 1 insertion(+), 17 deletions(-)
>
>diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>index 2098dc689088..0c75b3cc01f2 100644
>--- a/arch/x86/kvm/lapic.c
>+++ b/arch/x86/kvm/lapic.c
>@@ -135,8 +135,6 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
> return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
> }
>
>-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
>-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
>
> __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
> __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
>@@ -2518,7 +2516,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
> struct kvm_lapic *apic = vcpu->arch.apic;
>
> if (!vcpu->arch.apic) {
>- static_branch_dec(&kvm_has_noapic_vcpu);
> return;
> }
remove the curly brackets, i.e., just
if (!vcpu->arch.apic)
return;
>
>@@ -2864,7 +2861,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
> ASSERT(vcpu != NULL);
>
> if (!irqchip_in_kernel(vcpu->kvm)) {
>- static_branch_inc(&kvm_has_noapic_vcpu);
> return 0;
> }
ditto
>
>diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>index 1b8ef9856422..157af18c9fc8 100644
>--- a/arch/x86/kvm/lapic.h
>+++ b/arch/x86/kvm/lapic.h
>@@ -179,13 +179,9 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
> return __kvm_lapic_get_reg(apic->regs, reg_off);
> }
>
>-DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
>-
> static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
> {
>- if (static_branch_unlikely(&kvm_has_noapic_vcpu))
>- return vcpu->arch.apic;
>- return true;
>+ return vcpu->arch.apic;
> }
>
> extern struct static_key_false_deferred apic_hw_disabled;
>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index 83fe0a78146f..ca73cae31f53 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -9828,8 +9828,6 @@ void kvm_x86_vendor_exit(void)
> if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
> clear_hv_tscchange_cb();
> #endif
>- kvm_lapic_exit();
>-
why is this call removed?
> if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
> cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
> CPUFREQ_TRANSITION_NOTIFIER);
>@@ -14015,9 +14013,3 @@ static int __init kvm_x86_init(void)
> return 0;
> }
> module_init(kvm_x86_init);
>-
>-static void __exit kvm_x86_exit(void)
>-{
>- WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
>-}
>-module_exit(kvm_x86_exit);
>--
>2.45.2
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-18 10:08 [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization Bernhard Kauer
2024-10-21 7:43 ` Chao Gao
@ 2024-10-22 17:32 ` Sean Christopherson
2024-10-22 19:08 ` Bernhard Kauer
1 sibling, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2024-10-22 17:32 UTC (permalink / raw)
To: Bernhard Kauer; +Cc: kvm
On Fri, Oct 18, 2024, Bernhard Kauer wrote:
> It used a static key to avoid loading the lapic pointer from
> the vcpu->arch structure. However, in the common case the load
> is from a hot cacheline and the CPU should be able to perfectly
> predict it. Thus there is no upside of this premature optimization.
Do you happen to have performance numbers? I've been itching for an excuse to
excise this code for a few years now, the only reason I haven't ripped it out is
because I didn't want to do so without numbers to back up my claim that it's a
premature optimization.
In other words, I agree with your analysis, but I don't want to yank out the code
without at least _some_ numbers to back up the claim, because then we're essentially
committing the same crime of optimizing without measuring.
> The downside is that code patching including an IPI to all CPUs
> is required whenever the first VM without an lapic is created or
> the last is destroyed.
In practice, this almost never happens though. Do you have a use case for
creating VMs without in-kernel local APICs?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-22 17:32 ` Sean Christopherson
@ 2024-10-22 19:08 ` Bernhard Kauer
2024-10-31 16:24 ` Sean Christopherson
0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Kauer @ 2024-10-22 19:08 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Bernhard Kauer, kvm
On Tue, Oct 22, 2024 at 10:32:59AM -0700, Sean Christopherson wrote:
> On Fri, Oct 18, 2024, Bernhard Kauer wrote:
> > It used a static key to avoid loading the lapic pointer from
> > the vcpu->arch structure. However, in the common case the load
> > is from a hot cacheline and the CPU should be able to perfectly
> > predict it. Thus there is no upside of this premature optimization.
>
> Do you happen to have performance numbers?
Sure. I have some preliminary numbers as I'm still optimizing the
round-trip time for tiny virtual machines.
A hello-world micro benchmark on my AMD 6850U needs at least 331us. With
the static keys it requires 579us. That is a 75% increase.
Take the absolute values with a grain of salt as not all of my patches might
be applicable to the general case.
For the other side I don't have a relevant benchmark yet. But I doubt you
would see anything even with a very high IRQ rate.
> > The downside is that code patching including an IPI to all CPUs
> > is required whenever the first VM without an lapic is created or
> > the last is destroyed.
>
> In practice, this almost never happens though. Do you have a use case for
> creating VMs without in-kernel local APICs?
I switched from "full irqchip" to "no irqchip" due to a significant
performance gain and the simplicity it promised.
I might have to go to "split irqchip" mode for performance reasons but I
didn't had time to look into it yet.
So in the end I assume it will be a trade-off: Do I want to rely on these
3000 lines of kernel code to gain an X% performance increase, or not?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-22 19:08 ` Bernhard Kauer
@ 2024-10-31 16:24 ` Sean Christopherson
2024-10-31 20:08 ` Bernhard Kauer
0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2024-10-31 16:24 UTC (permalink / raw)
To: Bernhard Kauer; +Cc: kvm
On Tue, Oct 22, 2024, Bernhard Kauer wrote:
> On Tue, Oct 22, 2024 at 10:32:59AM -0700, Sean Christopherson wrote:
> > On Fri, Oct 18, 2024, Bernhard Kauer wrote:
> > > It used a static key to avoid loading the lapic pointer from
> > > the vcpu->arch structure. However, in the common case the load
> > > is from a hot cacheline and the CPU should be able to perfectly
> > > predict it. Thus there is no upside of this premature optimization.
> >
> > Do you happen to have performance numbers?
>
> Sure. I have some preliminary numbers as I'm still optimizing the
> round-trip time for tiny virtual machines.
>
> A hello-world micro benchmark on my AMD 6850U needs at least 331us. With
> the static keys it requires 579us. That is a 75% increase.
For the first VM only though, correct?
> Take the absolute values with a grain of salt as not all of my patches might
> be applicable to the general case.
>
> For the other side I don't have a relevant benchmark yet. But I doubt you
> would see anything even with a very high IRQ rate.
>
>
> > > The downside is that code patching including an IPI to all CPUs
> > > is required whenever the first VM without an lapic is created or
> > > the last is destroyed.
> >
> > In practice, this almost never happens though. Do you have a use case for
> > creating VMs without in-kernel local APICs?
>
> I switched from "full irqchip" to "no irqchip" due to a significant
> performance gain
Signifcant performance gain for what path? I'm genuinely curious. Unless your
VM doesn't need a timer and doesn't need interrupts of any kind, emulating the
local APIC in userspace is going to be much less performant.
> and the simplicity it promised.
Similar to above, unless you are not emulating a local APIC anywhere, disabling
KVM's in-kernel local APIC isn't a meaningful change in overall complexity.
> I might have to go to "split irqchip" mode for performance reasons but I
> didn't had time to look into it yet.
>
> So in the end I assume it will be a trade-off: Do I want to rely on these
> 3000 lines of kernel code to gain an X% performance increase, or not?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-31 16:24 ` Sean Christopherson
@ 2024-10-31 20:08 ` Bernhard Kauer
2024-10-31 23:29 ` Sean Christopherson
0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Kauer @ 2024-10-31 20:08 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Bernhard Kauer, kvm
On Thu, Oct 31, 2024 at 09:24:29AM -0700, Sean Christopherson wrote:
> On Tue, Oct 22, 2024, Bernhard Kauer wrote:
> > > > It used a static key to avoid loading the lapic pointer from
> > > > the vcpu->arch structure. However, in the common case the load
> > > > is from a hot cacheline and the CPU should be able to perfectly
> > > > predict it. Thus there is no upside of this premature optimization.
> > >
> > > Do you happen to have performance numbers?
> >
> > Sure. I have some preliminary numbers as I'm still optimizing the
> > round-trip time for tiny virtual machines.
> >
> > A hello-world micro benchmark on my AMD 6850U needs at least 331us. With
> > the static keys it requires 579us. That is a 75% increase.
>
> For the first VM only though, correct?
That is right. If I keep one VM in the background the overhead is not
measureable anymore.
> > Take the absolute values with a grain of salt as not all of my patches might
> > be applicable to the general case.
> >
> > For the other side I don't have a relevant benchmark yet. But I doubt you
> > would see anything even with a very high IRQ rate.
> >
> >
> > > > The downside is that code patching including an IPI to all CPUs
> > > > is required whenever the first VM without an lapic is created or
> > > > the last is destroyed.
> > >
> > > In practice, this almost never happens though. Do you have a use case for
> > > creating VMs without in-kernel local APICs?
> >
> > I switched from "full irqchip" to "no irqchip" due to a significant
> > performance gain
>
> Signifcant performance gain for what path? I'm genuinely curious.
I have this really slow PREEMPT_RT kernel (Debian 6.11.4-rt-amd64).
The hello-world benchmark takes on average 100ms. With IRQCHIP it goes
up to 220ms. An strace gives 83ms for the extra ioctl:
ioctl(4, KVM_CREATE_IRQCHIP, 0) = 0 <0.083242>
My current theory is that RCU takes ages on this kernel. And creating an
IOAPIC uses SRCU to synchronize the bus array...
However, in my latest benchmark runs the overhead for IRQCHIP is down to 15
microseconds. So no big deal anymore.
> Unless your VM doesn't need a timer and doesn't need interrupts of
> any kind, emulating the local APIC in userspace is going to be much
> less performant.
Do you have any performance numbers?
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-31 20:08 ` Bernhard Kauer
@ 2024-10-31 23:29 ` Sean Christopherson
2024-11-01 8:55 ` Bernhard Kauer
0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2024-10-31 23:29 UTC (permalink / raw)
To: Bernhard Kauer; +Cc: kvm
On Thu, Oct 31, 2024, Bernhard Kauer wrote:
> > > > In practice, this almost never happens though. Do you have a use case for
> > > > creating VMs without in-kernel local APICs?
> > >
> > > I switched from "full irqchip" to "no irqchip" due to a significant
> > > performance gain
> >
> > Signifcant performance gain for what path? I'm genuinely curious.
>
> I have this really slow PREEMPT_RT kernel (Debian 6.11.4-rt-amd64).
> The hello-world benchmark takes on average 100ms. With IRQCHIP it goes
> up to 220ms. An strace gives 83ms for the extra ioctl:
>
> ioctl(4, KVM_CREATE_IRQCHIP, 0) = 0 <0.083242>
>
> My current theory is that RCU takes ages on this kernel. And creating an
> IOAPIC uses SRCU to synchronize the bus array...
>
> However, in my latest benchmark runs the overhead for IRQCHIP is down to 15
> microseconds. So no big deal anymore.
Assuming you're running a recent kernel, that's likely thanks to commit
fbe4a7e881d4 ("KVM: Setup empty IRQ routing when creating a VM").
> > Unless your VM doesn't need a timer and doesn't need interrupts of
> > any kind, emulating the local APIC in userspace is going to be much
> > less performant.
>
> Do you have any performance numbers?
Heh, nope. I actually tried to grab some, mostly out of curiosity again, but
recent (last few years) versions of QEMU don't even support a userspace APIC.
A single EOI is a great example though. On a remotely modern CPU, an in-kernel
APIC allows KVM to enable hardware acceleration so that the EOI is virtualized by
hardware, i.e. doesn't take a VM-Exit and so the latency is basically the same as
a native EOI (tens of cycles, maybe less).
With a userspace APIC, the roundtrip to userspace to emulate the EOI is measured
in tens of thousands of cycles. IIRC, last I played around with userspace exits
the average turnaround time was ~50k cycles.
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-10-31 23:29 ` Sean Christopherson
@ 2024-11-01 8:55 ` Bernhard Kauer
2024-11-01 14:49 ` Sean Christopherson
0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Kauer @ 2024-11-01 8:55 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Bernhard Kauer, kvm
On Thu, Oct 31, 2024 at 04:29:52PM -0700, Sean Christopherson wrote:
> > > Unless your VM doesn't need a timer and doesn't need interrupts of
> > > any kind, emulating the local APIC in userspace is going to be much
> > > less performant.
> >
> > Do you have any performance numbers?
>
> Heh, nope. I actually tried to grab some, mostly out of curiosity again, but
> recent (last few years) versions of QEMU don't even support a userspace APIC.
>
> A single EOI is a great example though. On a remotely modern CPU, an in-kernel
> APIC allows KVM to enable hardware acceleration so that the EOI is virtualized by
> hardware, i.e. doesn't take a VM-Exit and so the latency is basically the same as
> a native EOI (tens of cycles, maybe less).
>
> With a userspace APIC, the roundtrip to userspace to emulate the EOI is measured
> in tens of thousands of cycles. IIRC, last I played around with userspace exits
> the average turnaround time was ~50k cycles.
That sound a lot so I did some quick benchmarking. An exit is around 1400
TSC cycles on my AMD laptop, instruction emulation takes 1200 and going
to user-level needs at least 6200. Not terribly slow but still room for
optimizations.
INSTR
CPUID 1394
RDMSR 1550
MMIO
APIC 2609
IOAPIC 2800
HPET 9426
PIO
PIC 1804
UART 8011
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-11-01 8:55 ` Bernhard Kauer
@ 2024-11-01 14:49 ` Sean Christopherson
2024-11-15 9:12 ` Bernhard Kauer
0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2024-11-01 14:49 UTC (permalink / raw)
To: Bernhard Kauer; +Cc: kvm
On Fri, Nov 01, 2024, Bernhard Kauer wrote:
> On Thu, Oct 31, 2024 at 04:29:52PM -0700, Sean Christopherson wrote:
> > > > Unless your VM doesn't need a timer and doesn't need interrupts of
> > > > any kind, emulating the local APIC in userspace is going to be much
> > > > less performant.
> > >
> > > Do you have any performance numbers?
> >
> > Heh, nope. I actually tried to grab some, mostly out of curiosity again, but
> > recent (last few years) versions of QEMU don't even support a userspace APIC.
> >
> > A single EOI is a great example though. On a remotely modern CPU, an in-kernel
> > APIC allows KVM to enable hardware acceleration so that the EOI is virtualized by
> > hardware, i.e. doesn't take a VM-Exit and so the latency is basically the same as
> > a native EOI (tens of cycles, maybe less).
> >
> > With a userspace APIC, the roundtrip to userspace to emulate the EOI is measured
> > in tens of thousands of cycles. IIRC, last I played around with userspace exits
> > the average turnaround time was ~50k cycles.
>
>
> That sound a lot so I did some quick benchmarking. An exit is around 1400
> TSC cycles on my AMD laptop, instruction emulation takes 1200 and going
> to user-level needs at least 6200. Not terribly slow but still room for
> optimizations.
Ah, I suspect my recollection of ~50k cycles is from measuring all exits to
userspace, i.e. included the reaaaaly slow paths.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization
2024-11-01 14:49 ` Sean Christopherson
@ 2024-11-15 9:12 ` Bernhard Kauer
0 siblings, 0 replies; 10+ messages in thread
From: Bernhard Kauer @ 2024-11-15 9:12 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Alexander Graf, kvm
On Fri, Nov 01, 2024 at 07:49:10AM -0700, Sean Christopherson wrote:
> On Fri, Nov 01, 2024, Bernhard Kauer wrote:
> > On Thu, Oct 31, 2024 at 04:29:52PM -0700, Sean Christopherson wrote:
> > > With a userspace APIC, the roundtrip to userspace to emulate the EOI is measured
> > > in tens of thousands of cycles. IIRC, last I played around with userspace exits
> > > the average turnaround time was ~50k cycles.
> >
> >
> > That sound a lot so I did some quick benchmarking. An exit is around 1400
> > TSC cycles on my AMD laptop, instruction emulation takes 1200 and going
> > to user-level needs at least 6200. Not terribly slow but still room for
> > optimizations.
>
> Ah, I suspect my recollection of ~50k cycles is from measuring all exits to
> userspace, i.e. included the reaaaaly slow paths.
I finally found the reason for the slow user-level roundtrip on my Zen3+
machine.
Disabling SRSO with spec_rstack_overflow=off improves the user-level part
by 3x. The exit as well as the instruction emulation overhead is down by 40%.
Thus without SRSO a roundtrip to user-level needs roughly 2000 cycles.
SRSO=off default factor
INSTR CPUID 1008 1394 1.4x
RDMSR 1072 1550 1.4x
MMIO APIC 1666 2609 1.6x
IOAPIC 1783 2800 1.6x
HPET 3626 9426 2.6x
PIO PIC 1250 1804 1.4x
UART 2837 8011 2.8x
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-11-15 9:53 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-18 10:08 [PATCH] KVM: drop the kvm_has_noapic_vcpu optimization Bernhard Kauer
2024-10-21 7:43 ` Chao Gao
2024-10-22 17:32 ` Sean Christopherson
2024-10-22 19:08 ` Bernhard Kauer
2024-10-31 16:24 ` Sean Christopherson
2024-10-31 20:08 ` Bernhard Kauer
2024-10-31 23:29 ` Sean Christopherson
2024-11-01 8:55 ` Bernhard Kauer
2024-11-01 14:49 ` Sean Christopherson
2024-11-15 9:12 ` Bernhard Kauer
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox