* [RFC PATCH] KVM: optimize apic interrupt delivery
@ 2012-09-10 13:09 Gleb Natapov
2012-09-10 14:44 ` Michael S. Tsirkin
2012-09-10 15:09 ` Avi Kivity
0 siblings, 2 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-10 13:09 UTC (permalink / raw)
To: kvm; +Cc: avi, mtosatti, mst
Most interrupts are delivered to only one vcpu. Use pre-built tables to
find the interrupt destination instead of looping through all vcpus.
Signed-off-by: Gleb Natapov <gleb@redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 64adb61..121f308 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
+struct kvm_apic_map {
+ struct rcu_head rcu;
+ bool flat;
+ u8 ldr_bits;
+ struct kvm_lapic *phys_map[255];
+ struct kvm_lapic *logical_map[16][16];
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -528,6 +536,8 @@ struct kvm_arch {
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
int vapics_in_nmi_mode;
+ struct kvm_apic_map *apic_map;
+ struct mutex apic_map_lock;
unsigned int tss_addr;
struct page *apic_access_page;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 07ad628..d18ddd2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -139,11 +139,101 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
+static void rcu_free_apic_map(struct rcu_head *head)
+{
+ struct kvm_apic_map *p = container_of(head,
+ struct kvm_apic_map, rcu);
+
+ kfree(p);
+}
+
+static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
+ u16 *cid, u16 *lid)
+{
+ if (ldr_bits == 32) {
+ *cid = ldr >> 16;
+ *lid = ldr & 0xffff;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+
+ if (flat) {
+ *cid = 0;
+ *lid = ldr;
+ } else {
+ *cid = ldr >> 4;
+ *lid = ldr & 0xf;
+ }
+ }
+}
+
+static inline int recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+
+ if (!new)
+ return -ENOMEM;
+
+ new->ldr_bits = 8;
+ new->flat = true;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ u16 cid, lid;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (apic_x2apic_mode(apic)) {
+ new->ldr_bits = 32;
+ new->flat = false;
+ } else if (kvm_apic_sw_enabled(apic) && new->flat &&
+ kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER)
+ new->flat = false;
+
+ new->phys_map[kvm_apic_id(apic)] = apic;
+ kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
+ new->flat, new->ldr_bits, &cid, &lid);
+
+ if (lid)
+ new->logical_map[cid][ffs(lid) - 1] = apic;
+ }
+ mutex_lock(&kvm->arch.apic_map_lock);
+ old = kvm->arch.apic_map;
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ call_rcu(&old->rcu, rcu_free_apic_map);
+
+ return 0;
+}
+
+static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+{
+ apic_set_reg(apic, APIC_ID, id << 24);
+ return recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ apic_set_reg(apic, APIC_LDR, id);
+ return recalculate_apic_map(apic->vcpu->kvm);
+}
+
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
@@ -193,11 +283,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_LVR, v);
}
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
return result;
}
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r)
+{
+ unsigned long bitmap = 1;
+ struct kvm_lapic **dst;
+ int i;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq);
+ return true;
+ }
+
+ if (irq->shorthand)
+ return false;
+
+ if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->dest_id == 0xff)
+ return false;
+ rcu_read_lock();
+ dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff];
+ } else {
+ u16 cid, lid;
+ u32 mda = irq->dest_id;
+
+ rcu_read_lock();
+ if (kvm->arch.apic_map->ldr_bits == 8)
+ mda <<= 24;
+
+ kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat,
+ kvm->arch.apic_map->ldr_bits, &cid, &lid);
+ dst = kvm->arch.apic_map->logical_map[cid];
+
+ bitmap = lid;
+ if (irq->delivery_mode == APIC_DM_LOWEST) {
+ int l = -1;
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (l < 0)
+ l = i;
+ else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+ l = i;
+ }
+
+ bitmap = (l >= 0) ? 1 << l : 0;
+ }
+ }
+
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
+ }
+
+ rcu_read_unlock();
+ return true;
+}
+
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
@@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_ID, val);
+ kvm_apic_set_id(apic, val >> 24);
else
ret = 1;
break;
@@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LDR:
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
else
ret = 1;
break;
case APIC_DFR:
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- else
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
ret = 1;
break;
@@ -1135,6 +1282,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
static_key_slow_dec_deferred(&apic_hw_disabled);
else
static_key_slow_inc(&apic_hw_disabled.key);
+ recalculate_apic_map(vcpu->kvm);
}
if (!kvm_vcpu_is_bsp(apic->vcpu))
@@ -1144,7 +1292,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (apic_x2apic_mode(apic)) {
u32 id = kvm_apic_id(apic);
u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
- apic_set_reg(apic, APIC_LDR, ldr);
+ kvm_apic_set_ldr(apic, ldr);
}
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -1169,7 +1317,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+ kvm_apic_set_id(apic, (u8)vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1180,7 +1328,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
+ kvm_apic_set_ldr(apic, 0);
apic_set_reg(apic, APIC_ESR, 0);
apic_set_reg(apic, APIC_ICR, 0);
apic_set_reg(apic, APIC_ICR2, 0);
@@ -1398,6 +1546,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+ /* call kvm_apic_set_id() to put apic into apic_map */
+ kvm_apic_set_id(apic, kvm_apic_id(apic));
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 615a8b0..5ce084e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r);
+
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
@@ -120,5 +123,4 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
}
-
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f91e2c9..db6db8f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6269,6 +6269,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
return 0;
}
@@ -6319,6 +6320,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kfree(kvm->arch.apic_map);
}
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 7118be0..3ca89c4 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq))
+ kvm_is_dm_lowest_prio(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+ return r;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
--
Gleb.
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 13:09 [RFC PATCH] KVM: optimize apic interrupt delivery Gleb Natapov
@ 2012-09-10 14:44 ` Michael S. Tsirkin
2012-09-10 16:17 ` Gleb Natapov
2012-09-10 15:09 ` Avi Kivity
1 sibling, 1 reply; 17+ messages in thread
From: Michael S. Tsirkin @ 2012-09-10 14:44 UTC (permalink / raw)
To: Gleb Natapov; +Cc: kvm, avi, mtosatti
On Mon, Sep 10, 2012 at 04:09:15PM +0300, Gleb Natapov wrote:
> Most interrupt are delivered to only one vcpu. Use pre-build tables to
> find interrupt destination instead of looping through all vcpus.
>
> Signed-off-by: Gleb Natapov <gleb@redhat.com>
Looks good overall. I think I see some bugs, with rcu use and others.
The most suspicious thing is that the code seems to use rcu but
there are no calls to rcu_dereference anywhere.
Pls see below.
Thanks for looking into this, once ready I intend to rework
direct injection to work on top of this.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 64adb61..121f308 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> };
>
> +struct kvm_apic_map {
> + struct rcu_head rcu;
> + bool flat;
> + u8 ldr_bits;
> + struct kvm_lapic *phys_map[255];
Not 256? apic id is sometimes used in the lookup - is it guaranteed
that the guest can not set it to 0xff? If yes this will overflow.
> + struct kvm_lapic *logical_map[16][16];
These are large arrays: 2Kbyte each one, which is bad
for cache.
Is it not better to have vcpu index stored there?
that would reduce cache footprint by x8.
> +};
> +
> struct kvm_arch {
> unsigned int n_used_mmu_pages;
> unsigned int n_requested_mmu_pages;
> @@ -528,6 +536,8 @@ struct kvm_arch {
> struct kvm_ioapic *vioapic;
> struct kvm_pit *vpit;
> int vapics_in_nmi_mode;
> + struct kvm_apic_map *apic_map;
> + struct mutex apic_map_lock;
>
> unsigned int tss_addr;
> struct page *apic_access_page;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 07ad628..d18ddd2 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -139,11 +139,101 @@ static inline int apic_enabled(struct kvm_lapic *apic)
> (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>
> +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> +{
> + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> +}
> +
> +
two empty lines not needed
> static inline int kvm_apic_id(struct kvm_lapic *apic)
> {
> return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> }
>
> +static void rcu_free_apic_map(struct rcu_head *head)
> +{
> + struct kvm_apic_map *p = container_of(head,
> + struct kvm_apic_map, rcu);
why break this line? it is not too long as is.
> +
> + kfree(p);
> +}
> +
> +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> + u16 *cid, u16 *lid)
> +{
> + if (ldr_bits == 32) {
> + *cid = ldr >> 16;
> + *lid = ldr & 0xffff;
> + } else {
> + ldr = GET_APIC_LOGICAL_ID(ldr);
> +
> + if (flat) {
> + *cid = 0;
> + *lid = ldr;
> + } else {
> + *cid = ldr >> 4;
> + *lid = ldr & 0xf;
> + }
> + }
> +}
> +
> +static inline int recalculate_apic_map(struct kvm *kvm)
__must_check?
> +{
> + struct kvm_apic_map *new, *old = NULL;
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> +
empty line not needed here :)
> + if (!new)
> + return -ENOMEM;
> +
> + new->ldr_bits = 8;
> + new->flat = true;
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + u16 cid, lid;
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (apic_x2apic_mode(apic)) {
> + new->ldr_bits = 32;
> + new->flat = false;
> + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER)
> + new->flat = false;
> +
> + new->phys_map[kvm_apic_id(apic)] = apic;
> + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> + new->flat, new->ldr_bits, &cid, &lid);
> +
> + if (lid)
> + new->logical_map[cid][ffs(lid) - 1] = apic;
> + }
> + mutex_lock(&kvm->arch.apic_map_lock);
> + old = kvm->arch.apic_map;
rcu_dereference_protected?
> + rcu_assign_pointer(kvm->arch.apic_map, new);
> + mutex_unlock(&kvm->arch.apic_map_lock);
> +
> + if (old)
> + call_rcu(&old->rcu, rcu_free_apic_map);
What guarantees rcu_free_apic_map is called before module goes away?
I suspect we need at least rcu_barrier here:
https://lwn.net/Articles/217484/
Also, this is done upon guest request?
This allocates 2K and delays free until next rcu grace
period. Guest can buffer up *a lot of* host kernel
memory before this happens - looks like a DOS potential?
Maybe simply synchronize_rcu here? Or if that looks too
aggressive, adding this to some per-kvm pending list,
and call rcu_barrier once that list is too large.
> +
> + return 0;
> +}
> +
> +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> +{
> + apic_set_reg(apic, APIC_ID, id << 24);
> + return recalculate_apic_map(apic->vcpu->kvm);
> +}
> +
> +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> +{
> + apic_set_reg(apic, APIC_LDR, id);
> + return recalculate_apic_map(apic->vcpu->kvm);
> +}
> +
return value of these functions seems never checked.
> static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
> {
> return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
> @@ -193,11 +283,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> apic_set_reg(apic, APIC_LVR, v);
> }
>
> -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> -{
> - return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> -}
> -
> static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
> LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
> LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> @@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
> return result;
> }
>
> +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> + struct kvm_lapic_irq *irq, int *r)
> +{
> + unsigned long bitmap = 1;
> + struct kvm_lapic **dst;
> + int i;
> +
> + if (irq->shorthand == APIC_DEST_SELF) {
> + *r = kvm_apic_set_irq(src->vcpu, irq);
> + return true;
> + }
> +
> + if (irq->shorthand)
> + return false;
> +
> + if (irq->dest_mode == 0) { /* physical mode */
> + if (irq->delivery_mode == APIC_DM_LOWEST ||
> + irq->dest_id == 0xff)
> + return false;
> + rcu_read_lock();
> + dst = &kvm->arch.apic_map
rcu_dereference?
Also, how do we know apic_map is not NULL at this point?
>->phys_map[irq->dest_id & 0xff];
I went back and forth and I think I see what is going
on here. bitmap is 1, this way for_each_set_bit below
triggers with value 0. and we get &dst[0] which is same
as dst. And the read lock here is paired with the unlock outside the if.
So it looks correct.
But IMHO this is just trying too hard to reduce duplication,
it confused at least this reader. Is this equivalent to:
rcu_read_lock();
dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff];
*r += kvm_apic_set_irq(dst->vcpu, irq);
rcu_read_unlock();
return true;
?
If yes would be much clearer this way.
And the long code section below will also cleaner
because nesting is more shallow.
> + } else {
> + u16 cid, lid;
> + u32 mda = irq->dest_id;
> +
> + rcu_read_lock();
> + if (kvm->arch.apic_map->ldr_bits == 8)
rcu_dereference?
Also, how do we know apic_map is not NULL at this point?
> + mda <<= 24;
> +
> + kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat,
> + kvm->arch.apic_map->ldr_bits, &cid, &lid);
> + dst = kvm->arch.apic_map->logical_map[cid];
> +
> + bitmap = lid;
> + if (irq->delivery_mode == APIC_DM_LOWEST) {
> + int l = -1;
> + for_each_set_bit(i, &bitmap, 16) {
constant 16 repeated here and below. to make it a bit prettier:
#define foreach_bit_in_apic_lid(bit, lid) for_each_set_bit(bit, &lid, 16)
?
> + if (!dst[i])
> + continue;
> + if (l < 0)
> + l = i;
> + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
> + l = i;
> + }
> +
> + bitmap = (l >= 0) ? 1 << l : 0;
We already know the destination here. Why encode it in bitmap and
rescan below? Rewriting bitmap like this confused at least this reader.
This will be equivalent, right?
if (l >= 0)
*r = kvm_apic_set_irq(dst[i]->vcpu, irq);
goto unlock;
If yes IMHO this is much more readable.
> + }
> + }
> +
> + for_each_set_bit(i, &bitmap, 16) {
I note that *r is never set if bitmap is 0
(this became apparent after rewrite as suggested above).
Looks like this will leak some kernel stack info to userspace.
> + if (!dst[i])
> + continue;
> + if (*r < 0)
> + *r = 0;
> + *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
> + }
> +
> + rcu_read_unlock();
> + return true;
> +}
> +
> /*
> * Add a pending IRQ into lapic.
> * Return 1 if successfully added and 0 if discarded.
> @@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> switch (reg) {
> case APIC_ID: /* Local APIC ID */
> if (!apic_x2apic_mode(apic))
> - apic_set_reg(apic, APIC_ID, val);
> + kvm_apic_set_id(apic, val >> 24);
return value discarded here.
> else
> ret = 1;
> break;
> @@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>
> case APIC_LDR:
> if (!apic_x2apic_mode(apic))
> - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
> + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
and here
> else
> ret = 1;
> break;
>
> case APIC_DFR:
> - if (!apic_x2apic_mode(apic))
> + if (!apic_x2apic_mode(apic)) {
> apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> - else
> + recalculate_apic_map(apic->vcpu->kvm);
return value discarded here.
> + } else
> ret = 1;
> break;
>
> @@ -1135,6 +1282,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> static_key_slow_dec_deferred(&apic_hw_disabled);
> else
> static_key_slow_inc(&apic_hw_disabled.key);
> + recalculate_apic_map(vcpu->kvm);
and here
> }
>
> if (!kvm_vcpu_is_bsp(apic->vcpu))
> @@ -1144,7 +1292,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> if (apic_x2apic_mode(apic)) {
> u32 id = kvm_apic_id(apic);
> u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
> - apic_set_reg(apic, APIC_LDR, ldr);
> + kvm_apic_set_ldr(apic, ldr);
and here
> }
> apic->base_address = apic->vcpu->arch.apic_base &
> MSR_IA32_APICBASE_BASE;
> @@ -1169,7 +1317,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> /* Stop the timer in case it's a reset to an active apic */
> hrtimer_cancel(&apic->lapic_timer.timer);
>
> - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
> + kvm_apic_set_id(apic, (u8)vcpu->vcpu_id);
and here
> kvm_apic_set_version(apic->vcpu);
>
> for (i = 0; i < APIC_LVT_NUM; i++)
> @@ -1180,7 +1328,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> apic_set_reg(apic, APIC_DFR, 0xffffffffU);
> apic_set_spiv(apic, 0xff);
> apic_set_reg(apic, APIC_TASKPRI, 0);
> - apic_set_reg(apic, APIC_LDR, 0);
> + kvm_apic_set_ldr(apic, 0);
and here
> apic_set_reg(apic, APIC_ESR, 0);
> apic_set_reg(apic, APIC_ICR, 0);
> apic_set_reg(apic, APIC_ICR2, 0);
> @@ -1398,6 +1546,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> /* set SPIV separately to get count of SW disabled APICs right */
> apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
> memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
> + /* call kvm_apic_set_id() to put apic into apic_map */
> + kvm_apic_set_id(apic, kvm_apic_id(apic));
and here
> kvm_apic_set_version(vcpu);
>
> apic_update_ppr(apic);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 615a8b0..5ce084e 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
> int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
> int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
>
> +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> + struct kvm_lapic_irq *irq, int *r);
> +
> u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
> void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
> void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> @@ -120,5 +123,4 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
> {
> return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
> }
> -
> #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f91e2c9..db6db8f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6269,6 +6269,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
>
> raw_spin_lock_init(&kvm->arch.tsc_write_lock);
> + mutex_init(&kvm->arch.apic_map_lock);
>
> return 0;
> }
> @@ -6319,6 +6320,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
> put_page(kvm->arch.apic_access_page);
> if (kvm->arch.ept_identity_pagetable)
> put_page(kvm->arch.ept_identity_pagetable);
> + kfree(kvm->arch.apic_map);
rcu_dereference_check(..., 1)
since apic_map is around until vm is destroyed
maybe really put it inline in arch?
> }
>
> void kvm_arch_free_memslot(struct kvm_memory_slot *free,
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 7118be0..3ca89c4 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
> struct kvm_vcpu *vcpu, *lowest = NULL;
>
> if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
> - kvm_is_dm_lowest_prio(irq))
> + kvm_is_dm_lowest_prio(irq)) {
> printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
> + irq->delivery_mode = APIC_DM_FIXED;
> + }
> +
> + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
> + return r;
>
> kvm_for_each_vcpu(i, vcpu, kvm) {
> if (!kvm_apic_present(vcpu))
> --
> Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 13:09 [RFC PATCH] KVM: optimize apic interrupt delivery Gleb Natapov
2012-09-10 14:44 ` Michael S. Tsirkin
@ 2012-09-10 15:09 ` Avi Kivity
2012-09-10 15:35 ` Gleb Natapov
1 sibling, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2012-09-10 15:09 UTC (permalink / raw)
To: Gleb Natapov; +Cc: kvm, mtosatti, mst
On 09/10/2012 04:09 PM, Gleb Natapov wrote:
> Most interrupt are delivered to only one vcpu. Use pre-build tables to
> find interrupt destination instead of looping through all vcpus.
>
> +
> +static inline int recalculate_apic_map(struct kvm *kvm)
> +{
> + struct kvm_apic_map *new, *old = NULL;
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> +
> + if (!new)
> + return -ENOMEM;
> +
> + new->ldr_bits = 8;
> + new->flat = true;
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + u16 cid, lid;
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (apic_x2apic_mode(apic)) {
> + new->ldr_bits = 32;
> + new->flat = false;
> + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER)
> + new->flat = false;
> +
> + new->phys_map[kvm_apic_id(apic)] = apic;
> + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> + new->flat, new->ldr_bits, &cid, &lid);
> +
> + if (lid)
> + new->logical_map[cid][ffs(lid) - 1] = apic;
> + }
> + mutex_lock(&kvm->arch.apic_map_lock);
> + old = kvm->arch.apic_map;
> + rcu_assign_pointer(kvm->arch.apic_map, new);
> + mutex_unlock(&kvm->arch.apic_map_lock);
is this not racy?
cpu 0 cpu 1
---------------- -----------------
new apic id
recalculate
new apic id
recalculate
take lock
install map
drop lock
take lock
install map
drop lock
> +
> + if (old)
> + call_rcu(&old->rcu, rcu_free_apic_map);
> +
> + return 0;
> +}
> +
> -
> static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
> LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
> LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> @@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
> return result;
> }
>
> +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> + struct kvm_lapic_irq *irq, int *r)
> +{
> + unsigned long bitmap = 1;
> + struct kvm_lapic **dst;
> + int i;
> +
> + if (irq->shorthand == APIC_DEST_SELF) {
> + *r = kvm_apic_set_irq(src->vcpu, irq);
> + return true;
> + }
> +
> + if (irq->shorthand)
> + return false;
> +
> + if (irq->dest_mode == 0) { /* physical mode */
> + if (irq->delivery_mode == APIC_DM_LOWEST ||
> + irq->dest_id == 0xff)
> + return false;
> + rcu_read_lock();
Unmatched rcu_read_lock() inside a block is ugly, please start it earlier.
> + dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff];
Like mst says, rcu_dereference().
> + } else {
> + u16 cid, lid;
> + u32 mda = irq->dest_id;
> +
> + rcu_read_lock();
> + if (kvm->arch.apic_map->ldr_bits == 8)
> + mda <<= 24;
> +
> + kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat,
> + kvm->arch.apic_map->ldr_bits, &cid, &lid);
> + dst = kvm->arch.apic_map->logical_map[cid];
> +
> + bitmap = lid;
> + if (irq->delivery_mode == APIC_DM_LOWEST) {
> + int l = -1;
> + for_each_set_bit(i, &bitmap, 16) {
> + if (!dst[i])
> + continue;
> + if (l < 0)
> + l = i;
> + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
> + l = i;
> + }
> +
> + bitmap = (l >= 0) ? 1 << l : 0;
> + }
> + }
> +
> + for_each_set_bit(i, &bitmap, 16) {
> + if (!dst[i])
> + continue;
> + if (*r < 0)
> + *r = 0;
> + *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
> + }
> +
> + rcu_read_unlock();
> + return true;
> +}
> +
> /*
> * Add a pending IRQ into lapic.
> * Return 1 if successfully added and 0 if discarded.
> @@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> switch (reg) {
> case APIC_ID: /* Local APIC ID */
> if (!apic_x2apic_mode(apic))
> - apic_set_reg(apic, APIC_ID, val);
> + kvm_apic_set_id(apic, val >> 24);
> else
> ret = 1;
> break;
> @@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>
> case APIC_LDR:
> if (!apic_x2apic_mode(apic))
> - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
> + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
> else
> ret = 1;
> break;
>
> case APIC_DFR:
> - if (!apic_x2apic_mode(apic))
> + if (!apic_x2apic_mode(apic)) {
> apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> - else
> + recalculate_apic_map(apic->vcpu->kvm);
> + } else
> ret = 1;
What if these have different settings on different vcpus?
> break;
>
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 15:09 ` Avi Kivity
@ 2012-09-10 15:35 ` Gleb Natapov
0 siblings, 0 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-10 15:35 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, mtosatti, mst
On Mon, Sep 10, 2012 at 06:09:01PM +0300, Avi Kivity wrote:
> On 09/10/2012 04:09 PM, Gleb Natapov wrote:
> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > find interrupt destination instead of looping through all vcpus.
> >
> > +
> > +static inline int recalculate_apic_map(struct kvm *kvm)
> > +{
> > + struct kvm_apic_map *new, *old = NULL;
> > + struct kvm_vcpu *vcpu;
> > + int i;
> > +
> > + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > +
> > + if (!new)
> > + return -ENOMEM;
> > +
> > + new->ldr_bits = 8;
> > + new->flat = true;
> > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > + u16 cid, lid;
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > + if (!kvm_apic_present(vcpu))
> > + continue;
> > +
> > + if (apic_x2apic_mode(apic)) {
> > + new->ldr_bits = 32;
> > + new->flat = false;
> > + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER)
> > + new->flat = false;
> > +
> > + new->phys_map[kvm_apic_id(apic)] = apic;
> > + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > + new->flat, new->ldr_bits, &cid, &lid);
> > +
> > + if (lid)
> > + new->logical_map[cid][ffs(lid) - 1] = apic;
> > + }
> > + mutex_lock(&kvm->arch.apic_map_lock);
> > + old = kvm->arch.apic_map;
> > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > + mutex_unlock(&kvm->arch.apic_map_lock);
>
> is this not racy?
>
> cpu 0 cpu 1
> ---------------- -----------------
> new apic id
> recalculate
>
> new apic id
> recalculate
>
> take lock
> install map
> drop lock
>
> take lock
> install map
> drop lock
>
Hmm, yes. Will have to hold lock during recalculation. Not a big deal.
>
> > +
> > + if (old)
> > + call_rcu(&old->rcu, rcu_free_apic_map);
> > +
> > + return 0;
> > +}
> > +
>
> > -
> > static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
> > LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
> > LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> > @@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
> > return result;
> > }
> >
> > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> > + struct kvm_lapic_irq *irq, int *r)
> > +{
> > + unsigned long bitmap = 1;
> > + struct kvm_lapic **dst;
> > + int i;
> > +
> > + if (irq->shorthand == APIC_DEST_SELF) {
> > + *r = kvm_apic_set_irq(src->vcpu, irq);
> > + return true;
> > + }
> > +
> > + if (irq->shorthand)
> > + return false;
> > +
> > + if (irq->dest_mode == 0) { /* physical mode */
> > + if (irq->delivery_mode == APIC_DM_LOWEST ||
> > + irq->dest_id == 0xff)
> > + return false;
> > + rcu_read_lock();
>
> Unmatched rcu_read_lock() inside a block is ugly, please start it earlier.
>
Doing rcu_read_unlock() ugly too, but if you prefer it this way...
> > + dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff];
>
> Like mst says, rcu_dereference().
>
Ugh.
> > + } else {
> > + u16 cid, lid;
> > + u32 mda = irq->dest_id;
> > +
> > + rcu_read_lock();
> > + if (kvm->arch.apic_map->ldr_bits == 8)
> > + mda <<= 24;
> > +
> > + kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat,
> > + kvm->arch.apic_map->ldr_bits, &cid, &lid);
> > + dst = kvm->arch.apic_map->logical_map[cid];
> > +
> > + bitmap = lid;
> > + if (irq->delivery_mode == APIC_DM_LOWEST) {
> > + int l = -1;
> > + for_each_set_bit(i, &bitmap, 16) {
> > + if (!dst[i])
> > + continue;
> > + if (l < 0)
> > + l = i;
> > + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
> > + l = i;
> > + }
> > +
> > + bitmap = (l >= 0) ? 1 << l : 0;
> > + }
> > + }
> > +
> > + for_each_set_bit(i, &bitmap, 16) {
> > + if (!dst[i])
> > + continue;
> > + if (*r < 0)
> > + *r = 0;
> > + *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
> > + }
> > +
> > + rcu_read_unlock();
> > + return true;
> > +}
> > +
> > /*
> > * Add a pending IRQ into lapic.
> > * Return 1 if successfully added and 0 if discarded.
> > @@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> > switch (reg) {
> > case APIC_ID: /* Local APIC ID */
> > if (!apic_x2apic_mode(apic))
> > - apic_set_reg(apic, APIC_ID, val);
> > + kvm_apic_set_id(apic, val >> 24);
> > else
> > ret = 1;
> > break;
> > @@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> >
> > case APIC_LDR:
> > if (!apic_x2apic_mode(apic))
> > - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
> > + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
> > else
> > ret = 1;
> > break;
> >
> > case APIC_DFR:
> > - if (!apic_x2apic_mode(apic))
> > + if (!apic_x2apic_mode(apic)) {
> > apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> > - else
> > + recalculate_apic_map(apic->vcpu->kvm);
> > + } else
> > ret = 1;
>
> What if these have different settings on different vcpus?
>
Then table approach will not work and we need to loop. Luckily for us
spec says all CPUs have to have same DFR settings.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 14:44 ` Michael S. Tsirkin
@ 2012-09-10 16:17 ` Gleb Natapov
2012-09-10 17:05 ` Gleb Natapov
2012-09-11 11:20 ` Gleb Natapov
0 siblings, 2 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-10 16:17 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, avi, mtosatti
On Mon, Sep 10, 2012 at 05:44:38PM +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 10, 2012 at 04:09:15PM +0300, Gleb Natapov wrote:
> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > find interrupt destination instead of looping through all vcpus.
> >
> > Signed-off-by: Gleb Natapov <gleb@redhat.com>
>
> Looks good overall. I think I see some bugs, with rcu use and others.
> the most suspecious thing is that code seems to use rcu but
> there are no calls to rcu_dereference anywhere.
Yep. Forgot about it.
> Pls see below.
>
> Thanks for looking into this, once ready I intend to rework
> direct injection to work on top of this.
>
>
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 64adb61..121f308 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> > struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> > };
> >
> > +struct kvm_apic_map {
> > + struct rcu_head rcu;
> > + bool flat;
> > + u8 ldr_bits;
> > + struct kvm_lapic *phys_map[255];
>
> Not 256? apic id is sometimes used in the lookup - is is guaranteed
> that guest can not set it to 0xff? If yes this will overflow.
>
Yes. It needs to be 256.
> > + struct kvm_lapic *logical_map[16][16];
>
>
> These are large arrays: 2Kbyte each one, which is bad
> for cache.
> Is it not better to have vcpu index stored there?
> that would reduce cache footprint by x8.
>
We do not have vcpu indexes, only vcpu ids, which do not correspond to a
vcpu's place in the vcpu array. The vcpu array will likely be replaced by a
list when vcpu destruction functionality is implemented, so I do not want
to introduce any dependencies on the current implementation detail.
> > +};
> > +
> > struct kvm_arch {
> > unsigned int n_used_mmu_pages;
> > unsigned int n_requested_mmu_pages;
> > @@ -528,6 +536,8 @@ struct kvm_arch {
> > struct kvm_ioapic *vioapic;
> > struct kvm_pit *vpit;
> > int vapics_in_nmi_mode;
> > + struct kvm_apic_map *apic_map;
> > + struct mutex apic_map_lock;
> >
> > unsigned int tss_addr;
> > struct page *apic_access_page;
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 07ad628..d18ddd2 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -139,11 +139,101 @@ static inline int apic_enabled(struct kvm_lapic *apic)
> > (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> > APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> >
> > +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > +{
> > + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > +}
> > +
> > +
>
> two emoty lines not needed
>
That's terrible, I agree.
> > static inline int kvm_apic_id(struct kvm_lapic *apic)
> > {
> > return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> > }
> >
> > +static void rcu_free_apic_map(struct rcu_head *head)
> > +{
> > + struct kvm_apic_map *p = container_of(head,
> > + struct kvm_apic_map, rcu);
>
> why break this line? it is not too long as is.
>
Because it was longer, but then the structure was renamed to be shorter :)
> > +
> > + kfree(p);
> > +}
> > +
> > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > + u16 *cid, u16 *lid)
> > +{
> > + if (ldr_bits == 32) {
> > + *cid = ldr >> 16;
> > + *lid = ldr & 0xffff;
> > + } else {
> > + ldr = GET_APIC_LOGICAL_ID(ldr);
> > +
> > + if (flat) {
> > + *cid = 0;
> > + *lid = ldr;
> > + } else {
> > + *cid = ldr >> 4;
> > + *lid = ldr & 0xf;
> > + }
> > + }
> > +}
> > +
> > +static inline int recalculate_apic_map(struct kvm *kvm)
>
> __must_check?
What is it?
>
> > +{
> > + struct kvm_apic_map *new, *old = NULL;
> > + struct kvm_vcpu *vcpu;
> > + int i;
> > +
> > + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > +
>
> empty line not needed here :)
I like it that way. You think it will break compilation or runtime?
>
> > + if (!new)
> > + return -ENOMEM;
> > +
> > + new->ldr_bits = 8;
> > + new->flat = true;
> > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > + u16 cid, lid;
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > + if (!kvm_apic_present(vcpu))
> > + continue;
> > +
> > + if (apic_x2apic_mode(apic)) {
> > + new->ldr_bits = 32;
> > + new->flat = false;
> > + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER)
> > + new->flat = false;
> > +
> > + new->phys_map[kvm_apic_id(apic)] = apic;
> > + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > + new->flat, new->ldr_bits, &cid, &lid);
> > +
> > + if (lid)
> > + new->logical_map[cid][ffs(lid) - 1] = apic;
> > + }
> > + mutex_lock(&kvm->arch.apic_map_lock);
> > + old = kvm->arch.apic_map;
>
> rcu_dereference_protected?
What is _protected? I do not think rcu_dereference() is needed here, under
the lock. It is needed in an rcu_read section. If it is needed here too then
we have the same bug in the irq routing code.
>
> > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > + mutex_unlock(&kvm->arch.apic_map_lock);
> > +
> > + if (old)
> > + call_rcu(&old->rcu, rcu_free_apic_map);
>
> What guarantees rcu_free_apic_map is called before module goes away?
Why do we care?
> I suspect we need at least rcu_barrier here:
> https://lwn.net/Articles/217484/
According to the article rcu_barrier() is heavier than
synchronize_rcu(), so I do not understand what do you mean by "at
least".
>
> Also, this is done upon guest request?
> This allocates 2K and delays free until next rcu grace
> period. Guest can buffer up *a lot of* host kernel
> memory before this happens - looks like a DOS potential?
> Maybe simply synchronize_rcu here? Or if that looks too
> aggressive, adding this to some per-kvm pending list,
> and call rcu_barrier once that list is too large.
I am personally fine with synchronize_rcu() or even doing nothing until
proven that the problem actually exists. I know that Avi do not like
synchronize_rcu(). From guest point of view vcpu may stall when apic id
is written, but it is unlikely to be a problem since interrupts are
better to be disabled while this happens.
>
>
> > +
> > + return 0;
> > +}
> > +
> > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> > +{
> > + apic_set_reg(apic, APIC_ID, id << 24);
> > + return recalculate_apic_map(apic->vcpu->kvm);
> > +}
> > +
> > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> > +{
> > + apic_set_reg(apic, APIC_LDR, id);
> > + return recalculate_apic_map(apic->vcpu->kvm);
> > +}
> > +
>
> return value of these functions seems never checked.
>
Yes, the problem is that we can do nothing about the failure if failure
happens during guest write.
> > static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
> > {
> > return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
> > @@ -193,11 +283,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> > apic_set_reg(apic, APIC_LVR, v);
> > }
> >
> > -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > -{
> > - return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > -}
> > -
> > static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
> > LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
> > LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> > @@ -477,6 +562,67 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
> > return result;
> > }
> >
> > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> > + struct kvm_lapic_irq *irq, int *r)
> > +{
> > + unsigned long bitmap = 1;
> > + struct kvm_lapic **dst;
> > + int i;
> > +
> > + if (irq->shorthand == APIC_DEST_SELF) {
> > + *r = kvm_apic_set_irq(src->vcpu, irq);
> > + return true;
> > + }
> > +
> > + if (irq->shorthand)
> > + return false;
> > +
> > + if (irq->dest_mode == 0) { /* physical mode */
> > + if (irq->delivery_mode == APIC_DM_LOWEST ||
> > + irq->dest_id == 0xff)
> > + return false;
> > + rcu_read_lock();
> > + dst = &kvm->arch.apic_map
>
>
> rcu_dereference?
>
> Also, how do we know apic_map is not NULL at this point?
>
apic_map is created when the first apic is created and this function should
not be called before that. But better to be safe than sorry. Maybe
WARN_ON() is appropriate.
>
> >->phys_map[irq->dest_id & 0xff];
>
> I went back and forth and I think I see what is going
> on here. bitmap is 1, this way for_each_set_bit below
> triggers with value 0. and we get &dst[0] which is same
> as dst. And read lock here is pared with unlock outside if.
> So it looks correct.
>
> But IMHO this is just trying too hard to reduce duplication,
> it confused at least this reader. Is this erquivalent to:
>
> rcu_read_lock();
> dst = &kvm->arch.apic_map->phys_map[irq->dest_id & 0xff];
> *r += kvm_apic_set_irq(dst->vcpu, irq);
> rcu_read_unlock();
> return true;
>
> ?
>
> If yes would be much clearer this way.
> And the long code section below will also cleaner
> because nesting is more shallow.
>
I don't like scattered kvm_apic_set_irq() calls, but I'll try and see
how it looks.
>
>
> > + } else {
> > + u16 cid, lid;
> > + u32 mda = irq->dest_id;
> > +
> > + rcu_read_lock();
> > + if (kvm->arch.apic_map->ldr_bits == 8)
>
>
> rcu_dereference?
>
> Also, how do we know apic_map is not NULL at this point?
>
Same as above. This is the same function.
>
>
> > + mda <<= 24;
> > +
> > + kvm_apic_get_logical_id(mda, kvm->arch.apic_map->flat,
> > + kvm->arch.apic_map->ldr_bits, &cid, &lid);
> > + dst = kvm->arch.apic_map->logical_map[cid];
> > +
> > + bitmap = lid;
> > + if (irq->delivery_mode == APIC_DM_LOWEST) {
> > + int l = -1;
> > + for_each_set_bit(i, &bitmap, 16) {
>
> constant 16 repeated here and below. to make it a bit prettier:
> #define foreach_bit_in_apic_lid(bit, lid) for_each_set_bit(bit, &lid, 16)
> ?
>
Obfuscation, really. If anything, it is better to #define CPUS_IN_CLUSTER 16 or
something. But this constant is unlikely to change :)
> > + if (!dst[i])
> > + continue;
> > + if (l < 0)
> > + l = i;
> > + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
> > + l = i;
> > + }
> > +
> > + bitmap = (l >= 0) ? 1 << l : 0;
>
> We already know the destination here. Why encode it in bitmap and
> rescan below? Rewriting bitmap like this confused at least this reader.
What is confusing about it?
>
> This will be equivalent, right?
>
> if (l >= 0)
> *r = kvm_apic_set_irq(dst[i]->vcpu, irq);
> goto unlock;
>
> If yes IMHO this is much more readable.
I don't see why. I really do not like to call kvm_apic_set_irq() in
four places in the same function.
>
> > + }
> > + }
> > +
> > + for_each_set_bit(i, &bitmap, 16) {
>
> I note that *r is never set if bitmap is 0
> (this became apparent after rewrite as suggested above).
It is much more apparent with current code since *r is set in 2 places
instead of four.
> Looks like this will leak some kernel stack info to userspace.
>
r is set by a caller. But we can set it explicitly in the function too
for clarity.
> > + if (!dst[i])
> > + continue;
> > + if (*r < 0)
> > + *r = 0;
> > + *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
> > + }
> > +
> > + rcu_read_unlock();
> > + return true;
> > +}
> > +
> > /*
> > * Add a pending IRQ into lapic.
> > * Return 1 if successfully added and 0 if discarded.
> > @@ -880,7 +1026,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> > switch (reg) {
> > case APIC_ID: /* Local APIC ID */
> > if (!apic_x2apic_mode(apic))
> > - apic_set_reg(apic, APIC_ID, val);
> > + kvm_apic_set_id(apic, val >> 24);
>
> return value discarded here.
>
Nothing can be done.
> > else
> > ret = 1;
> > break;
> > @@ -896,15 +1042,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> >
> > case APIC_LDR:
> > if (!apic_x2apic_mode(apic))
> > - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
> > + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
>
> and here
>
Same.
> > else
> > ret = 1;
> > break;
> >
> > case APIC_DFR:
> > - if (!apic_x2apic_mode(apic))
> > + if (!apic_x2apic_mode(apic)) {
> > apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
> > - else
> > + recalculate_apic_map(apic->vcpu->kvm);
>
> return value discarded here.
>
Once again.
> > + } else
> > ret = 1;
> > break;
> >
> > @@ -1135,6 +1282,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> > static_key_slow_dec_deferred(&apic_hw_disabled);
> > else
> > static_key_slow_inc(&apic_hw_disabled.key);
> > + recalculate_apic_map(vcpu->kvm);
>
> and here
>
here too
> > }
> >
> > if (!kvm_vcpu_is_bsp(apic->vcpu))
> > @@ -1144,7 +1292,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> > if (apic_x2apic_mode(apic)) {
> > u32 id = kvm_apic_id(apic);
> > u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
> > - apic_set_reg(apic, APIC_LDR, ldr);
> > + kvm_apic_set_ldr(apic, ldr);
>
> and here
same
>
> > }
> > apic->base_address = apic->vcpu->arch.apic_base &
> > MSR_IA32_APICBASE_BASE;
> > @@ -1169,7 +1317,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> > /* Stop the timer in case it's a reset to an active apic */
> > hrtimer_cancel(&apic->lapic_timer.timer);
> >
> > - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
> > + kvm_apic_set_id(apic, (u8)vcpu->vcpu_id);
>
> and here
this one actually can be handled
>
> > kvm_apic_set_version(apic->vcpu);
> >
> > for (i = 0; i < APIC_LVT_NUM; i++)
> > @@ -1180,7 +1328,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> > apic_set_reg(apic, APIC_DFR, 0xffffffffU);
> > apic_set_spiv(apic, 0xff);
> > apic_set_reg(apic, APIC_TASKPRI, 0);
> > - apic_set_reg(apic, APIC_LDR, 0);
> > + kvm_apic_set_ldr(apic, 0);
>
> and here
and here.
>
> > apic_set_reg(apic, APIC_ESR, 0);
> > apic_set_reg(apic, APIC_ICR, 0);
> > apic_set_reg(apic, APIC_ICR2, 0);
> > @@ -1398,6 +1546,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> > /* set SPIV separately to get count of SW disabled APICs right */
> > apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
> > memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
> > + /* call kvm_apic_set_id() to put apic into apic_map */
> > + kvm_apic_set_id(apic, kvm_apic_id(apic));
>
> and here
>
and here.
> > kvm_apic_set_version(vcpu);
> >
> > apic_update_ppr(apic);
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 615a8b0..5ce084e 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
> > int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
> > int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
> >
> > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> > + struct kvm_lapic_irq *irq, int *r);
> > +
> > u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
> > void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
> > void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
> > @@ -120,5 +123,4 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
> > {
> > return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
> > }
> > -
> > #endif
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index f91e2c9..db6db8f 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -6269,6 +6269,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> > set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
> >
> > raw_spin_lock_init(&kvm->arch.tsc_write_lock);
> > + mutex_init(&kvm->arch.apic_map_lock);
> >
> > return 0;
> > }
> > @@ -6319,6 +6320,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
> > put_page(kvm->arch.apic_access_page);
> > if (kvm->arch.ept_identity_pagetable)
> > put_page(kvm->arch.ept_identity_pagetable);
> > + kfree(kvm->arch.apic_map);
>
> rcu_dereference_check(..., 1)
What is this? Why?
>
> since apic_map is around until vm is destroyed
> maybe really put it inline in arch?
and what will you rcu-protect then?
>
> > }
> >
> > void kvm_arch_free_memslot(struct kvm_memory_slot *free,
> > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> > index 7118be0..3ca89c4 100644
> > --- a/virt/kvm/irq_comm.c
> > +++ b/virt/kvm/irq_comm.c
> > @@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
> > struct kvm_vcpu *vcpu, *lowest = NULL;
> >
> > if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
> > - kvm_is_dm_lowest_prio(irq))
> > + kvm_is_dm_lowest_prio(irq)) {
> > printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
> > + irq->delivery_mode = APIC_DM_FIXED;
> > + }
> > +
> > + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
> > + return r;
> >
> > kvm_for_each_vcpu(i, vcpu, kvm) {
> > if (!kvm_apic_present(vcpu))
> > --
> > Gleb.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 16:17 ` Gleb Natapov
@ 2012-09-10 17:05 ` Gleb Natapov
2012-09-11 9:29 ` Avi Kivity
2012-09-11 11:20 ` Gleb Natapov
1 sibling, 1 reply; 17+ messages in thread
From: Gleb Natapov @ 2012-09-10 17:05 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, avi, mtosatti
On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> > > + return 0;
> > > +}
> > > +
> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> > > +{
> > > + apic_set_reg(apic, APIC_ID, id << 24);
> > > + return recalculate_apic_map(apic->vcpu->kvm);
> > > +}
> > > +
> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> > > +{
> > > + apic_set_reg(apic, APIC_LDR, id);
> > > + return recalculate_apic_map(apic->vcpu->kvm);
> > > +}
> > > +
> >
> > return value of these functions seems never checked.
> >
> Yes, the problem is that we can do nothing about the failure if failure
> happens during guest write.
>
Actually I have an idea how to handle the error: never return one. If the
map cannot be allocated, always take the slow path. phys_map should obviously
be checked for NULL during delivery in this case.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 17:05 ` Gleb Natapov
@ 2012-09-11 9:29 ` Avi Kivity
2012-09-11 9:35 ` Gleb Natapov
0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2012-09-11 9:29 UTC (permalink / raw)
To: Gleb Natapov; +Cc: Michael S. Tsirkin, kvm, mtosatti
On 09/10/2012 08:05 PM, Gleb Natapov wrote:
> On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
>> > > + return 0;
>> > > +}
>> > > +
>> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
>> > > +{
>> > > + apic_set_reg(apic, APIC_ID, id << 24);
>> > > + return recalculate_apic_map(apic->vcpu->kvm);
>> > > +}
>> > > +
>> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
>> > > +{
>> > > + apic_set_reg(apic, APIC_LDR, id);
>> > > + return recalculate_apic_map(apic->vcpu->kvm);
>> > > +}
>> > > +
>> >
>> > return value of these functions seems never checked.
>> >
>> Yes, the problem is that we can do nothing about the failure if failure
>> happens during guest write.
We can. Return -ENOMEM all the way up to userspace.
>>
> Actually I have an idea how to handle the error. Never return one. If
> map cannot be allocated go slow path always. phys_map should be checked
> for NULL during delivery in this case obviously.
That's better of course (though we have to beware of such tricks, but in
this case the slow path is regularly exercised so it should keep working).
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:29 ` Avi Kivity
@ 2012-09-11 9:35 ` Gleb Natapov
2012-09-11 9:41 ` Avi Kivity
0 siblings, 1 reply; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 9:35 UTC (permalink / raw)
To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, mtosatti
On Tue, Sep 11, 2012 at 12:29:06PM +0300, Avi Kivity wrote:
> On 09/10/2012 08:05 PM, Gleb Natapov wrote:
> > On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> >> > > + return 0;
> >> > > +}
> >> > > +
> >> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> >> > > +{
> >> > > + apic_set_reg(apic, APIC_ID, id << 24);
> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> >> > > +}
> >> > > +
> >> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> >> > > +{
> >> > > + apic_set_reg(apic, APIC_LDR, id);
> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> >> > > +}
> >> > > +
> >> >
> >> > return value of these functions seems never checked.
> >> >
> >> Yes, the problem is that we can do nothing about the failure if failure
> >> happens during guest write.
>
> We can. Return -ENOMEM all the way up to userspace.
>
There is no userspace to return an error to if the error happens on a guest
MMIO write. Unless you mean returning it as the return value of ioctl(VM_RUN),
in which case it is equivalent to killing the guest. And this is not fair
to a guest who did nothing wrong to suffer from our stupid optimizations :)
Actually I am not sure that returning to userspace in the middle of an
IO that is handled by the kernel is well defined in the KVM ABI.
> >>
> > Actually I have an idea how to handle the error. Never return one. If
> > map cannot be allocated go slow path always. phys_map should be checked
> > for NULL during delivery in this case obviously.
>
> That's better of course (though we have to beware of such tricks, but in
> this case the slow path is regularly exercised so it should keep working).
>
Oh with Windows guests it has work to do for sure.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:35 ` Gleb Natapov
@ 2012-09-11 9:41 ` Avi Kivity
2012-09-11 9:45 ` Gleb Natapov
0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2012-09-11 9:41 UTC (permalink / raw)
To: Gleb Natapov; +Cc: Michael S. Tsirkin, kvm, mtosatti
On 09/11/2012 12:35 PM, Gleb Natapov wrote:
> On Tue, Sep 11, 2012 at 12:29:06PM +0300, Avi Kivity wrote:
>> On 09/10/2012 08:05 PM, Gleb Natapov wrote:
>> > On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
>> >> > > + return 0;
>> >> > > +}
>> >> > > +
>> >> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
>> >> > > +{
>> >> > > + apic_set_reg(apic, APIC_ID, id << 24);
>> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
>> >> > > +}
>> >> > > +
>> >> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
>> >> > > +{
>> >> > > + apic_set_reg(apic, APIC_LDR, id);
>> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
>> >> > > +}
>> >> > > +
>> >> >
>> >> > return value of these functions seems never checked.
>> >> >
>> >> Yes, the problem is that we can do nothing about the failure if failure
>> >> happens during guest write.
>>
>> We can. Return -ENOMEM all the way up to userspace.
>>
> There is no userspace to return error to if error happens on guest MMIO
> write. Unless you mean return it as a return value of ioctl(VM_RUN) in
> which case it is equivalent of killing the guest.
That is what I meant.
> And this is not fair
> to a guest who did nothing wrong to suffer from our stupid optimizations :)
> Actually I am not sure that returning to userspace in the middle of an
> IO that is handled by a kernel is well defined in KVM ABI.
If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
GFP_NOIO etc) then the entire host is dead anyway. The same thing can
happen if the guest (or userspace) touches a yet-unallocated page, or if
the page fault path fails to allocate mmu pages, or any of a thousand
other allocations we have all over.
>
>> >>
>> > Actually I have an idea how to handle the error. Never return one. If
>> > map cannot be allocated go slow path always. phys_map should be checked
>> > for NULL during delivery in this case obviously.
>>
>> That's better of course (though we have to beware of such tricks, but in
>> this case the slow path is regularly exercised so it should keep working).
>>
> Oh with Windows guests it has work to do for sure.
This reminds me, we could speed up self-ipi for that.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:41 ` Avi Kivity
@ 2012-09-11 9:45 ` Gleb Natapov
2012-09-11 9:57 ` Gleb Natapov
2012-09-11 12:04 ` Avi Kivity
0 siblings, 2 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 9:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, mtosatti
On Tue, Sep 11, 2012 at 12:41:28PM +0300, Avi Kivity wrote:
> On 09/11/2012 12:35 PM, Gleb Natapov wrote:
> > On Tue, Sep 11, 2012 at 12:29:06PM +0300, Avi Kivity wrote:
> >> On 09/10/2012 08:05 PM, Gleb Natapov wrote:
> >> > On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> >> >> > > + return 0;
> >> >> > > +}
> >> >> > > +
> >> >> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> >> >> > > +{
> >> >> > > + apic_set_reg(apic, APIC_ID, id << 24);
> >> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> >> >> > > +}
> >> >> > > +
> >> >> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> >> >> > > +{
> >> >> > > + apic_set_reg(apic, APIC_LDR, id);
> >> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> >> >> > > +}
> >> >> > > +
> >> >> >
> >> >> > return value of these functions seems never checked.
> >> >> >
> >> >> Yes, the problem is that we can do nothing about the failure if failure
> >> >> happens during guest write.
> >>
> >> We can. Return -ENOMEM all the way up to userspace.
> >>
> > There is no userspace to return error to if error happens on guest MMIO
> > write. Unless you mean return it as a return value of ioctl(VM_RUN) in
> > which case it is equivalent of killing the guest.
>
> That is what I meant.
>
> > And this is not fair
> > to a guest who did nothing wrong to suffer from our stupid optimizations :)
> > Actually I am not sure that returning to userspace in the middle of an
> > IO that is handled by a kernel is well defined in KVM ABI.
>
> If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
> GFP_NOIO etc) then the entire host is dead anyway. The same thing can
> happen if the guest (or userspace) touches a yet-unallocated page, or if
> the page fault path fails to allocate mmu pages, or any of a thousand
> other allocations we have all over.
Then it is just simpler to sigkill the guest right away. What's the
point in returning an error if you believe that userspace can't handle it
and will likely not run long enough to even get back to userspace due to
the memory shortage.
>
> >
> >> >>
> >> > Actually I have an idea how to handle the error. Never return one. If
> >> > map cannot be allocated go slow path always. phys_map should be checked
> >> > for NULL during delivery in this case obviously.
> >>
> >> That's better of course (though we have to beware of such tricks, but in
> >> this case the slow path is regularly exercised so it should keep working).
> >>
> > Oh with Windows guests it has work to do for sure.
>
> This reminds me, we could speed up self-ipi for that.
>
The patch does it. Windows sends a lot of all but self IPIs too.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:45 ` Gleb Natapov
@ 2012-09-11 9:57 ` Gleb Natapov
2012-09-11 12:05 ` Avi Kivity
2012-09-11 12:04 ` Avi Kivity
1 sibling, 1 reply; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 9:57 UTC (permalink / raw)
To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, mtosatti
On Tue, Sep 11, 2012 at 12:45:46PM +0300, Gleb Natapov wrote:
> On Tue, Sep 11, 2012 at 12:41:28PM +0300, Avi Kivity wrote:
> > On 09/11/2012 12:35 PM, Gleb Natapov wrote:
> > > On Tue, Sep 11, 2012 at 12:29:06PM +0300, Avi Kivity wrote:
> > >> On 09/10/2012 08:05 PM, Gleb Natapov wrote:
> > >> > On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> > >> >> > > + return 0;
> > >> >> > > +}
> > >> >> > > +
> > >> >> > > +static inline int kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> > >> >> > > +{
> > >> >> > > + apic_set_reg(apic, APIC_ID, id << 24);
> > >> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> > >> >> > > +}
> > >> >> > > +
> > >> >> > > +static inline int kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
> > >> >> > > +{
> > >> >> > > + apic_set_reg(apic, APIC_LDR, id);
> > >> >> > > + return recalculate_apic_map(apic->vcpu->kvm);
> > >> >> > > +}
> > >> >> > > +
> > >> >> >
> > >> >> > return value of these functions seems never checked.
> > >> >> >
> > >> >> Yes, the problem is that we can do nothing about the failure if failure
> > >> >> happens during guest write.
> > >>
> > >> We can. Return -ENOMEM all the way up to userspace.
> > >>
> > > There is no userspace to return error to if error happens on guest MMIO
> > > write. Unless you mean return it as a return value of ioctl(VM_RUN) in
> > > which case it is equivalent of killing the guest.
> >
> > That is what I meant.
> >
> > > And this is not fair
> > > to a guest who did nothing wrong to suffer from our stupid optimizations :)
> > > Actually I am not sure that returning to userspace in the middle of an
> > > IO that is handled by a kernel is well defined in KVM ABI.
> >
> > If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
> > GFP_NOIO etc) then the entire host is dead anyway. The same thing can
> > happen if the guest (or userspace) touches a yet-unallocated page, or if
> > the page fault path fails to allocate mmu pages, or any of a thousand
> > other allocations we have all over.
> Then it is just simpler to sigkill the guest right away. What's the
> point in returning error if you believe that userspace can't handle it
> and will likely not run long enough to even get to userspace due to
> memory shortage.
>
And although this is not the route I will take, the question remains: how
do we return to the kernel after a userspace exit in the middle of IO that
is handled by an in-kernel device? Looks like the kernel will expect an
emulation result from userspace on the next ioctl(RUN).
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-10 16:17 ` Gleb Natapov
2012-09-10 17:05 ` Gleb Natapov
@ 2012-09-11 11:20 ` Gleb Natapov
2012-09-11 11:49 ` Gleb Natapov
1 sibling, 1 reply; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 11:20 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, avi, mtosatti
On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> >
> > > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > > + mutex_unlock(&kvm->arch.apic_map_lock);
> > > +
> > > + if (old)
> > > + call_rcu(&old->rcu, rcu_free_apic_map);
> >
> > What guarantees rcu_free_apic_map is called before module goes away?
> Why do we care?
>
Ugh. We do not care that the apic_map memory will stick around after module
unload, but we obviously do care since the rcu_free_apic_map() code itself
will go away. Looks like a general problem for all modular users of
call_rcu(). Is there a general solution in the module unload code somewhere?
Calling rcu_barrier() before unloading the module should be enough.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 11:20 ` Gleb Natapov
@ 2012-09-11 11:49 ` Gleb Natapov
0 siblings, 0 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 11:49 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, avi, mtosatti
On Tue, Sep 11, 2012 at 02:20:49PM +0300, Gleb Natapov wrote:
> On Mon, Sep 10, 2012 at 07:17:54PM +0300, Gleb Natapov wrote:
> > >
> > > > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > > > + mutex_unlock(&kvm->arch.apic_map_lock);
> > > > +
> > > > + if (old)
> > > > + call_rcu(&old->rcu, rcu_free_apic_map);
> > >
> > > What guarantees rcu_free_apic_map is called before module goes away?
> > Why do we care?
> >
> Ugh. We do not care that the apic_map memory will stick around after module unload,
> but we obviously do care that the rcu_free_apic_map() code itself will go away. Looks
> like general problem for all modular users of call_rcu(). Is there
> general solution in module unload code somewhere? Calling rcu_barrier()
> before unloading module should be enough.
>
If nothing has changed in the past 3 years, each module should call
rcu_barrier() by itself:
http://paulmck.livejournal.com/7314.html
Looking at the code I see that it calls synchronize_rcu() on module
delete, but this is not enough. :(
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:45 ` Gleb Natapov
2012-09-11 9:57 ` Gleb Natapov
@ 2012-09-11 12:04 ` Avi Kivity
2012-09-11 12:09 ` Gleb Natapov
1 sibling, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2012-09-11 12:04 UTC (permalink / raw)
To: Gleb Natapov; +Cc: Michael S. Tsirkin, kvm, mtosatti
On 09/11/2012 12:45 PM, Gleb Natapov wrote:
>> >>
>> > There is no userspace to return error to if error happens on guest MMIO
>> > write. Unless you mean return it as a return value of ioctl(VM_RUN) in
>> > which case it is equivalent of killing the guest.
>>
>> That is what I meant.
>>
>> > And this is not fair
>> > to a guest who did nothing wrong to suffer from our stupid optimizations :)
>> > Actually I am not sure that returning to userspace in the middle of an
>> > IO that is handled by a kernel is well defined in KVM ABI.
>>
>> If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
>> GFP_NOIO etc) then the entire host is dead anyway. The same thing can
>> happen if the guest (or userspace) touches a yet-unallocated page, or if
>> the page fault path fails to allocate mmu pages, or any of a thousand
>> other allocations we have all over.
> Then it is just simpler to sigkill the guest right away. What's the
> point in returning error if you believe that userspace can't handle it
> and will likely not run long enough to even get to userspace due to
> memory shortage.
Syscalls don't SIGKILL (well except kill(2)). They report errors. The
only other alternative is SIGBUS.
>
>>
>> >
>> >> >>
>> >> > Actually I have an idea how to handle the error. Never return one. If
>> >> > map cannot be allocated go slow path always. phys_map should be checked
>> >> > for NULL during delivery in this case obviously.
>> >>
>> >> That's better of course (though we have to beware of such tricks, but in
>> >> this case the slow path is regularly exercised so it should keep working).
>> >>
>> > Oh with Windows guests it has work to do for sure.
>>
>> This reminds me, we could speed up self-ipi for that.
>>
> The patch does it. Windows sends a lot of all but self IPIs too.
I thought it bailed out if a destination shorthand was used?
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 9:57 ` Gleb Natapov
@ 2012-09-11 12:05 ` Avi Kivity
2012-09-11 12:12 ` Gleb Natapov
0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2012-09-11 12:05 UTC (permalink / raw)
To: Gleb Natapov; +Cc: Michael S. Tsirkin, kvm, mtosatti
On 09/11/2012 12:57 PM, Gleb Natapov wrote:
>> > If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
>> > GFP_NOIO etc) then the entire host is dead anyway. The same thing can
>> > happen if the guest (or userspace) touches a yet-unallocated page, or if
>> > the page fault path fails to allocate mmu pages, or any of a thousand
>> > other allocations we have all over.
>> Then it is just simpler to sigkill the guest right away. What's the
>> point in returning error if you believe that userspace can't handle it
>> and will likely not run long enough to even get to userspace due to
>> memory shortage.
>>
> And although this is not the route I will go the question remains. How
> do we return to the kernel after userspace exit in the middle of IO that
> is handled by an in-kernel device. Looks like the kernel will expect
> emulation result from userspace on next ioctl(RUN).
Option 1 is to rewind everything to before the instruction. Option 2 is
to document that errors are not recoverable.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 12:04 ` Avi Kivity
@ 2012-09-11 12:09 ` Gleb Natapov
0 siblings, 0 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 12:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, mtosatti
On Tue, Sep 11, 2012 at 03:04:36PM +0300, Avi Kivity wrote:
> On 09/11/2012 12:45 PM, Gleb Natapov wrote:
> >> >>
> >> > There is no userspace to return error to if error happens on guest MMIO
> >> > write. Unless you mean return it as a return value of ioctl(VM_RUN) in
> >> > which case it is equivalent of killing the guest.
> >>
> >> That is what I meant.
> >>
> >> > And this is not fair
> >> > to a guest who did nothing wrong to suffer from our stupid optimizations :)
> >> > Actually I am not sure that returning to userspace in the middle of an
> >> > IO that is handled by a kernel is well defined in KVM ABI.
> >>
> >> If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
> >> GFP_NOIO etc) then the entire host is dead anyway. The same thing can
> >> happen if the guest (or userspace) touches a yet-unallocated page, or if
> >> the page fault path fails to allocate mmu pages, or any of a thousand
> >> other allocations we have all over.
> > Then it is just simpler to sigkill the guest right away. What's the
> > point in returning error if you believe that userspace can't handle it
> > and will likely not run long enough to even get to userspace due to
> > memory shortage.
>
> Syscalls don't SIGKILL (well except kill(2)). They report errors. The
> only other alternative is SIGBUS.
>
Anything that will put guest out of its misery.
> >
> >>
> >> >
> >> >> >>
> >> >> > Actually I have an idea how to handle the error. Never return one. If
> >> >> > map cannot be allocated go slow path always. phys_map should be checked
> >> >> > for NULL during delivery in this case obviously.
> >> >>
> >> >> That's better of course (though we have to beware of such tricks, but in
> >> >> this case the slow path is regularly exercised so it should keep working).
> >> >>
> >> > Oh with Windows guests it has work to do for sure.
> >>
> >> This reminds me, we could speed up self-ipi for that.
> >>
> > The patch does it. Windows sends a lot of all but self IPIs too.
>
> I thought it bailed out if a destination shorthand was used?
>
This is highly sophisticated patch! It checks for APIC_DEST_SELF before
bailing out.
if (irq->shorthand == APIC_DEST_SELF) {
*r = kvm_apic_set_irq(src->vcpu, irq);
return true;
}
if (irq->shorthand)
return false;
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH] KVM: optimize apic interrupt delivery
2012-09-11 12:05 ` Avi Kivity
@ 2012-09-11 12:12 ` Gleb Natapov
0 siblings, 0 replies; 17+ messages in thread
From: Gleb Natapov @ 2012-09-11 12:12 UTC (permalink / raw)
To: Avi Kivity; +Cc: Michael S. Tsirkin, kvm, mtosatti
On Tue, Sep 11, 2012 at 03:05:54PM +0300, Avi Kivity wrote:
> On 09/11/2012 12:57 PM, Gleb Natapov wrote:
>
> >> > If you get -ENOMEM when allocating a page without GFP_ATOMIC (or
> >> > GFP_NOIO etc) then the entire host is dead anyway. The same thing can
> >> > happen if the guest (or userspace) touches a yet-unallocated page, or if
> >> > the page fault path fails to allocate mmu pages, or any of a thousand
> >> > other allocations we have all over.
> >> Then it is just simpler to sigkill the guest right away. What's the
> >> point in returning error if you believe that userspace can't handle it
> >> and will likely not run long enough to even get to userspace due to
> >> memory shortage.
> >>
> > And although this is not the route I will go the question remains. How
> > do we return to the kernel after userspace exit in the middle of IO that
> > is handled by an in-kernel device. Looks like the kernel will expect
> > emulation result from userspace on next ioctl(RUN).
>
> Option 1 is to rewind everything to before the instruction. Option 2 is
> to document that errors are not recoverable.
>
This will make all errors non-recoverable, since userspace has no way to
know that the error happened during IO emulation. So we either rewind
everything to before the instruction or never return an error to userspace
during IO.
--
Gleb.
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2012-09-11 12:12 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-10 13:09 [RFC PATCH] KVM: optimize apic interrupt delivery Gleb Natapov
2012-09-10 14:44 ` Michael S. Tsirkin
2012-09-10 16:17 ` Gleb Natapov
2012-09-10 17:05 ` Gleb Natapov
2012-09-11 9:29 ` Avi Kivity
2012-09-11 9:35 ` Gleb Natapov
2012-09-11 9:41 ` Avi Kivity
2012-09-11 9:45 ` Gleb Natapov
2012-09-11 9:57 ` Gleb Natapov
2012-09-11 12:05 ` Avi Kivity
2012-09-11 12:12 ` Gleb Natapov
2012-09-11 12:04 ` Avi Kivity
2012-09-11 12:09 ` Gleb Natapov
2012-09-11 11:20 ` Gleb Natapov
2012-09-11 11:49 ` Gleb Natapov
2012-09-10 15:09 ` Avi Kivity
2012-09-10 15:35 ` Gleb Natapov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).