kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
@ 2012-03-12  9:07 Jason Wang
  2012-03-12  9:23 ` Gleb Natapov
  0 siblings, 1 reply; 8+ messages in thread
From: Jason Wang @ 2012-03-12  9:07 UTC (permalink / raw)
  To: mtosatti, avi, kvm, linux-kernel; +Cc: gleb, mst

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI write and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest that hasn't registered a proper irq
handler, it would stay in the interrupt routine as this irq would be
re-injected immediately after the guest enables interrupts. This would prevent
the guest from moving forward, and it may miss the chance to get a proper irq
handler registered (one example is a windows guest resuming from hibernation).

As there's no way to distinguish an unhandled irq from newly raised ones, this
patch solves this problem by scheduling a delayed work when the count of irqs
injected during eoi broadcast exceeds a threshold value. After this patch, the
guest can move forward a little when there's no suitable irq handler, in case
it may register one very soon, and for a guest that has a bad irq detection
routine (such as note_interrupt() in linux), this bad irq would be recognized
as soon as in the past.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 virt/kvm/ioapic.c |   47 +++++++++++++++++++++++++++++++++++++++++++++--
 virt/kvm/ioapic.h |    2 ++
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index dcaf272..892253e 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -221,6 +221,24 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 	return ret;
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+	int i, ret;
+	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+						 eoi_inject.work);
+	spin_lock(&ioapic->lock);
+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+			continue;
+
+		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+			ret = ioapic_service(ioapic, i);
+	}
+	spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 				     int trigger_mode)
 {
@@ -249,8 +267,29 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 
 		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 		ent->fields.remote_irr = 0;
-		if (!ent->fields.mask && (ioapic->irr & (1 << i)))
-			ioapic_service(ioapic, i);
+		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+			++ioapic->irq_eoi;
+			if (ioapic->irq_eoi == 100) {
+				/*
+				 * Real hardware does not deliver the irq so
+				 * immediately during eoi broadcast, so we need
+				 * to emulate this behavior. Otherwise, for
+				 * guests who has not registered handler of a
+				 * level irq, this irq would be injected
+				 * immediately after guest enables interrupt
+				 * (which happens usually at the end of the
+				 * common interrupt routine). This would lead
+				 * guest can't move forward and may miss the
+				 * possibility to get proper irq handler
+				 * registered. So we need to give some breath to
+				 * guest. TODO: 1 is too long?
+				 */
+				schedule_delayed_work(&ioapic->eoi_inject, 1);
+				ioapic->irq_eoi = 0;
+			} else {
+				ioapic_service(ioapic, i);
+			}
+		}
 	}
 }
 
@@ -375,12 +414,14 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 {
 	int i;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	for (i = 0; i < IOAPIC_NUM_PINS; i++)
 		ioapic->redirtbl[i].fields.mask = 1;
 	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
 	ioapic->id = 0;
+	ioapic->irq_eoi = 0;
 	update_handled_vectors(ioapic);
 }
 
@@ -398,6 +439,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
+	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
 	kvm->arch.vioapic = ioapic;
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -418,6 +460,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	if (ioapic) {
 		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
 		kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 0b190c3..8938e66 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -47,6 +47,8 @@ struct kvm_ioapic {
 	void (*ack_notifier)(void *opaque, int irq);
 	spinlock_t lock;
 	DECLARE_BITMAP(handled_vectors, 256);
+	struct delayed_work eoi_inject;
+	u32 irq_eoi;
 };
 
 #ifdef DEBUG

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
  2012-03-12  9:07 [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast Jason Wang
@ 2012-03-12  9:23 ` Gleb Natapov
  2012-03-12  9:44   ` Jason Wang
  0 siblings, 1 reply; 8+ messages in thread
From: Gleb Natapov @ 2012-03-12  9:23 UTC (permalink / raw)
  To: Jason Wang; +Cc: mtosatti, avi, kvm, linux-kernel, mst

On Mon, Mar 12, 2012 at 05:07:35PM +0800, Jason Wang wrote:
> Currently, we call ioapic_service() immediately when we find the irq is still
> active during eoi broadcast. But for real hardware, there's some dealy between
> the EOI writing and irq delivery (system bus latency?). So we need to emulate
> this behavior. Otherwise, for a guest who haven't register a proper irq handler
> , it would stay in the interrupt routine as this irq would be re-injected
> immediately after guest enables interrupt. This would lead guest can't move
> forward and may miss the possibility to get proper irq handler registered (one
> example is windows guest resuming from hibernation).
> 
Yes, I saw this behaviour with Windows NICs, but it looks like a
guest bug. Does this happen with other kinds of devices too? Because
if it does not, then the correct hack would be to add a delay between
Windows enabling the PHY and sending the first interrupt to a guest. This will
model what happens on real HW. A NIC does not start receiving packets at
the same moment the PHY is enabled. Some time is spent bringing up the link.

> As there's no way to differ the unhandled irq from new raised ones, this patch
> solve this problems by scheduling a delayed work when the count of irq injected
> during eoi broadcast exceeds a threshold value. After this patch, the guest can
> move a little forward when there's no suitable irq handler in case it may
> register one very soon and for guest who has a bad irq detection routine ( such
> as note_interrupt() in linux ), this bad irq would be recognized soon as in the
> past.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  virt/kvm/ioapic.c |   47 +++++++++++++++++++++++++++++++++++++++++++++--
>  virt/kvm/ioapic.h |    2 ++
>  2 files changed, 47 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index dcaf272..892253e 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -221,6 +221,24 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
>  	return ret;
>  }
>  
> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
> +{
> +	int i, ret;
> +	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
> +						 eoi_inject.work);
> +	spin_lock(&ioapic->lock);
> +	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
> +
> +		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
> +			continue;
> +
> +		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
> +			ret = ioapic_service(ioapic, i);
> +	}
> +	spin_unlock(&ioapic->lock);
> +}
> +
>  static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
>  				     int trigger_mode)
>  {
> @@ -249,8 +267,29 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
>  
>  		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
>  		ent->fields.remote_irr = 0;
> -		if (!ent->fields.mask && (ioapic->irr & (1 << i)))
> -			ioapic_service(ioapic, i);
> +		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
> +			++ioapic->irq_eoi;
> +			if (ioapic->irq_eoi == 100) {
> +				/*
> +				 * Real hardware does not deliver the irq so
> +				 * immediately during eoi broadcast, so we need
> +				 * to emulate this behavior. Otherwise, for
> +				 * guests who has not registered handler of a
> +				 * level irq, this irq would be injected
> +				 * immediately after guest enables interrupt
> +				 * (which happens usually at the end of the
> +				 * common interrupt routine). This would lead
> +				 * guest can't move forward and may miss the
> +				 * possibility to get proper irq handler
> +				 * registered. So we need to give some breath to
> +				 * guest. TODO: 1 is too long?
> +				 */
> +				schedule_delayed_work(&ioapic->eoi_inject, 1);
> +				ioapic->irq_eoi = 0;
> +			} else {
> +				ioapic_service(ioapic, i);
> +			}
> +		}
>  	}
>  }
>  
> @@ -375,12 +414,14 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
>  {
>  	int i;
>  
> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>  	for (i = 0; i < IOAPIC_NUM_PINS; i++)
>  		ioapic->redirtbl[i].fields.mask = 1;
>  	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
>  	ioapic->ioregsel = 0;
>  	ioapic->irr = 0;
>  	ioapic->id = 0;
> +	ioapic->irq_eoi = 0;
>  	update_handled_vectors(ioapic);
>  }
>  
> @@ -398,6 +439,7 @@ int kvm_ioapic_init(struct kvm *kvm)
>  	if (!ioapic)
>  		return -ENOMEM;
>  	spin_lock_init(&ioapic->lock);
> +	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
>  	kvm->arch.vioapic = ioapic;
>  	kvm_ioapic_reset(ioapic);
>  	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
> @@ -418,6 +460,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
>  {
>  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
>  
> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>  	if (ioapic) {
>  		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
>  		kvm->arch.vioapic = NULL;
> diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
> index 0b190c3..8938e66 100644
> --- a/virt/kvm/ioapic.h
> +++ b/virt/kvm/ioapic.h
> @@ -47,6 +47,8 @@ struct kvm_ioapic {
>  	void (*ack_notifier)(void *opaque, int irq);
>  	spinlock_t lock;
>  	DECLARE_BITMAP(handled_vectors, 256);
> +	struct delayed_work eoi_inject;
> +	u32 irq_eoi;
>  };
>  
>  #ifdef DEBUG

--
			Gleb.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
  2012-03-12  9:23 ` Gleb Natapov
@ 2012-03-12  9:44   ` Jason Wang
  2012-03-12 10:22     ` Gleb Natapov
  0 siblings, 1 reply; 8+ messages in thread
From: Jason Wang @ 2012-03-12  9:44 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: mtosatti, avi, kvm, linux-kernel, mst

On 03/12/2012 05:23 PM, Gleb Natapov wrote:
> On Mon, Mar 12, 2012 at 05:07:35PM +0800, Jason Wang wrote:
>> >  Currently, we call ioapic_service() immediately when we find the irq is still
>> >  active during eoi broadcast. But for real hardware, there's some dealy between
>> >  the EOI writing and irq delivery (system bus latency?). So we need to emulate
>> >  this behavior. Otherwise, for a guest who haven't register a proper irq handler
>> >  , it would stay in the interrupt routine as this irq would be re-injected
>> >  immediately after guest enables interrupt. This would lead guest can't move
>> >  forward and may miss the possibility to get proper irq handler registered (one
>> >  example is windows guest resuming from hibernation).
>> >  
> Yes, I saw this behaviour with Windows NICs, but it looks like the
> guest bug. Does this happen with other kind of devices too? Because
> if it does not then the correct hack would be to add a delay between
> Windows enabling PHY and sending first interrupt to a guest. This will
> model what happens on real HW. NIC does not start receiving packets at
> the same moment PHY is enabled. Some time is spent bring up the link.
>

Looks common for any unhandled level irq but I haven't tried. What I've
tested is running a similar test program by hacking the card driver and
letting it run on both a real physical machine and a kvm guest, and seeing
what happens if no irq is handled:

- In real hardware, there's a gap between two successive irqs injected
by eoi broadcast, and the OS can move forward.
- In a kvm guest, there is no gap; the guest can't move forward and would
always stay in the irq context forever.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
  2012-03-12  9:44   ` Jason Wang
@ 2012-03-12 10:22     ` Gleb Natapov
  0 siblings, 0 replies; 8+ messages in thread
From: Gleb Natapov @ 2012-03-12 10:22 UTC (permalink / raw)
  To: Jason Wang; +Cc: mtosatti, avi, kvm, linux-kernel, mst

On Mon, Mar 12, 2012 at 05:44:00PM +0800, Jason Wang wrote:
> On 03/12/2012 05:23 PM, Gleb Natapov wrote:
> >On Mon, Mar 12, 2012 at 05:07:35PM +0800, Jason Wang wrote:
> >>>  Currently, we call ioapic_service() immediately when we find the irq is still
> >>>  active during eoi broadcast. But for real hardware, there's some dealy between
> >>>  the EOI writing and irq delivery (system bus latency?). So we need to emulate
> >>>  this behavior. Otherwise, for a guest who haven't register a proper irq handler
> >>>  , it would stay in the interrupt routine as this irq would be re-injected
> >>>  immediately after guest enables interrupt. This would lead guest can't move
> >>>  forward and may miss the possibility to get proper irq handler registered (one
> >>>  example is windows guest resuming from hibernation).
> >>>
> >Yes, I saw this behaviour with Windows NICs, but it looks like the
> >guest bug. Does this happen with other kind of devices too? Because
> >if it does not then the correct hack would be to add a delay between
> >Windows enabling PHY and sending first interrupt to a guest. This will
> >model what happens on real HW. NIC does not start receiving packets at
> >the same moment PHY is enabled. Some time is spent bring up the link.
> >
> 
> Looks common for any unhandled level irq but I haven't tried. What
> I've tested is running a similar test program by hacking the card
> driver and let it run in both real physical machine and a kvm guest,
> and see what happens if there's no irq handled:
> 
> - In real hardware, there's a gap between two successive irqs
> injected by eoi broadcast, and OS can move forward.
> - In a kvm guest, no gap, guest can't move forward and would always
> stay in the irq context forever.
This is not something an OS should rely on. So let's do the Windows
hack in QEMU NIC devices.

--
			Gleb.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
@ 2014-09-10 12:32 Zhang Haoyu
  2014-09-10 12:52 ` Zhang Haoyu
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang Haoyu @ 2014-09-10 12:32 UTC (permalink / raw)
  To: kvm, Michael S.Tsirkin, Jason Wang; +Cc: Paolo Bonzini, qemu-devel

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI write and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest that hasn't registered a proper irq
handler, it would stay in the interrupt routine as this irq would be
re-injected immediately after the guest enables interrupts. This would prevent
the guest from moving forward, and it may miss the chance to get a proper irq
handler registered (one example is a windows guest resuming from hibernation).

As there's no way to distinguish an unhandled irq from newly raised ones, this
patch solves this problem by scheduling a delayed work when the count of irqs
injected during eoi broadcast exceeds a threshold value. After this patch, the
guest can move forward a little when there's no suitable irq handler, in case
it may register one very soon, and for a guest that has a bad irq detection
routine (such as note_interrupt() in linux), this bad irq would be recognized
as soon as in the past.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com>
---
 include/trace/events/kvm.h | 20 ++++++++++++++++++
 virt/kvm/ioapic.c          | 51 ++++++++++++++++++++++++++++++++++++++++++++--
 virt/kvm/ioapic.h          |  6 ++++++
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..b05f688 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
 		  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+	    TP_PROTO(__u64 e),
+	    TP_ARGS(e),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+	),
+
+	TP_printk("dst %x vec=%u (%s|%s|%s%s)%s",
+		  (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
 	    TP_PROTO(__u64 address, __u64 data),
 	    TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..a36c4c4 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
 	spin_unlock(&ioapic->lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+	int i, ret;
+	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+						 eoi_inject.work);
+	spin_lock(&ioapic->lock);
+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+			continue;
+
+		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+			ret = ioapic_service(ioapic, i, false);
+	}
+	spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
 		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 		ent->fields.remote_irr = 0;
-		if (ioapic->irr & (1 << i))
-			ioapic_service(ioapic, i, false);
+		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+			++ioapic->irq_eoi[i];
+			if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+				/*
+				 * Real hardware does not deliver the irq so
+				 * immediately during eoi broadcast, so we need
+				 * to emulate this behavior. Otherwise, for
+				 * guests who has not registered handler of a
+				 * level irq, this irq would be injected
+				 * immediately after guest enables interrupt
+				 * (which happens usually at the end of the
+				 * common interrupt routine). This would lead
+				 * guest can't move forward and may miss the
+				 * possibility to get proper irq handler
+				 * registered. So we need to give some breath to
+				 * guest. TODO: 1 is too long?
+				 */
+				schedule_delayed_work(&ioapic->eoi_inject, 1);
+				ioapic->irq_eoi[i] = 0;
+				trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+			}
+			else {
+				ioapic_service(ioapic, i, false);
+			}
+		else {
+			ioapic->irq_eoi[i] = 0;
+		}
 	}
 }
 
@@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 {
 	int i;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	for (i = 0; i < IOAPIC_NUM_PINS; i++)
 		ioapic->redirtbl[i].fields.mask = 1;
 	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
 	ioapic->id = 0;
+	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
 	update_handled_vectors(ioapic);
 }
@@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
+	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
 	kvm->arch.vioapic = ioapic;
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	if (ioapic) {
 		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
 		kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 90d43e9..260ac0a 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -34,6 +34,10 @@ struct kvm_vcpu;
 #define	IOAPIC_INIT			0x5
 #define	IOAPIC_EXTINT			0x7
 
+/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached,
+   which can be considered as interrupt storm happened */
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
 #ifdef CONFIG_X86
 #define RTC_GSI 8
 #else
@@ -59,6 +63,8 @@ struct kvm_ioapic {
 	spinlock_t lock;
 	DECLARE_BITMAP(handled_vectors, 256);
 	struct rtc_status rtc_status;
+	struct delayed_work eoi_inject;
+	u32 irq_eoi[IOAPIC_NUM_PINS];
 };
 
 #ifdef DEBUG
-- 
1.7.12.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
  2014-09-10 12:32 Zhang Haoyu
@ 2014-09-10 12:52 ` Zhang Haoyu
  0 siblings, 0 replies; 8+ messages in thread
From: Zhang Haoyu @ 2014-09-10 12:52 UTC (permalink / raw)
  To: Zhang Haoyu, kvm, Michael S.Tsirkin, Jason Wang; +Cc: Paolo Bonzini, qemu-devel

>+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
>+{
>+	int i, ret;
>+	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
>+						 eoi_inject.work);
>+	spin_lock(&ioapic->lock);
>+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
>+
>+		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
>+			continue;
>+
>+		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
>+			ret = ioapic_service(ioapic, i, false);
>+	}
>+	spin_unlock(&ioapic->lock);
>+}
>+
Delete unused variable, ret.

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI write and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest that hasn't registered a proper irq
handler, it would stay in the interrupt routine as this irq would be
re-injected immediately after the guest enables interrupts. This would prevent
the guest from moving forward, and it may miss the chance to get a proper irq
handler registered (one example is a windows guest resuming from hibernation).

As there's no way to distinguish an unhandled irq from newly raised ones, this
patch solves this problem by scheduling a delayed work when the count of irqs
injected during eoi broadcast exceeds a threshold value. After this patch, the
guest can move forward a little when there's no suitable irq handler, in case
it may register one very soon, and for a guest that has a bad irq detection
routine (such as note_interrupt() in linux), this bad irq would be recognized
as soon as in the past.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com>
---
 include/trace/events/kvm.h | 20 ++++++++++++++++++
 virt/kvm/ioapic.c          | 51 ++++++++++++++++++++++++++++++++++++++++++++--
 virt/kvm/ioapic.h          |  6 ++++++
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..b05f688 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
 		  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+	    TP_PROTO(__u64 e),
+	    TP_ARGS(e),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+	),
+
+	TP_printk("dst %x vec=%u (%s|%s|%s%s)%s",
+		  (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
 	    TP_PROTO(__u64 address, __u64 data),
 	    TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..a36c4c4 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
 	spin_unlock(&ioapic->lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+	int i;
+	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+						 eoi_inject.work);
+	spin_lock(&ioapic->lock);
+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+			continue;
+
+		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+			ioapic_service(ioapic, i, false);
+	}
+	spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
 		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 		ent->fields.remote_irr = 0;
-		if (ioapic->irr & (1 << i))
-			ioapic_service(ioapic, i, false);
+		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+			++ioapic->irq_eoi[i];
+			if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+				/*
+				 * Real hardware does not deliver the irq so
+				 * immediately during eoi broadcast, so we need
+				 * to emulate this behavior. Otherwise, for
+				 * guests who has not registered handler of a
+				 * level irq, this irq would be injected
+				 * immediately after guest enables interrupt
+				 * (which happens usually at the end of the
+				 * common interrupt routine). This would lead
+				 * guest can't move forward and may miss the
+				 * possibility to get proper irq handler
+				 * registered. So we need to give some breath to
+				 * guest. TODO: 1 is too long?
+				 */
+				schedule_delayed_work(&ioapic->eoi_inject, 1);
+				ioapic->irq_eoi[i] = 0;
+				trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+			}
+			else {
+				ioapic_service(ioapic, i, false);
+			}
+		else {
+			ioapic->irq_eoi[i] = 0;
+		}
 	}
 }
 
@@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 {
 	int i;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	for (i = 0; i < IOAPIC_NUM_PINS; i++)
 		ioapic->redirtbl[i].fields.mask = 1;
 	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
 	ioapic->id = 0;
+	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
 	update_handled_vectors(ioapic);
 }
@@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
+	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
 	kvm->arch.vioapic = ioapic;
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	if (ioapic) {
 		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
 		kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 90d43e9..260ac0a 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -34,6 +34,10 @@ struct kvm_vcpu;
 #define	IOAPIC_INIT			0x5
 #define	IOAPIC_EXTINT			0x7
 
+/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached,
+   which can be considered as interrupt storm happened */
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
 #ifdef CONFIG_X86
 #define RTC_GSI 8
 #else
@@ -59,6 +63,8 @@ struct kvm_ioapic {
 	spinlock_t lock;
 	DECLARE_BITMAP(handled_vectors, 256);
 	struct rtc_status rtc_status;
+	struct delayed_work eoi_inject;
+	u32 irq_eoi[IOAPIC_NUM_PINS];
 };
 
 #ifdef DEBUG
-- 
1.7.12.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
@ 2014-09-11  5:06 Zhang Haoyu
  2014-09-11  6:01 ` Jan Kiszka
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang Haoyu @ 2014-09-11  5:06 UTC (permalink / raw)
  To: kvm, Jason Wang, Michael S.Tsirkin; +Cc: qemu-devel, Paolo Bonzini

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI write and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest that hasn't registered a proper irq
handler, it would stay in the interrupt routine as this irq would be
re-injected immediately after the guest enables interrupts. This would prevent
the guest from moving forward, and it may miss the chance to get a proper irq
handler registered (one example is a windows guest resuming from hibernation).

As there's no way to distinguish an unhandled irq from newly raised ones, this
patch solves this problem by scheduling a delayed work when the count of irqs
injected during eoi broadcast exceeds a threshold value. After this patch, the
guest can move forward a little when there's no suitable irq handler, in case
it may register one very soon, and for a guest that has a bad irq detection
routine (such as note_interrupt() in linux), this bad irq would be recognized
as soon as in the past.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com>
---
 include/trace/events/kvm.h | 20 ++++++++++++++++++
 virt/kvm/ioapic.c          | 51 ++++++++++++++++++++++++++++++++++++++++++++--
 virt/kvm/ioapic.h          |  6 ++++++
 3 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..b05f688 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
 		  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+	    TP_PROTO(__u64 e),
+	    TP_ARGS(e),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+	),
+
+	TP_printk("dst %x vec=%u (%s|%s|%s%s)",
+		  (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
 	    TP_PROTO(__u64 address, __u64 data),
 	    TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..a36c4c4 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
 	spin_unlock(&ioapic->lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+	int i;
+	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+						 eoi_inject.work);
+	spin_lock(&ioapic->lock);
+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+			continue;
+
+		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+			ioapic_service(ioapic, i, false);
+	}
+	spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
 		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 		ent->fields.remote_irr = 0;
-		if (ioapic->irr & (1 << i))
-			ioapic_service(ioapic, i, false);
+		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+			++ioapic->irq_eoi[i];
+			if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+				/*
+				 * Real hardware does not deliver the irq so
+				 * immediately during eoi broadcast, so we need
+				 * to emulate this behavior. Otherwise, for
+				 * guests who has not registered handler of a
+				 * level irq, this irq would be injected
+				 * immediately after guest enables interrupt
+				 * (which happens usually at the end of the
+				 * common interrupt routine). This would lead
+				 * guest can't move forward and may miss the
+				 * possibility to get proper irq handler
+				 * registered. So we need to give some breath to
+				 * guest. TODO: 1 is too long?
+				 */
+				schedule_delayed_work(&ioapic->eoi_inject, 1);
+				ioapic->irq_eoi[i] = 0;
+				trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+			}
+			else {
+				ioapic_service(ioapic, i, false);
+			}
+		else {
+			ioapic->irq_eoi[i] = 0;
+		}
 	}
 }
 
@@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 {
 	int i;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	for (i = 0; i < IOAPIC_NUM_PINS; i++)
 		ioapic->redirtbl[i].fields.mask = 1;
 	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
 	ioapic->id = 0;
+	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
 	update_handled_vectors(ioapic);
 }
@@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
+	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
 	kvm->arch.vioapic = ioapic;
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+	cancel_delayed_work_sync(&ioapic->eoi_inject);
 	if (ioapic) {
 		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
 		kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 90d43e9..260ac0a 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -34,6 +34,10 @@ struct kvm_vcpu;
 #define	IOAPIC_INIT			0x5
 #define	IOAPIC_EXTINT			0x7
 
+/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached,
+   which can be considered as interrupt storm happened */
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
 #ifdef CONFIG_X86
 #define RTC_GSI 8
 #else
@@ -59,6 +63,8 @@ struct kvm_ioapic {
 	spinlock_t lock;
 	DECLARE_BITMAP(handled_vectors, 256);
 	struct rtc_status rtc_status;
+	struct delayed_work eoi_inject;
+	u32 irq_eoi[IOAPIC_NUM_PINS];
 };
 
 #ifdef DEBUG
-- 
1.7.12.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast
  2014-09-11  5:06 Zhang Haoyu
@ 2014-09-11  6:01 ` Jan Kiszka
  0 siblings, 0 replies; 8+ messages in thread
From: Jan Kiszka @ 2014-09-11  6:01 UTC (permalink / raw)
  To: Zhang Haoyu, kvm, Jason Wang, Michael S.Tsirkin; +Cc: Paolo Bonzini, qemu-devel

On 2014-09-11 07:06, Zhang Haoyu wrote:
> Currently, we call ioapic_service() immediately when we find the irq is still
> active during eoi broadcast. But for real hardware, there's some delay between
> the EOI writing and irq delivery (system bus latency?). So we need to emulate
> this behavior. Otherwise, for a guest who haven't register a proper irq handler
> , it would stay in the interrupt routine as this irq would be re-injected
> immediately after guest enables interrupt. This would lead guest can't move
> forward and may miss the possibility to get proper irq handler registered (one
> example is windows guest resuming from hibernation).
> 
> As there's no way to differ the unhandled irq from new raised ones, this patch
> solve this problems by scheduling a delayed work when the count of irq injected
> during eoi broadcast exceeds a threshold value. After this patch, the guest can
> move a little forward when there's no suitable irq handler in case it may
> register one very soon and for guest who has a bad irq detection routine ( such
> as note_interrupt() in linux ), this bad irq would be recognized soon as in the
> past.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com>
> ---
>  include/trace/events/kvm.h | 20 ++++++++++++++++++
>  virt/kvm/ioapic.c          | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>  virt/kvm/ioapic.h          |  6 ++++++
>  3 files changed, 75 insertions(+), 2 deletions(-)
> 
> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
> index 908925a..b05f688 100644
> --- a/include/trace/events/kvm.h
> +++ b/include/trace/events/kvm.h
> @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
>  		  __entry->coalesced ? " (coalesced)" : "")
>  );
>  
> +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
> +	    TP_PROTO(__u64 e),
> +	    TP_ARGS(e),
> +
> +	TP_STRUCT__entry(
> +		__field(	__u64,		e		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->e		= e;
> +	),
> +
> +	TP_printk("dst %x vec=%u (%s|%s|%s%s)",
> +		  (u8)(__entry->e >> 56), (u8)__entry->e,
> +		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
> +		  (__entry->e & (1<<11)) ? "logical" : "physical",
> +		  (__entry->e & (1<<15)) ? "level" : "edge",
> +		  (__entry->e & (1<<16)) ? "|masked" : "")
> +);
> +
>  TRACE_EVENT(kvm_msi_set_irq,
>  	    TP_PROTO(__u64 address, __u64 data),
>  	    TP_ARGS(address, data),
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index e8ce34c..a36c4c4 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
>  	spin_unlock(&ioapic->lock);
>  }
>  
> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
> +{
> +	int i;
> +	struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
> +						 eoi_inject.work);
> +	spin_lock(&ioapic->lock);
> +	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
> +
> +		if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
> +			continue;
> +
> +		if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
> +			ioapic_service(ioapic, i, false);
> +	}
> +	spin_unlock(&ioapic->lock);
> +}
> +
>  static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
>  			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
>  {
> @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
>  
>  		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
>  		ent->fields.remote_irr = 0;
> -		if (ioapic->irr & (1 << i))
> -			ioapic_service(ioapic, i, false);
> +		if (!ent->fields.mask && (ioapic->irr & (1 << i))) {

The mask check is new - why now? You don't check it in the work handler
as well.

> +			++ioapic->irq_eoi[i];
> +			if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
> +				/*
> +				 * Real hardware does not deliver the irq so
> +				 * immediately during eoi broadcast, so we need
> +				 * to emulate this behavior. Otherwise, for
> +				 * guests who has not registered handler of a
> +				 * level irq, this irq would be injected
> +				 * immediately after guest enables interrupt
> +				 * (which happens usually at the end of the
> +				 * common interrupt routine). This would lead
> +				 * guest can't move forward and may miss the
> +				 * possibility to get proper irq handler
> +				 * registered. So we need to give some breath to
> +				 * guest. TODO: 1 is too long?

The TODO should be clarified. Maybe also translate the "1" into something
HZ-derived.

> +				 */
> +				schedule_delayed_work(&ioapic->eoi_inject, 1);
> +				ioapic->irq_eoi[i] = 0;
> +				trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
> +			}
> +			else {

I think it's the desired kernel style to do "} else {" here, but I'm not
totally sure if that rule exists.

> +				ioapic_service(ioapic, i, false);
> +			}
> +		else {

I strongly suspect this version of the patch wasn't built... ;)

> +			ioapic->irq_eoi[i] = 0;
> +		}
>  	}
>  }
>  
> @@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
>  {
>  	int i;
>  
> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>  	for (i = 0; i < IOAPIC_NUM_PINS; i++)
>  		ioapic->redirtbl[i].fields.mask = 1;
>  	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
>  	ioapic->ioregsel = 0;
>  	ioapic->irr = 0;
>  	ioapic->id = 0;
> +	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
>  	rtc_irq_eoi_tracking_reset(ioapic);
>  	update_handled_vectors(ioapic);
>  }
> @@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm)
>  	if (!ioapic)
>  		return -ENOMEM;
>  	spin_lock_init(&ioapic->lock);
> +	INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
>  	kvm->arch.vioapic = ioapic;
>  	kvm_ioapic_reset(ioapic);
>  	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
> @@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm)
>  {
>  	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
>  
> +	cancel_delayed_work_sync(&ioapic->eoi_inject);
>  	if (ioapic) {
>  		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
>  		kvm->arch.vioapic = NULL;
> diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
> index 90d43e9..260ac0a 100644
> --- a/virt/kvm/ioapic.h
> +++ b/virt/kvm/ioapic.h
> @@ -34,6 +34,10 @@ struct kvm_vcpu;
>  #define	IOAPIC_INIT			0x5
>  #define	IOAPIC_EXTINT			0x7
>  
> +/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached,
> +   which can be considered as interrupt storm happened */
> +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
> +
>  #ifdef CONFIG_X86
>  #define RTC_GSI 8
>  #else
> @@ -59,6 +63,8 @@ struct kvm_ioapic {
>  	spinlock_t lock;
>  	DECLARE_BITMAP(handled_vectors, 256);
>  	struct rtc_status rtc_status;
> +	struct delayed_work eoi_inject;
> +	u32 irq_eoi[IOAPIC_NUM_PINS];
>  };
>  
>  #ifdef DEBUG
> 

The general approach looks sane to me. I was first concerned the delay
could affect non-broken guests as well, but the loop counter prevents
this effectively.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-09-11  6:01 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-03-12  9:07 [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast Jason Wang
2012-03-12  9:23 ` Gleb Natapov
2012-03-12  9:44   ` Jason Wang
2012-03-12 10:22     ` Gleb Natapov
  -- strict thread matches above, loose matches on Subject: below --
2014-09-10 12:32 Zhang Haoyu
2014-09-10 12:52 ` Zhang Haoyu
2014-09-11  5:06 Zhang Haoyu
2014-09-11  6:01 ` Jan Kiszka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).