qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Subject: [Qemu-devel] [PULL 19/21] ioapic: allow buggy guests mishandling level-triggered interrupts to make progress
Date: Wed, 15 May 2019 22:50:31 +0200	[thread overview]
Message-ID: <1557953433-19663-20-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1557953433-19663-1-git-send-email-pbonzini@redhat.com>

From: Vitaly Kuznetsov <vkuznets@redhat.com>

It was found that Hyper-V 2016 on KVM in some configurations (q35 machine +
piix4-usb-uhci) hangs on boot. Root-cause was that one of Hyper-V
level-triggered interrupt handler performs EOI before fixing the cause of
the interrupt. This results in IOAPIC keep re-raising the level-triggered
interrupt after EOI because irq-line remains asserted.

Gory details: https://www.spinics.net/lists/kvm/msg184484.html
(the whole thread).

Turns out we were dealing with similar issues before; in-kernel IOAPIC
implementation has commit 184564efae4d ("kvm: ioapic: conditionally delay
irq delivery duringeoi broadcast") which describes a very similar issue.

Steal the idea from the above mentioned commit for IOAPIC implementation in
QEMU. SUCCESSIVE_IRQ_MAX_COUNT, delay and the comment are borrowed as well.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20190402080215.10747-1-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/intc/ioapic.c                  | 57 +++++++++++++++++++++++++++++++++++----
 hw/intc/trace-events              |  1 +
 include/hw/i386/ioapic_internal.h |  3 +++
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 9d75f84..7074489 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -139,6 +139,15 @@ static void ioapic_service(IOAPICCommonState *s)
     }
 }
 
+#define SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void delayed_ioapic_service_cb(void *opaque)
+{
+    IOAPICCommonState *s = opaque;
+
+    ioapic_service(s);
+}
+
 static void ioapic_set_irq(void *opaque, int vector, int level)
 {
     IOAPICCommonState *s = opaque;
@@ -222,13 +231,39 @@ void ioapic_eoi_broadcast(int vector)
         }
         for (n = 0; n < IOAPIC_NUM_PINS; n++) {
             entry = s->ioredtbl[n];
-            if ((entry & IOAPIC_LVT_REMOTE_IRR)
-                && (entry & IOAPIC_VECTOR_MASK) == vector) {
-                trace_ioapic_clear_remote_irr(n, vector);
-                s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
-                if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
+
+            if ((entry & IOAPIC_VECTOR_MASK) != vector ||
+                ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
+                continue;
+            }
+
+            if (!(entry & IOAPIC_LVT_REMOTE_IRR)) {
+                continue;
+            }
+
+            trace_ioapic_clear_remote_irr(n, vector);
+            s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
+
+            if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
+                ++s->irq_eoi[vector];
+                if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) {
+                    /*
+                     * Real hardware does not deliver the interrupt immediately
+                     * during eoi broadcast, and this lets a buggy guest make
+                     * slow progress even if it does not correctly handle a
+                     * level-triggered interrupt. Emulate this behavior if we
+                     * detect an interrupt storm.
+                     */
+                    s->irq_eoi[vector] = 0;
+                    timer_mod_anticipate(s->delayed_ioapic_service_timer,
+                                         qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+                                         NANOSECONDS_PER_SECOND / 100);
+                    trace_ioapic_eoi_delayed_reassert(vector);
+                } else {
                     ioapic_service(s);
                 }
+            } else {
+                s->irq_eoi[vector] = 0;
             }
         }
     }
@@ -401,6 +436,9 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
     memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s,
                           "ioapic", 0x1000);
 
+    s->delayed_ioapic_service_timer =
+        timer_new_ns(QEMU_CLOCK_VIRTUAL, delayed_ioapic_service_cb, s);
+
     qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS);
 
     ioapics[ioapic_no] = s;
@@ -408,6 +446,14 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
     qemu_add_machine_init_done_notifier(&s->machine_done);
 }
 
+static void ioapic_unrealize(DeviceState *dev, Error **errp)
+{
+    IOAPICCommonState *s = IOAPIC_COMMON(dev);
+
+    timer_del(s->delayed_ioapic_service_timer);
+    timer_free(s->delayed_ioapic_service_timer);
+}
+
 static Property ioapic_properties[] = {
     DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF),
     DEFINE_PROP_END_OF_LIST(),
@@ -419,6 +465,7 @@ static void ioapic_class_init(ObjectClass *klass, void *data)
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     k->realize = ioapic_realize;
+    k->unrealize = ioapic_unrealize;
     /*
      * If APIC is in kernel, we need to update the kernel cache after
      * migration, otherwise first 24 gsi routes will be invalid.
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index a28bdce..90c9d07 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -25,6 +25,7 @@ apic_mem_writel(uint64_t addr, uint32_t val) "0x%"PRIx64" = 0x%08x"
 ioapic_set_remote_irr(int n) "set remote irr for pin %d"
 ioapic_clear_remote_irr(int n, int vector) "clear remote irr for pin %d vector %d"
 ioapic_eoi_broadcast(int vector) "EOI broadcast for vector %d"
+ioapic_eoi_delayed_reassert(int vector) "delayed reassert on EOI broadcast for vector %d"
 ioapic_mem_read(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem read addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" retval 0x%"PRIx32
 ioapic_mem_write(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem write addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" val 0x%"PRIx32
 ioapic_set_irq(int vector, int level) "vector: %d level: %d"
diff --git a/include/hw/i386/ioapic_internal.h b/include/hw/i386/ioapic_internal.h
index 9848f39..07002f9 100644
--- a/include/hw/i386/ioapic_internal.h
+++ b/include/hw/i386/ioapic_internal.h
@@ -96,6 +96,7 @@ typedef struct IOAPICCommonClass {
     SysBusDeviceClass parent_class;
 
     DeviceRealize realize;
+    DeviceUnrealize unrealize;
     void (*pre_save)(IOAPICCommonState *s);
     void (*post_load)(IOAPICCommonState *s);
 } IOAPICCommonClass;
@@ -111,6 +112,8 @@ struct IOAPICCommonState {
     uint8_t version;
     uint64_t irq_count[IOAPIC_NUM_PINS];
     int irq_level[IOAPIC_NUM_PINS];
+    int irq_eoi[IOAPIC_NUM_PINS];
+    QEMUTimer *delayed_ioapic_service_timer;
 };
 
 void ioapic_reset_common(DeviceState *dev);
-- 
1.8.3.1




  parent reply	other threads:[~2019-05-15 21:04 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-15 20:50 [Qemu-devel] [PULL 00/21] Misc patches for 2019-05-15 Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 01/21] hw/input: Add a CONFIG_PS2 switch for the ps2.c file Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 02/21] roms: assert if max rom size is less than the used size Paolo Bonzini
2019-05-16 12:40   ` Thomas Huth
2019-05-15 20:50 ` [Qemu-devel] [PULL 03/21] Declare -realtime as deprecated Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 04/21] vl: Add missing descriptions to the VGA adapters list Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 05/21] megasas: fix mapped frame size Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 06/21] hvf: Add missing break statement Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 07/21] vl: fix -sandbox parsing crash when seccomp support is disabled Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 08/21] memory: correct the comment to DIRTY_MEMORY_MIGRATION Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 09/21] hw/acpi/piix4: Move TYPE_PIIX4_PM to a public header Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 10/21] hw/i386/acpi: Add object_resolve_type_unambiguous to improve modularity Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 11/21] hw/i386/acpi: Assert a pointer is not null BEFORE using it Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 12/21] mips-fulong2e: obey -vga none Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 13/21] sun4m: " Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 14/21] trace: only include trace-event-subdirs when they are needed Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 15/21] build: replace GENERATED_FILES by generated-files-y Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 16/21] configure: qemu-ga is only needed with softmmu targets Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 17/21] build: chardev is only needed for " Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 18/21] build: don't build hardware objects with linux-user Paolo Bonzini
2019-05-21 11:52   ` Daniel P. Berrangé
2019-05-21 12:52     ` Laurent Vivier
2019-05-21 12:54       ` Daniel P. Berrangé
2019-05-21 20:16         ` Laurent Vivier
2019-07-04 13:09           ` Philippe Mathieu-Daudé
2019-05-15 20:50 ` Paolo Bonzini [this message]
2019-07-04 12:57   ` [Qemu-devel] [PULL 19/21] ioapic: allow buggy guests mishandling level-triggered interrupts to make progress Marc-André Lureau
2019-07-04 13:00     ` Li Qiang
2019-07-04 13:05       ` Marc-André Lureau
2019-07-04 13:13         ` Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 20/21] hw/char: Move multi-serial devices into separate file Paolo Bonzini
2019-05-15 20:50 ` [Qemu-devel] [PULL 21/21] hw/net/ne2000: Extract the PCI device from the chipset common code Paolo Bonzini
2019-05-16 12:14 ` [Qemu-devel] [PULL 00/21] Misc patches for 2019-05-15 Peter Maydell
2019-05-16 17:58   ` Paolo Bonzini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1557953433-19663-20-git-send-email-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=vkuznets@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).