[Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load.

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load.
@ 2008-06-29 14:02 Gleb Natapov
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
                   ` (3 more replies)
  0 siblings, 4 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 14:02 UTC (permalink / raw)
  To: qemu-devel

Resending one more time. There was no response last time I've sent it.

Qemu device emulation for timers might be inaccurate and
causes coalescing of several IRQs into one. It happens when the
load on the host is high and the guest did not manage to ack the
previous IRQ. The first patch in the series fixes this by changing
qemu_irq subsystem to return IRQ delivery status information.
If device is notified that IRQs where lost it can regenerate them
as needed. The following two patches add IRQ regeneration to PIC and RTC
devices.

---

Gleb Natapov (3):
      Fix time drift problem under high load when RTC is in use.
      Fix time drift problem under high load when PIT is in use.
      Change qemu_set_irq() to return status information.

 hw/apic.c        |  103 +++++++++++++++++++++++++++++++++++-------------------
 hw/esp.c         |    4 ++
 hw/fdc.c         |    4 ++
 hw/i8254.c       |   20 ++++++++++
 hw/i8259.c       |   19 +++++++---
 hw/ide.c         |    8 +++-
 hw/irq.c         |   10 +++--
 hw/irq.h         |   22 +++++++-----
 hw/max7310.c     |    4 ++
 hw/mc146818rtc.c |   11 +++++-
 hw/pc.c          |    6 ++-
 hw/pc.h          |    2 +
 hw/pci.c         |    8 +++-
 hw/ssd0323.c     |    4 ++
 hw/twl92230.c    |    8 +++-
 15 files changed, 159 insertions(+), 74 deletions(-)

-- 
		Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:02 [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load Gleb Natapov
@ 2008-06-29 14:02 ` Gleb Natapov
  2008-06-29 14:14   ` Avi Kivity
  2008-06-29 14:38   ` Paul Brook
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 2/3] Fix time drift problem under high load when PIT is in use Gleb Natapov
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 14:02 UTC (permalink / raw)
  To: qemu-devel

The return value is less then zero if interrupt is masked, zero if it
is known that interrupt is lost (due to coalescing) or greater then zero
if interrupt is delivered or was successfully queued for delivery by
interrupt controller. Device emulation can use this info as it pleases.
Included patch adds detection of interrupt coalescing into PIC and APIC
code for edge triggered interrupts.

Signed-off-by: Gleb Natapov <gleb@qumranet.com>
---

 hw/apic.c     |  103 +++++++++++++++++++++++++++++++++++++--------------------
 hw/esp.c      |    4 ++
 hw/fdc.c      |    4 ++
 hw/i8259.c    |   19 +++++++----
 hw/ide.c      |    8 +++-
 hw/irq.c      |   10 +++---
 hw/irq.h      |   22 +++++++-----
 hw/max7310.c  |    4 ++
 hw/pc.c       |    6 ++-
 hw/pc.h       |    2 +
 hw/pci.c      |    8 +++-
 hw/ssd0323.c  |    4 ++
 hw/twl92230.c |    8 +++-
 13 files changed, 131 insertions(+), 71 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index 6fd0160..9bdaed2 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -101,7 +101,7 @@ static APICState *local_apics[MAX_APICS + 1];
 static int last_apic_id = 0;
 
 static void apic_init_ipi(APICState *s);
-static void apic_set_irq(APICState *s, int vector_num, int trigger_mode);
+static int apic_set_irq(APICState *s, int vector_num, int trigger_mode);
 static void apic_update_irq(APICState *s);
 
 /* Find first bit starting from msb. Return 0 if value = 0 */
@@ -166,6 +166,14 @@ static inline void reset_bit(uint32_t *tab, int index)
     tab[i] &= ~mask;
 }
 
+static inline int get_bit(uint32_t *tab, int index)
+{
+    int i, mask;
+    i = index >> 5;
+    mask = 1 << (index & 0x1f);
+    return !!(tab[i] & mask);
+}
+
 void apic_local_deliver(CPUState *env, int vector)
 {
     APICState *s = env->apic_state;
@@ -215,12 +223,13 @@ void apic_local_deliver(CPUState *env, int vector)
     }\
 }
 
-static void apic_bus_deliver(const uint32_t *deliver_bitmask,
+static int apic_bus_deliver(const uint32_t *deliver_bitmask,
                              uint8_t delivery_mode,
                              uint8_t vector_num, uint8_t polarity,
                              uint8_t trigger_mode)
 {
     APICState *apic_iter;
+    int ret = 0;
 
     switch (delivery_mode) {
         case APIC_DM_LOWPRI:
@@ -237,11 +246,12 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask,
                 if (d >= 0) {
                     apic_iter = local_apics[d];
                     if (apic_iter) {
-                        apic_set_irq(apic_iter, vector_num, trigger_mode);
+                        ret = apic_set_irq(apic_iter, vector_num, trigger_mode);
                     }
-                }
+                } else
+                    ret = 1;
             }
-            return;
+            return ret;
 
         case APIC_DM_FIXED:
             break;
@@ -249,29 +259,33 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask,
         case APIC_DM_SMI:
             foreach_apic(apic_iter, deliver_bitmask,
                 cpu_interrupt(apic_iter->cpu_env, CPU_INTERRUPT_SMI) );
-            return;
+            return 1;
 
         case APIC_DM_NMI:
             foreach_apic(apic_iter, deliver_bitmask,
                 cpu_interrupt(apic_iter->cpu_env, CPU_INTERRUPT_NMI) );
-            return;
+            return 1;
 
         case APIC_DM_INIT:
             /* normal INIT IPI sent to processors */
             foreach_apic(apic_iter, deliver_bitmask,
                          apic_init_ipi(apic_iter) );
-            return;
+            return 1;
 
         case APIC_DM_EXTINT:
             /* handled in I/O APIC code */
             break;
 
         default:
-            return;
+            return ret;
     }
 
+    /* if interrupt should be delivered to more then one apic report it as
+     * delivered if at least one apic accepts it */
     foreach_apic(apic_iter, deliver_bitmask,
-                 apic_set_irq(apic_iter, vector_num, trigger_mode) );
+                 ret += apic_set_irq(apic_iter, vector_num, trigger_mode) );
+
+    return ret;
 }
 
 void cpu_set_apic_base(CPUState *env, uint64_t val)
@@ -361,14 +375,18 @@ static void apic_update_irq(APICState *s)
     cpu_interrupt(s->cpu_env, CPU_INTERRUPT_HARD);
 }
 
-static void apic_set_irq(APICState *s, int vector_num, int trigger_mode)
+static int apic_set_irq(APICState *s, int vector_num, int trigger_mode)
 {
+    int ret = !get_bit(s->irr, vector_num);
+
     set_bit(s->irr, vector_num);
     if (trigger_mode)
         set_bit(s->tmr, vector_num);
     else
         reset_bit(s->tmr, vector_num);
     apic_update_irq(s);
+
+    return ret;
 }
 
 static void apic_eoi(APICState *s)
@@ -906,9 +924,8 @@ int apic_init(CPUState *env)
     return 0;
 }
 
-static void ioapic_service(IOAPICState *s)
+static int ioapic_service_one(IOAPICState *s, uint8_t pin)
 {
-    uint8_t i;
     uint8_t trig_mode;
     uint8_t vector;
     uint8_t delivery_mode;
@@ -919,34 +936,44 @@ static void ioapic_service(IOAPICState *s)
     uint8_t polarity;
     uint32_t deliver_bitmask[MAX_APIC_WORDS];
 
+    mask = 1 << pin;
+    if (!(s->irr & mask))
+        return 1;
+    
+    entry = s->ioredtbl[pin];
+    if ((entry & APIC_LVT_MASKED))
+        return -1;
+
+    trig_mode = ((entry >> 15) & 1);
+    dest = entry >> 56;
+    dest_mode = (entry >> 11) & 1;
+    delivery_mode = (entry >> 8) & 7;
+    polarity = (entry >> 13) & 1;
+    if (trig_mode == APIC_TRIGGER_EDGE)
+        s->irr &= ~mask;
+    if (delivery_mode == APIC_DM_EXTINT)
+        vector = pic_read_irq(isa_pic);
+    else
+        vector = entry & 0xff;
+
+    apic_get_delivery_bitmask(deliver_bitmask, dest, dest_mode);
+    return apic_bus_deliver(deliver_bitmask, delivery_mode, vector, polarity,
+            trig_mode);
+}
+
+static void ioapic_service(IOAPICState *s)
+{
+    uint8_t i;
+
     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
-        mask = 1 << i;
-        if (s->irr & mask) {
-            entry = s->ioredtbl[i];
-            if (!(entry & APIC_LVT_MASKED)) {
-                trig_mode = ((entry >> 15) & 1);
-                dest = entry >> 56;
-                dest_mode = (entry >> 11) & 1;
-                delivery_mode = (entry >> 8) & 7;
-                polarity = (entry >> 13) & 1;
-                if (trig_mode == APIC_TRIGGER_EDGE)
-                    s->irr &= ~mask;
-                if (delivery_mode == APIC_DM_EXTINT)
-                    vector = pic_read_irq(isa_pic);
-                else
-                    vector = entry & 0xff;
-
-                apic_get_delivery_bitmask(deliver_bitmask, dest, dest_mode);
-                apic_bus_deliver(deliver_bitmask, delivery_mode,
-                                 vector, polarity, trig_mode);
-            }
-        }
+        ioapic_service_one(s, i);
     }
 }
 
-void ioapic_set_irq(void *opaque, int vector, int level)
+int ioapic_set_irq(void *opaque, int vector, int level)
 {
     IOAPICState *s = opaque;
+    int ret = 1;
 
     if (vector >= 0 && vector < IOAPIC_NUM_PINS) {
         uint32_t mask = 1 << vector;
@@ -956,7 +983,7 @@ void ioapic_set_irq(void *opaque, int vector, int level)
             /* level triggered */
             if (level) {
                 s->irr |= mask;
-                ioapic_service(s);
+                ioapic_service_one(s, vector);
             } else {
                 s->irr &= ~mask;
             }
@@ -964,10 +991,12 @@ void ioapic_set_irq(void *opaque, int vector, int level)
             /* edge triggered */
             if (level) {
                 s->irr |= mask;
-                ioapic_service(s);
+                ret = ioapic_service_one(s, vector);
             }
         }
     }
+
+    return ret;
 }
 
 static uint32_t ioapic_mem_readl(void *opaque, target_phys_addr_t addr)
diff --git a/hw/esp.c b/hw/esp.c
index 1843fb8..aa5e546 100644
--- a/hw/esp.c
+++ b/hw/esp.c
@@ -409,10 +409,12 @@ static void esp_reset(void *opaque)
     s->do_cmd = 0;
 }
 
-static void parent_esp_reset(void *opaque, int irq, int level)
+static int parent_esp_reset(void *opaque, int irq, int level)
 {
     if (level)
         esp_reset(opaque);
+
+    return 1;
 }
 
 static uint32_t esp_mem_readb(void *opaque, target_phys_addr_t addr)
diff --git a/hw/fdc.c b/hw/fdc.c
index cd00420..b212d89 100644
--- a/hw/fdc.c
+++ b/hw/fdc.c
@@ -713,7 +713,7 @@ static void fdctrl_external_reset(void *opaque)
     fdctrl_reset(s, 0);
 }
 
-static void fdctrl_handle_tc(void *opaque, int irq, int level)
+static int fdctrl_handle_tc(void *opaque, int irq, int level)
 {
     //fdctrl_t *s = opaque;
 
@@ -721,6 +721,8 @@ static void fdctrl_handle_tc(void *opaque, int irq, int level)
         // XXX
         FLOPPY_DPRINTF("TC pulsed\n");
     }
+
+    return 1;
 }
 
 /* XXX: may change if moved to bdrv */
diff --git a/hw/i8259.c b/hw/i8259.c
index add6345..ab7eb2d 100644
--- a/hw/i8259.c
+++ b/hw/i8259.c
@@ -72,9 +72,9 @@ static uint64_t irq_count[16];
 #endif
 
 /* set irq level. If an edge is detected, then the IRR is set to 1 */
-static inline void pic_set_irq1(PicState *s, int irq, int level)
+static inline int pic_set_irq1(PicState *s, int irq, int level)
 {
-    int mask;
+    int mask, ret = 1;
     mask = 1 << irq;
     if (s->elcr & mask) {
         /* level triggered */
@@ -88,13 +88,18 @@ static inline void pic_set_irq1(PicState *s, int irq, int level)
     } else {
         /* edge triggered */
         if (level) {
-            if ((s->last_irr & mask) == 0)
+            if ((s->last_irr & mask) == 0) {
+                if((s->irr & mask))
+                    ret = 0;
                 s->irr |= mask;
+            }
             s->last_irr |= mask;
         } else {
             s->last_irr &= ~mask;
         }
     }
+
+    return (s->imr & mask) ? -1 : ret;
 }
 
 /* return the highest priority found in mask (highest = smallest
@@ -178,9 +183,10 @@ void pic_update_irq(PicState2 *s)
 int64_t irq_time[16];
 #endif
 
-static void i8259_set_irq(void *opaque, int irq, int level)
+static int i8259_set_irq(void *opaque, int irq, int level)
 {
     PicState2 *s = opaque;
+    int pic_ret, ioapic_ret;
 
 #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
     if (level != irq_level[irq]) {
@@ -199,11 +205,12 @@ static void i8259_set_irq(void *opaque, int irq, int level)
         irq_time[irq] = qemu_get_clock(vm_clock);
     }
 #endif
-    pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+    pic_ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
     /* used for IOAPIC irqs */
     if (s->alt_irq_func)
-        s->alt_irq_func(s->alt_irq_opaque, irq, level);
+        ioapic_ret = s->alt_irq_func(s->alt_irq_opaque, irq, level);
     pic_update_irq(s);
+    return MAX(pic_ret, ioapic_ret);
 }
 
 /* acknowledge interrupt 'irq' */
diff --git a/hw/ide.c b/hw/ide.c
index dc41982..d155bae 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -2942,7 +2942,7 @@ static void cmd646_update_irq(PCIIDEState *d)
 }
 
 /* the PCI irq level is the logical OR of the two channels */
-static void cmd646_set_irq(void *opaque, int channel, int level)
+static int cmd646_set_irq(void *opaque, int channel, int level)
 {
     PCIIDEState *d = opaque;
     int irq_mask;
@@ -2953,6 +2953,8 @@ static void cmd646_set_irq(void *opaque, int channel, int level)
     else
         d->dev.config[MRDMODE] &= ~irq_mask;
     cmd646_update_irq(d);
+
+    return 1;
 }
 
 /* CMD646 PCI IDE controller */
@@ -3345,7 +3347,7 @@ static inline void md_interrupt_update(struct md_s *s)
                     !(s->opt & OPT_SRESET));
 }
 
-static void md_set_irq(void *opaque, int irq, int level)
+static int md_set_irq(void *opaque, int irq, int level)
 {
     struct md_s *s = (struct md_s *) opaque;
     if (level)
@@ -3354,6 +3356,8 @@ static void md_set_irq(void *opaque, int irq, int level)
         s->stat &= ~STAT_INT;
 
     md_interrupt_update(s);
+
+    return 1;
 }
 
 static void md_reset(struct md_s *s)
diff --git a/hw/irq.c b/hw/irq.c
index eca707d..78ece4c 100644
--- a/hw/irq.c
+++ b/hw/irq.c
@@ -30,12 +30,12 @@ struct IRQState {
     int n;
 };
 
-void qemu_set_irq(qemu_irq irq, int level)
+int qemu_set_irq(qemu_irq irq, int level)
 {
     if (!irq)
-        return;
+        return 1;
 
-    irq->handler(irq->opaque, irq->n, level);
+    return irq->handler(irq->opaque, irq->n, level);
 }
 
 qemu_irq *qemu_allocate_irqs(qemu_irq_handler handler, void *opaque, int n)
@@ -56,11 +56,11 @@ qemu_irq *qemu_allocate_irqs(qemu_irq_handler handler, void *opaque, int n)
     return s;
 }
 
-static void qemu_notirq(void *opaque, int line, int level)
+static int qemu_notirq(void *opaque, int line, int level)
 {
     struct IRQState *irq = opaque;
 
-    irq->handler(irq->opaque, irq->n, !level);
+    return irq->handler(irq->opaque, irq->n, !level);
 }
 
 qemu_irq qemu_irq_invert(qemu_irq irq)
diff --git a/hw/irq.h b/hw/irq.h
index 0880ad2..36abdae 100644
--- a/hw/irq.h
+++ b/hw/irq.h
@@ -4,25 +4,29 @@
 /* Generic IRQ/GPIO pin infrastructure.  */
 
 /* FIXME: Rmove one of these.  */
-typedef void (*qemu_irq_handler)(void *opaque, int n, int level);
-typedef void SetIRQFunc(void *opaque, int irq_num, int level);
+typedef int (*qemu_irq_handler)(void *opaque, int n, int level);
+typedef int SetIRQFunc(void *opaque, int irq_num, int level);
 
-void qemu_set_irq(qemu_irq irq, int level);
+int qemu_set_irq(qemu_irq irq, int level);
 
-static inline void qemu_irq_raise(qemu_irq irq)
+static inline int qemu_irq_raise(qemu_irq irq)
 {
-    qemu_set_irq(irq, 1);
+    return qemu_set_irq(irq, 1);
 }
 
-static inline void qemu_irq_lower(qemu_irq irq)
+static inline int qemu_irq_lower(qemu_irq irq)
 {
-    qemu_set_irq(irq, 0);
+    return qemu_set_irq(irq, 0);
 }
 
-static inline void qemu_irq_pulse(qemu_irq irq)
+static inline int qemu_irq_pulse(qemu_irq irq)
 {
-    qemu_set_irq(irq, 1);
+    int ret;
+
+    ret = qemu_set_irq(irq, 1);
     qemu_set_irq(irq, 0);
+
+    return ret;
 }
 
 /* Returns an array of N IRQs.  */
diff --git a/hw/max7310.c b/hw/max7310.c
index 397950a..6d498d5 100644
--- a/hw/max7310.c
+++ b/hw/max7310.c
@@ -179,7 +179,7 @@ static int max7310_load(QEMUFile *f, void *opaque, int version_id)
 
 static int max7310_iid = 0;
 
-static void max7310_gpio_set(void *opaque, int line, int level)
+static int max7310_gpio_set(void *opaque, int line, int level)
 {
     struct max7310_s *s = (struct max7310_s *) opaque;
     if (line >= sizeof(s->handler) / sizeof(*s->handler) || line  < 0)
@@ -189,6 +189,8 @@ static void max7310_gpio_set(void *opaque, int line, int level)
         s->level |= s->direction & (1 << line);
     else
         s->level &= ~(s->direction & (1 << line));
+
+    return 1;
 }
 
 /* MAX7310 is SMBus-compatible (can be used with only SMBus protocols),
diff --git a/hw/pc.c b/hw/pc.c
index 4c5e1c3..8e52041 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -114,18 +114,20 @@ int cpu_get_pic_interrupt(CPUState *env)
     return intno;
 }
 
-static void pic_irq_request(void *opaque, int irq, int level)
+static int pic_irq_request(void *opaque, int irq, int level)
 {
     CPUState *env = first_cpu;
 
     if (!level)
-        return;
+        return 1;
 
     while (env) {
         if (apic_accept_pic_intr(env))
             apic_local_deliver(env, APIC_LINT0);
         env = env->next_cpu;
     }
+
+    return 1;
 }
 
 /* PC cmos mappings */
diff --git a/hw/pc.h b/hw/pc.h
index 0c98ff0..b789f98 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -47,7 +47,7 @@ int apic_accept_pic_intr(CPUState *env);
 void apic_local_deliver(CPUState *env, int vector);
 int apic_get_interrupt(CPUState *env);
 IOAPICState *ioapic_init(void);
-void ioapic_set_irq(void *opaque, int vector, int level);
+int ioapic_set_irq(void *opaque, int vector, int level);
 
 /* i8254.c */
 
diff --git a/hw/pci.c b/hw/pci.c
index bc55989..6d85ccb 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -47,7 +47,7 @@ struct PCIBus {
 };
 
 static void pci_update_mappings(PCIDevice *d);
-static void pci_set_irq(void *opaque, int irq_num, int level);
+static int pci_set_irq(void *opaque, int irq_num, int level);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -485,7 +485,7 @@ uint32_t pci_data_read(void *opaque, uint32_t addr, int len)
 /* generic PCI irq support */
 
 /* 0 <= irq_num <= 3. level must be 0 or 1 */
-static void pci_set_irq(void *opaque, int irq_num, int level)
+static int pci_set_irq(void *opaque, int irq_num, int level)
 {
     PCIDevice *pci_dev = (PCIDevice *)opaque;
     PCIBus *bus;
@@ -493,7 +493,7 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
 
     change = level - pci_dev->irq_state[irq_num];
     if (!change)
-        return;
+        return 1;
 
     pci_dev->irq_state[irq_num] = level;
     for (;;) {
@@ -505,6 +505,8 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     }
     bus->irq_count[irq_num] += change;
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
+
+    return 1;
 }
 
 /***********************************************************/
diff --git a/hw/ssd0323.c b/hw/ssd0323.c
index e2e619f..dc5eb88 100644
--- a/hw/ssd0323.c
+++ b/hw/ssd0323.c
@@ -265,11 +265,13 @@ static void ssd0323_invalidate_display(void * opaque)
 }
 
 /* Command/data input.  */
-static void ssd0323_cd(void *opaque, int n, int level)
+static int ssd0323_cd(void *opaque, int n, int level)
 {
     ssd0323_state *s = (ssd0323_state *)opaque;
     DPRINTF("%s mode\n", level ? "Data" : "Command");
     s->mode = level ? SSD0323_DATA : SSD0323_CMD;
+
+    return 1;
 }
 
 void *ssd0323_init(DisplayState *ds, qemu_irq *cmd_p)
diff --git a/hw/twl92230.c b/hw/twl92230.c
index 8d0ce5d..6d49bea 100644
--- a/hw/twl92230.c
+++ b/hw/twl92230.c
@@ -195,16 +195,18 @@ static inline int from_bcd(uint8_t val)
     return ((val >> 4) * 10) + (val & 0x0f);
 }
 
-static void menelaus_gpio_set(void *opaque, int line, int level)
+static int menelaus_gpio_set(void *opaque, int line, int level)
 {
     struct menelaus_s *s = (struct menelaus_s *) opaque;
 
     /* No interrupt generated */
     s->inputs &= ~(1 << line);
     s->inputs |= level << line;
+
+    return 1;
 }
 
-static void menelaus_pwrbtn_set(void *opaque, int line, int level)
+static int menelaus_pwrbtn_set(void *opaque, int line, int level)
 {
     struct menelaus_s *s = (struct menelaus_s *) opaque;
 
@@ -213,6 +215,8 @@ static void menelaus_pwrbtn_set(void *opaque, int line, int level)
         menelaus_update(s);
     }
     s->pwrbtn_state = level;
+
+    return 1;
 }
 
 #define MENELAUS_REV		0x01

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 2/3] Fix time drift problem under high load when PIT is in use.
  2008-06-29 14:02 [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load Gleb Natapov
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
@ 2008-06-29 14:02 ` Gleb Natapov
  2008-06-29 14:40   ` [Qemu-devel] " Jan Kiszka
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 3/3] Fix time drift problem under high load when RTC " Gleb Natapov
  2008-06-29 14:37 ` [Qemu-devel] Re: [PATCH 0/3] Fix guest time drift under heavy load Jan Kiszka
  3 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 14:02 UTC (permalink / raw)
  To: qemu-devel

Count the number of interrupts that was lost due to interrupt coalescing
and re-inject them back when possible. This fixes time drift problem when
pit is used as a time source.

Signed-off-by: Gleb Natapov <gleb@qumranet.com>
---

 hw/i8254.c |   20 +++++++++++++++++++-
 1 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/hw/i8254.c b/hw/i8254.c
index 4813b03..c4f0f46 100644
--- a/hw/i8254.c
+++ b/hw/i8254.c
@@ -61,6 +61,8 @@ static PITState pit_state;

 static void pit_irq_timer_update(PITChannelState *s, int64_t current_time);

+static uint32_t pit_irq_coalesced;
+
 static int pit_get_count(PITChannelState *s)
 {
     uint64_t d;
@@ -369,12 +371,28 @@ static void pit_irq_timer_update(PITChannelState *s, int64_t current_time)
         return;
     expire_time = pit_get_next_transition_time(s, current_time);
     irq_level = pit_get_out1(s, current_time);
-    qemu_set_irq(s->irq, irq_level);
+    if(irq_level) {
+        if(!qemu_irq_raise(s->irq))
+            pit_irq_coalesced++;
+    } else {
+        qemu_irq_lower(s->irq);
+        if(pit_irq_coalesced > 0) {
+            if(qemu_irq_raise(s->irq))
+                pit_irq_coalesced--;
+            qemu_irq_lower(s->irq);
+        }
+    }
+
 #ifdef DEBUG_PIT
     printf("irq_level=%d next_delay=%f\n",
            irq_level,
            (double)(expire_time - current_time) / ticks_per_sec);
 #endif
+    if(pit_irq_coalesced && expire_time != -1) {
+        uint32_t div = ((pit_irq_coalesced >> 10) & 0x7f) + 2;
+        expire_time -= ((expire_time - current_time) / div);
+    }
+
     s->next_transition_time = expire_time;
     if (expire_time != -1)
         qemu_mod_timer(s->irq_timer, expire_time);

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 3/3] Fix time drift problem under high load when RTC is in use.
  2008-06-29 14:02 [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load Gleb Natapov
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 2/3] Fix time drift problem under high load when PIT is in use Gleb Natapov
@ 2008-06-29 14:02 ` Gleb Natapov
  2008-06-29 14:37 ` [Qemu-devel] Re: [PATCH 0/3] Fix guest time drift under heavy load Jan Kiszka
  3 siblings, 0 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 14:02 UTC (permalink / raw)
  To: qemu-devel

Count the number of interrupts that was lost due to interrupt coalescing
and re-inject them back when possible. This fixes time drift problem when
RTC is used as a time source.

Signed-off-by: Gleb Natapov <gleb@qumranet.com>
---

 hw/mc146818rtc.c |   11 +++++++++--
 1 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index 30bb044..64e4c11 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -72,6 +72,7 @@ struct RTCState {

 static void rtc_set_time(RTCState *s);
 static void rtc_copy_date(RTCState *s);
+static uint32_t rtc_irq_coalesced;

 static void rtc_timer_update(RTCState *s, int64_t current_time)
 {
@@ -101,7 +102,8 @@ static void rtc_periodic_timer(void *opaque)

     rtc_timer_update(s, s->next_periodic_time);
     s->cmos_data[RTC_REG_C] |= 0xc0;
-    qemu_irq_raise(s->irq);
+    if(!qemu_irq_raise(s->irq))
+        rtc_irq_coalesced++;
 }

 static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data)
@@ -360,7 +362,12 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t addr)
         case RTC_REG_C:
             ret = s->cmos_data[s->cmos_index];
             qemu_irq_lower(s->irq);
-            s->cmos_data[RTC_REG_C] = 0x00;
+            if(!rtc_irq_coalesced) {
+                s->cmos_data[RTC_REG_C] = 0x00;
+            } else {
+                if(qemu_irq_raise(s->irq))
+                    rtc_irq_coalesced--;
+            }
             break;
         default:
             ret = s->cmos_data[s->cmos_index];

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
@ 2008-06-29 14:14   ` Avi Kivity
  2008-06-29 14:18     ` Gleb Natapov
  2008-06-29 14:38   ` Paul Brook
  1 sibling, 1 reply; 27+ messages in thread
From: Avi Kivity @ 2008-06-29 14:14 UTC (permalink / raw)
  To: gleb; +Cc: qemu-devel

Gleb Natapov wrote:
> The return value is less then zero if interrupt is masked, zero if it
> is known that interrupt is lost (due to coalescing) or greater then zero
> if interrupt is delivered or was successfully queued for delivery by
> interrupt controller. Device emulation can use this info as it pleases.
> Included patch adds detection of interrupt coalescing into PIC and APIC
> code for edge triggered interrupts.
>   

Instead of negative/positive/zero, consider returning an enum for 
readability.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:14   ` Avi Kivity
@ 2008-06-29 14:18     ` Gleb Natapov
  2008-06-29 14:53       ` Paul Brook
  0 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 14:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 05:14:01PM +0300, Avi Kivity wrote:
> Gleb Natapov wrote:
>> The return value is less then zero if interrupt is masked, zero if it
>> is known that interrupt is lost (due to coalescing) or greater then zero
>> if interrupt is delivered or was successfully queued for delivery by
>> interrupt controller. Device emulation can use this info as it pleases.
>> Included patch adds detection of interrupt coalescing into PIC and APIC
>> code for edge triggered interrupts.
>>   
>
> Instead of negative/positive/zero, consider returning an enum for  
> readability.
>
I thought about that, but I sometimes do arithmetics on those values
(when delivering interrupt to multiple CPUs), so result can be more then
1 sometimes.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] Re: [PATCH 0/3] Fix guest time drift under heavy load.
  2008-06-29 14:02 [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load Gleb Natapov
                   ` (2 preceding siblings ...)
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 3/3] Fix time drift problem under high load when RTC " Gleb Natapov
@ 2008-06-29 14:37 ` Jan Kiszka
  3 siblings, 0 replies; 27+ messages in thread
From: Jan Kiszka @ 2008-06-29 14:37 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 834 bytes --]

Gleb Natapov wrote:
> Resending one more time. There was no response last time I've sent it.
> 
> Qemu device emulation for timers might be inaccurate and
> causes coalescing of several IRQs into one. It happens when the
> load on the host is high and the guest did not manage to ack the
> previous IRQ. The first patch in the series fixes this by changing
> qemu_irq subsystem to return IRQ delivery status information.
> If device is notified that IRQs where lost it can regenerate them
> as needed. The following two patches add IRQ regeneration to PIC and RTC
> devices.

s/PIC/PIT. :)

I must have overslept the first posting. In fact, we are currently
discussing such measures for our scenario as well - we are also
suffering from lost PIT IRQs. So I will definitely give your series a
try next week!

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
  2008-06-29 14:14   ` Avi Kivity
@ 2008-06-29 14:38   ` Paul Brook
  2008-06-29 15:40     ` Gleb Natapov
  1 sibling, 1 reply; 27+ messages in thread
From: Paul Brook @ 2008-06-29 14:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

On Sunday 29 June 2008, Gleb Natapov wrote:
> The return value is less then zero if interrupt is masked, zero if it
> is known that interrupt is lost (due to coalescing) or greater then zero
> if interrupt is delivered or was successfully queued for delivery by
> interrupt controller. Device emulation can use this info as it pleases.
> Included patch adds detection of interrupt coalescing into PIC and APIC
> code for edge triggered interrupts.

This is woefully incomplete, and obviously hasn't been tested on anything 
other than x86 targets.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] Re: [PATCH 2/3] Fix time drift problem under high load when PIT is in use.
  2008-06-29 14:02 ` [Qemu-devel] [PATCH 2/3] Fix time drift problem under high load when PIT is in use Gleb Natapov
@ 2008-06-29 14:40   ` Jan Kiszka
  2008-06-29 15:52     ` Gleb Natapov
  0 siblings, 1 reply; 27+ messages in thread
From: Jan Kiszka @ 2008-06-29 14:40 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2262 bytes --]

Gleb Natapov wrote:
> Count the number of interrupts that was lost due to interrupt coalescing
> and re-inject them back when possible. This fixes time drift problem when
> pit is used as a time source.
> 
> Signed-off-by: Gleb Natapov <gleb@qumranet.com>
> ---
> 
>  hw/i8254.c |   20 +++++++++++++++++++-
>  1 files changed, 19 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/i8254.c b/hw/i8254.c
> index 4813b03..c4f0f46 100644
> --- a/hw/i8254.c
> +++ b/hw/i8254.c
> @@ -61,6 +61,8 @@ static PITState pit_state;
>  
>  static void pit_irq_timer_update(PITChannelState *s, int64_t current_time);
>  
> +static uint32_t pit_irq_coalesced;
> +
>  static int pit_get_count(PITChannelState *s)
>  {
>      uint64_t d;
> @@ -369,12 +371,28 @@ static void pit_irq_timer_update(PITChannelState *s, int64_t current_time)
>          return;
>      expire_time = pit_get_next_transition_time(s, current_time);
>      irq_level = pit_get_out1(s, current_time);
> -    qemu_set_irq(s->irq, irq_level);
> +    if(irq_level) {
> +        if(!qemu_irq_raise(s->irq))
> +            pit_irq_coalesced++;
> +    } else {
> +        qemu_irq_lower(s->irq);
> +        if(pit_irq_coalesced > 0) {
> +            if(qemu_irq_raise(s->irq))
> +                pit_irq_coalesced--;
> +            qemu_irq_lower(s->irq);
> +        }
> +    }

That's graspable for my poor brain: reinject one coalesced IRQ right
after the falling edge of an in-time delivery...

> +
>  #ifdef DEBUG_PIT
>      printf("irq_level=%d next_delay=%f\n",
>             irq_level,
>             (double)(expire_time - current_time) / ticks_per_sec);
>  #endif
> +    if(pit_irq_coalesced && expire_time != -1) {
> +        uint32_t div = ((pit_irq_coalesced >> 10) & 0x7f) + 2;
> +        expire_time -= ((expire_time - current_time) / div);
> +    }
> +

... but could you comment on this algorithm? I think I got what it does:
splitting up the next regular period in short intervals. But there are a
bit too many magic numbers involved. Where do they come from? What
happens if pit_irq_coalesced becomes large (or: how large can it become
without risking an IRQ storm on the guest)? A few comments would help, I
guess.

Thanks,
Jan

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:18     ` Gleb Natapov
@ 2008-06-29 14:53       ` Paul Brook
  2008-06-29 15:37         ` Gleb Natapov
  0 siblings, 1 reply; 27+ messages in thread
From: Paul Brook @ 2008-06-29 14:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

> >> The return value is less then zero if interrupt is masked, zero if it
> >> is known that interrupt is lost (due to coalescing) or greater then zero
> >> if interrupt is delivered or was successfully queued for delivery by
> >> interrupt controller. Device emulation can use this info as it pleases.
> >> Included patch adds detection of interrupt coalescing into PIC and APIC
> >> code for edge triggered interrupts.
> >
> > Instead of negative/positive/zero, consider returning an enum for
> > readability.
>
> I thought about that, but I sometimes do arithmetics on those values
> (when delivering interrupt to multiple CPUs), so result can be more then
> 1 sometimes.

At minimum you need to document the meaning of these values. Especially as 
you've now given two contradictory definitions - "greater than zero" isn't 
something you can do arithmetic with. If you require -1/0/+1 then you have to 
say that.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:53       ` Paul Brook
@ 2008-06-29 15:37         ` Gleb Natapov
  0 siblings, 0 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 15:37 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 03:53:55PM +0100, Paul Brook wrote:
> > >> The return value is less then zero if interrupt is masked, zero if it
> > >> is known that interrupt is lost (due to coalescing) or greater then zero
> > >> if interrupt is delivered or was successfully queued for delivery by
> > >> interrupt controller. Device emulation can use this info as it pleases.
> > >> Included patch adds detection of interrupt coalescing into PIC and APIC
> > >> code for edge triggered interrupts.
> > >
> > > Instead of negative/positive/zero, consider returning an enum for
> > > readability.
> >
> > I thought about that, but I sometimes do arithmetics on those values
> > (when delivering interrupt to multiple CPUs), so result can be more then
> > 1 sometimes.
> 
> At minimum you need to document the meaning of these values. Especially as 
> you've now given two contradictory definitions - "greater than zero" isn't 
> something you can do arithmetic with. If you require -1/0/+1 then you have to 
> say that.
In only place I do arithmetic on those values it is guarantied that the value
is either zero (cpu lost interrupt) or one (cpu received interrupt), so the sum
of those values make sense. If it greater then zero at least one processor will
process an interrupt. I will document the return value in a comment in the
next version of the patch.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 14:38   ` Paul Brook
@ 2008-06-29 15:40     ` Gleb Natapov
  2008-06-29 18:11       ` Paul Brook
  0 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 15:40 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 03:38:10PM +0100, Paul Brook wrote:
> On Sunday 29 June 2008, Gleb Natapov wrote:
> > The return value is less then zero if interrupt is masked, zero if it
> > is known that interrupt is lost (due to coalescing) or greater then zero
> > if interrupt is delivered or was successfully queued for delivery by
> > interrupt controller. Device emulation can use this info as it pleases.
> > Included patch adds detection of interrupt coalescing into PIC and APIC
> > code for edge triggered interrupts.
> 
> This is woefully incomplete, and obviously hasn't been tested on anything 
> other than x86 targets.
> 
Yes, you are right. It was not tested on anything other than x86. Do you
see why this approach will not work on other architectures? Can you
elaborate on what current patch is missing for other architectures
support? The initial goal is to fix RTC/PIT problem on x86 while do not
hurt any other architectures in any way.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] Fix time drift problem under high load when PIT is in use.
  2008-06-29 14:40   ` [Qemu-devel] " Jan Kiszka
@ 2008-06-29 15:52     ` Gleb Natapov
  0 siblings, 0 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 15:52 UTC (permalink / raw)
  To: qemu-devel

On Sun, Jun 29, 2008 at 04:40:43PM +0200, Jan Kiszka wrote:
> Gleb Natapov wrote:
> > Count the number of interrupts that was lost due to interrupt coalescing
> > and re-inject them back when possible. This fixes time drift problem when
> > pit is used as a time source.
> > 
> > Signed-off-by: Gleb Natapov <gleb@qumranet.com>
> > ---
> > 
> >  hw/i8254.c |   20 +++++++++++++++++++-
> >  1 files changed, 19 insertions(+), 1 deletions(-)
> > 
> > diff --git a/hw/i8254.c b/hw/i8254.c
> > index 4813b03..c4f0f46 100644
> > --- a/hw/i8254.c
> > +++ b/hw/i8254.c
> > @@ -61,6 +61,8 @@ static PITState pit_state;
> >  
> >  static void pit_irq_timer_update(PITChannelState *s, int64_t current_time);
> >  
> > +static uint32_t pit_irq_coalesced;
> > +
> >  static int pit_get_count(PITChannelState *s)
> >  {
> >      uint64_t d;
> > @@ -369,12 +371,28 @@ static void pit_irq_timer_update(PITChannelState *s, int64_t current_time)
> >          return;
> >      expire_time = pit_get_next_transition_time(s, current_time);
> >      irq_level = pit_get_out1(s, current_time);
> > -    qemu_set_irq(s->irq, irq_level);
> > +    if(irq_level) {
> > +        if(!qemu_irq_raise(s->irq))
> > +            pit_irq_coalesced++;
> > +    } else {
> > +        qemu_irq_lower(s->irq);
> > +        if(pit_irq_coalesced > 0) {
> > +            if(qemu_irq_raise(s->irq))
> > +                pit_irq_coalesced--;
> > +            qemu_irq_lower(s->irq);
> > +        }
> > +    }
> 
> That's graspable for my poor brain: reinject one coalesced IRQ right
> after the falling edge of an in-time delivery...
This works because I speed up timer that calls this function. I can
create another timer just for injection of lost interrupts instead of
doing it here.

> 
> > +
> >  #ifdef DEBUG_PIT
> >      printf("irq_level=%d next_delay=%f\n",
> >             irq_level,
> >             (double)(expire_time - current_time) / ticks_per_sec);
> >  #endif
> > +    if(pit_irq_coalesced && expire_time != -1) {
> > +        uint32_t div = ((pit_irq_coalesced >> 10) & 0x7f) + 2;
> > +        expire_time -= ((expire_time - current_time) / div);
> > +    }
> > +
> 
> ... but could you comment on this algorithm? I think I got what it does:
> splitting up the next regular period in short intervals. But there are a
> bit too many magic numbers involved. Where do they come from? What
> happens if pit_irq_coalesced becomes large (or: how large can it become
> without risking an IRQ storm on the guest)? A few comments would help, I
> guess.
> 

The numbers are come from empirical data :) The algorithm divide next
time interval by value that depends on number of coalesced interrupts.
The divider value can be any number between 2 and 129. The formula is:
div = (pit_irq_coalesced / A) % B + 2
I chose A and B so that bit operation could be used instead of
arithmetic.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 15:40     ` Gleb Natapov
@ 2008-06-29 18:11       ` Paul Brook
  2008-06-29 19:44         ` Gleb Natapov
  0 siblings, 1 reply; 27+ messages in thread
From: Paul Brook @ 2008-06-29 18:11 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: qemu-devel

On Sunday 29 June 2008, Gleb Natapov wrote:
> On Sun, Jun 29, 2008 at 03:38:10PM +0100, Paul Brook wrote:
> > On Sunday 29 June 2008, Gleb Natapov wrote:
> > > The return value is less then zero if interrupt is masked, zero if it
> > > is known that interrupt is lost (due to coalescing) or greater then
> > > zero if interrupt is delivered or was successfully queued for delivery
> > > by interrupt controller. Device emulation can use this info as it
> > > pleases. Included patch adds detection of interrupt coalescing into PIC
> > > and APIC code for edge triggered interrupts.
> >
> > This is woefully incomplete, and obviously hasn't been tested on anything
> > other than x86 targets.
>
> Yes, you are right. It was not tested on anything other than x86. Do you
> see why this approach will not work on other architectures? Can you
> elaborate on what current patch is missing for other architectures
> support? 

Well, if nothing else there's 40+ interrupt controllers that need fixing up 
before it'll even compile cleanly (mismatching function prototypes are IMHO 
not acceptable).

> The initial goal is to fix RTC/PIT problem on x86 while do not 
> hurt any other architectures in any way.

Well, you're introducing a fair amount of churn, so it'd better work for other 
architectures too. Otherwise we're liable to have to rewrite it later.  I 
can't say offhand whether your approach will work on other architectures. 
qemu has quite a wide variety of interrupt controllers and timers, you should 
check whether they fit into your model.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 18:11       ` Paul Brook
@ 2008-06-29 19:44         ` Gleb Natapov
  2008-06-29 20:34           ` Paul Brook
  0 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 19:44 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 07:11:01PM +0100, Paul Brook wrote:
> On Sunday 29 June 2008, Gleb Natapov wrote:
> > On Sun, Jun 29, 2008 at 03:38:10PM +0100, Paul Brook wrote:
> > > On Sunday 29 June 2008, Gleb Natapov wrote:
> > > > The return value is less then zero if interrupt is masked, zero if it
> > > > is known that interrupt is lost (due to coalescing) or greater then
> > > > zero if interrupt is delivered or was successfully queued for delivery
> > > > by interrupt controller. Device emulation can use this info as it
> > > > pleases. Included patch adds detection of interrupt coalescing into PIC
> > > > and APIC code for edge triggered interrupts.
> > >
> > > This is woefully incomplete, and obviously hasn't been tested on anything
> > > other than x86 targets.
> >
> > Yes, you are right. It was not tested on anything other than x86. Do you
> > see why this approach will not work on other architectures? Can you
> > elaborate on what current patch is missing for other architectures
> > support? 
> 
> Well, if nothing else there's 40+ interrupt controllers that need fixing up 
> before it'll even compile cleanly (mismatching function prototypes are IMHO 
> not acceptable).
Yes. 60 actually. I'll fix all of them.

> 
> > The initial goal is to fix RTC/PIT problem on x86 while do not 
> > hurt any other architectures in any way.
> 
> Well, you're introducing a fair amount of churn, so it'd better work for other 
> architectures too. Otherwise we're liable to have to rewrite it later.  I 
> can't say offhand whether your approach will work on other architectures. 
> qemu has quite a wide variety of interrupt controllers and timers, you should 
> check whether they fit into your model.
> 
The model takes into account that not all interrupt controller are
capable to detect missed interrupt (it is possible that there is no
interrupt controller at all). In this case irq function should
return one and everything will fall back to how it works now.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 19:44         ` Gleb Natapov
@ 2008-06-29 20:34           ` Paul Brook
  2008-06-29 20:49             ` Gleb Natapov
  2008-06-29 20:58             ` [Qemu-devel] " Jan Kiszka
  0 siblings, 2 replies; 27+ messages in thread
From: Paul Brook @ 2008-06-29 20:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

> The model takes into account that not all interrupt controller are
> capable to detect missed interrupt (it is possible that there is no
> interrupt controller at all). In this case irq function should
> return one and everything will fall back to how it works now.

On a related note the correct way to fix this is "-icount auto".
You should make sure that your hacks are not enabled when we have a realistic 
virtual timebase.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 20:34           ` Paul Brook
@ 2008-06-29 20:49             ` Gleb Natapov
  2008-06-30 13:26               ` Gleb Natapov
  2008-06-29 20:58             ` [Qemu-devel] " Jan Kiszka
  1 sibling, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-29 20:49 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 09:34:08PM +0100, Paul Brook wrote:
> > The model takes into account that not all interrupt controller are
> > capable to detect missed interrupt (it is possible that there is no
> > interrupt controller at all). In this case irq function should
> > return one and everything will fall back to how it works now.
> 
> On a related note the correct way to fix this is "-icount auto".
> You should make sure that your hacks are not enabled when we have a realistic 
> virtual timebase.
> 
Can you educate me what "-icount auto" is? Quick grep in qemu tree shows
nothing.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 20:34           ` Paul Brook
  2008-06-29 20:49             ` Gleb Natapov
@ 2008-06-29 20:58             ` Jan Kiszka
  2008-06-29 21:16               ` Paul Brook
  2008-06-30 13:18               ` Gleb Natapov
  1 sibling, 2 replies; 27+ messages in thread
From: Jan Kiszka @ 2008-06-29 20:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

[-- Attachment #1: Type: text/plain, Size: 973 bytes --]

Paul Brook wrote:
>> The model takes into account that not all interrupt controller are
>> capable to detect missed interrupt (it is possible that there is no
>> interrupt controller at all). In this case irq function should
>> return one and everything will fall back to how it works now.
> 
> On a related note the correct way to fix this is "-icount auto".
> You should make sure that your hacks are not enabled when we have a realistic 
> virtual timebase.

Virtual timebases don't help if the guest has to interact with the real
life - ie. clocks you cannot tweak. In the end, you will have to bring
them in sync again, so this approach basically points in the right
direction.

However, I started to wonder if this model is already enough to handle
the case where QEMU itself looses ticks. At least the PIT should
currently not be able to deal with such issues, so you also won't see
coalesce counters increase. Or am I missing something?

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 20:58             ` [Qemu-devel] " Jan Kiszka
@ 2008-06-29 21:16               ` Paul Brook
  2008-06-29 21:42                 ` Dor Laor
  2008-06-29 21:47                 ` Jan Kiszka
  2008-06-30 13:18               ` Gleb Natapov
  1 sibling, 2 replies; 27+ messages in thread
From: Paul Brook @ 2008-06-29 21:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov, Jan Kiszka

On Sunday 29 June 2008, Jan Kiszka wrote:
> Paul Brook wrote:
> >> The model takes into account that not all interrupt controller are
> >> capable to detect missed interrupt (it is possible that there is no
> >> interrupt controller at all). In this case irq function should
> >> return one and everything will fall back to how it works now.
> >
> > On a related note the correct way to fix this is "-icount auto".
> > You should make sure that your hacks are not enabled when we have a
> > realistic virtual timebase.
>
> Virtual timebases don't help if the guest has to interact with the real
> life - ie. clocks you cannot tweak. In the end, you will have to bring 
> them in sync again, so this approach basically points in the right
> direction.

That's exactly what -icount auto does. It gradually varies the virtual clock 
rate to keep it in line with the outside world. You get some clock skew, but 
that true whatever you do (including the proposed patch).

In my experience hacks to prevent "coalescing" of interrupts cause as many 
problems as they solve. The guest OS can get awfully confused when it 
compares two different time sources and gets wildly different and conflicting 
answers. I believe VMware has some fairly sophisticated hacks for this, but 
it often still doesn't work in practice.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 21:16               ` Paul Brook
@ 2008-06-29 21:42                 ` Dor Laor
  2008-06-29 21:47                 ` Jan Kiszka
  1 sibling, 0 replies; 27+ messages in thread
From: Dor Laor @ 2008-06-29 21:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov, Jan Kiszka

On Sun, 2008-06-29 at 22:16 +0100, Paul Brook wrote:
> On Sunday 29 June 2008, Jan Kiszka wrote:
> > Paul Brook wrote:
> > >> The model takes into account that not all interrupt controller are
> > >> capable to detect missed interrupt (it is possible that there is no
> > >> interrupt controller at all). In this case irq function should
> > >> return one and everything will fall back to how it works now.
> > >
> > > On a related note the correct way to fix this is "-icount auto".
> > > You should make sure that your hacks are not enabled when we have a
> > > realistic virtual timebase.
> >
> > Virtual timebases don't help if the guest has to interact with the real
> > life - ie. clocks you cannot tweak. In the end, you will have to bring 
> > them in sync again, so this approach basically points in the right
> > direction.
> 
> That's exactly what -icount auto does. It gradually varies the virtual clock 
> rate to keep it in line with the outside world. You get some clock skew, but 
> that true whatever you do (including the proposed patch).
> 

Do you mean it slows down the virtual clock?
It's good for emulation and correctness, but not for production VMs.

> In my experience hacks to prevent "coalescing" of interrupts cause as many 
> problems as they solve. The guest OS can get awfully confused when it 
> compares two different time sources and gets wildly different and conflicting 
> answers. I believe VMware has some fairly sophisticated hacks for this, but 
> it often still doesn't work in practice.
> 
> Paul
> 

The current approach works well for windows guests, whether they use rtc
(acpi HAL) or pit (standard HAL). It even works if dozens of coalesced
irqs are injected one after another.

The problem you're describing about two different time source can still
happen without time drift fix since there is no global synchronization
of all the time sources (including tsc [Xen do some of it]). Indeed in
theory such two time sources synchronization will jitter with this
patch, but practically it works for Windows. Linux will have its own
para virtual time and even without it offer greater control over the
time source.

The time drift problem is indeed something we need to deal since if you
use a RHEL host, it does not have a user space high precision time, so
qemu timer pops with up to 30-100msec delay. When multimedia is played
in windows guest the timer freq is 1000HZ. This is a potential loss of
100 irqs.

Regards,
Dor

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 21:16               ` Paul Brook
  2008-06-29 21:42                 ` Dor Laor
@ 2008-06-29 21:47                 ` Jan Kiszka
  2008-06-29 21:54                   ` Paul Brook
  1 sibling, 1 reply; 27+ messages in thread
From: Jan Kiszka @ 2008-06-29 21:47 UTC (permalink / raw)
  To: Paul Brook; +Cc: Gleb Natapov, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 1131 bytes --]

Paul Brook wrote:
> On Sunday 29 June 2008, Jan Kiszka wrote:
>> Paul Brook wrote:
>>>> The model takes into account that not all interrupt controller are
>>>> capable to detect missed interrupt (it is possible that there is no
>>>> interrupt controller at all). In this case irq function should
>>>> return one and everything will fall back to how it works now.
>>> On a related note the correct way to fix this is "-icount auto".
>>> You should make sure that your hacks are not enabled when we have a
>>> realistic virtual timebase.
>> Virtual timebases don't help if the guest has to interact with the real
>> life - ie. clocks you cannot tweak. In the end, you will have to bring 
>> them in sync again, so this approach basically points in the right
>> direction.
> 
> That's exactly what -icount auto does. It gradually varies the virtual clock 
> rate to keep it in line with the outside world. You get some clock skew, but 
> that true whatever you do (including the proposed patch).

Interesting. This seems to require a closer look, and maybe some
quantification of the potential overhead.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 257 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 21:47                 ` Jan Kiszka
@ 2008-06-29 21:54                   ` Paul Brook
  0 siblings, 0 replies; 27+ messages in thread
From: Paul Brook @ 2008-06-29 21:54 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov, Jan Kiszka

> > That's exactly what -icount auto does. It gradually varies the virtual
> > clock rate to keep it in line with the outside world. You get some clock
> > skew, but that true whatever you do (including the proposed patch).
>
> Interesting. This seems to require a closer look, and maybe some
> quantification of the potential overhead.

About 10% slower than normal qemu in my measurements.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] Re: [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 20:58             ` [Qemu-devel] " Jan Kiszka
  2008-06-29 21:16               ` Paul Brook
@ 2008-06-30 13:18               ` Gleb Natapov
  1 sibling, 0 replies; 27+ messages in thread
From: Gleb Natapov @ 2008-06-30 13:18 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 10:58:46PM +0200, Jan Kiszka wrote:
> However, I started to wonder if this model is already enough to handle
> the case where QEMU itself looses ticks. At least the PIT should
> currently not be able to deal with such issues, so you also won't see
> coalesce counters increase. Or am I missing something?
> 
You don't miss anything. If QEMU losses ticks the time will still drift
with this patches too.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-29 20:49             ` Gleb Natapov
@ 2008-06-30 13:26               ` Gleb Natapov
  2008-06-30 14:00                 ` Paul Brook
  0 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-30 13:26 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Sun, Jun 29, 2008 at 11:49:07PM +0300, Gleb Natapov wrote:
> On Sun, Jun 29, 2008 at 09:34:08PM +0100, Paul Brook wrote:
> > > The model takes into account that not all interrupt controller are
> > > capable to detect missed interrupt (it is possible that there is no
> > > interrupt controller at all). In this case irq function should
> > > return one and everything will fall back to how it works now.
> > 
> > On a related note the correct way to fix this is "-icount auto".
> > You should make sure that your hacks are not enabled when we have a realistic 
> > virtual timebase.
> > 
> Can you educate me what "-icount auto" is? Quick grep in qemu tree shows
> nothing.
> 
I forgot to rebase my branch that is why grep showed nothing ;) But
there is no much documentation about virtual time base. I can't see why
this hack cannot coexist with virtual time base. If virtual time base
somehow guaranties that interrupts will never be lost then qemu_irq_rase()
will never return zero and the hack will be as good as disabled.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-30 13:26               ` Gleb Natapov
@ 2008-06-30 14:00                 ` Paul Brook
  2008-06-30 14:28                   ` Gleb Natapov
  0 siblings, 1 reply; 27+ messages in thread
From: Paul Brook @ 2008-06-30 14:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

> I forgot to rebase my branch that is why grep showed nothing ;) But
> there is no much documentation about virtual time base. I can't see why
> this hack cannot coexist with virtual time base. If virtual time base
> somehow guaranties that interrupts will never be lost then qemu_irq_rase()
> will never return zero and the hack will be as good as disabled.

The virtual time base guarantees that a reasonable number of instruction will 
have been executed in a given virtual time period. Ticks can still be "lost" 
if the guest OS keeps the IRQ disable for a long time. However this behaviour 
matches that of real hardware, so they should not be reinserted later.

The only reason this is a problem with normal qemu is because the virtual CPU 
speed is extremely variable, and can stall for very long periods of time.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-30 14:00                 ` Paul Brook
@ 2008-06-30 14:28                   ` Gleb Natapov
  2008-06-30 14:35                     ` Paul Brook
  0 siblings, 1 reply; 27+ messages in thread
From: Gleb Natapov @ 2008-06-30 14:28 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

I am running QEMU with -icount auto and get many lines of
"Bad clock read" output.

--
			Gleb.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information.
  2008-06-30 14:28                   ` Gleb Natapov
@ 2008-06-30 14:35                     ` Paul Brook
  0 siblings, 0 replies; 27+ messages in thread
From: Paul Brook @ 2008-06-30 14:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gleb Natapov

On Monday 30 June 2008, Gleb Natapov wrote:
> I am running QEMU with -icount auto and get many lines of
> "Bad clock read" output.

I probably missed the rdtsc instructon.

Paul

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2008-06-30 14:35 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-06-29 14:02 [Qemu-devel] [PATCH 0/3] Fix guest time drift under heavy load Gleb Natapov
2008-06-29 14:02 ` [Qemu-devel] [PATCH 1/3] Change qemu_set_irq() to return status information Gleb Natapov
2008-06-29 14:14   ` Avi Kivity
2008-06-29 14:18     ` Gleb Natapov
2008-06-29 14:53       ` Paul Brook
2008-06-29 15:37         ` Gleb Natapov
2008-06-29 14:38   ` Paul Brook
2008-06-29 15:40     ` Gleb Natapov
2008-06-29 18:11       ` Paul Brook
2008-06-29 19:44         ` Gleb Natapov
2008-06-29 20:34           ` Paul Brook
2008-06-29 20:49             ` Gleb Natapov
2008-06-30 13:26               ` Gleb Natapov
2008-06-30 14:00                 ` Paul Brook
2008-06-30 14:28                   ` Gleb Natapov
2008-06-30 14:35                     ` Paul Brook
2008-06-29 20:58             ` [Qemu-devel] " Jan Kiszka
2008-06-29 21:16               ` Paul Brook
2008-06-29 21:42                 ` Dor Laor
2008-06-29 21:47                 ` Jan Kiszka
2008-06-29 21:54                   ` Paul Brook
2008-06-30 13:18               ` Gleb Natapov
2008-06-29 14:02 ` [Qemu-devel] [PATCH 2/3] Fix time drift problem under high load when PIT is in use Gleb Natapov
2008-06-29 14:40   ` [Qemu-devel] " Jan Kiszka
2008-06-29 15:52     ` Gleb Natapov
2008-06-29 14:02 ` [Qemu-devel] [PATCH 3/3] Fix time drift problem under high load when RTC " Gleb Natapov
2008-06-29 14:37 ` [Qemu-devel] Re: [PATCH 0/3] Fix guest time drift under heavy load Jan Kiszka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).