[PATCH] Std VGA Performance

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] Std VGA Performance
@ 2007-10-24 21:36 Ben Guthro
  2007-10-25 14:14 ` Keir Fraser
  2007-10-29 18:48 ` Alex Williamson
  0 siblings, 2 replies; 12+ messages in thread
From: Ben Guthro @ 2007-10-24 21:36 UTC (permalink / raw)
  To: xen-devel; +Cc: Robert Phillips

[-- Attachment #1: Type: text/plain, Size: 1310 bytes --]

This patch improves the performance of Standard VGA,
the mode used during Windows boot and by the Linux
splash screen.

It does so by buffering all the stdvga programmed output ops
and memory mapped ops (both reads and writes) that are sent to QEMU.

We maintain locally essential VGA state so we can respond
immediately to input and read ops without waiting for
QEMU.  We snoop output and write ops to keep our state
up-to-date.

PIO input ops are satisfied from cached state without
bothering QEMU.

PIO output and mmio ops are passed through to QEMU, including
mmio read ops.  This is necessary because mmio reads
can have side effects.

I have changed the format of the buffered_iopage.
It used to contain 80 elements of type ioreq_t (48 bytes each).
Now it contains 672 elements of type buf_ioreq_t (6 bytes each).
Being able to pipeline 8 times as many ops improves
VGA performance by a factor of 8.

I changed hvm_buffered_io_intercept to use the same
registration and callback mechanism as hvm_portio_intercept
rather than the hacky hardcoding it used before.

In platform.c, I fixed send_timeoffset_req() to set its
ioreq size to 8 (rather than 4), and its count to 1 (which
was missing).

Signed-off-by: Ben Guthro <bguthro@virtualiron.com>
Signed-off-by: Robert Phillips <rphillips@virtualiron.com>

[-- Attachment #2: stdvga-perf.patch --]
[-- Type: text/x-patch, Size: 33153 bytes --]

diff -r 118a21c66fd5 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c	Mon Oct 22 21:06:11 2007 +0100
+++ b/tools/ioemu/target-i386-dm/helper2.c	Wed Oct 24 17:31:57 2007 -0400
@@ -478,6 +478,7 @@ void cpu_ioreq_timeoffset(CPUState *env,
 
     time_offset += (ulong)req->data;
 
+    fprintf(logfile, "Time offset set %ld, added offset %ld\n", time_offset, req->data);
     sprintf(b, "%ld", time_offset);
     xenstore_vm_write(domid, "rtc/timeoffset", b);
 }
@@ -538,20 +539,39 @@ void __handle_ioreq(CPUState *env, ioreq
 
 void __handle_buffered_iopage(CPUState *env)
 {
-    ioreq_t *req = NULL;
+    buf_ioreq_t *buf_req = NULL;
+    ioreq_t req;
+    int qw = 0;
 
     if (!buffered_io_page)
         return;
 
     while (buffered_io_page->read_pointer !=
            buffered_io_page->write_pointer) {
-        req = &buffered_io_page->ioreq[buffered_io_page->read_pointer %
+        memset(&req, 0, sizeof(req));
+        buf_req = &buffered_io_page->buf_ioreq[buffered_io_page->read_pointer %
 				       IOREQ_BUFFER_SLOT_NUM];
-
-        __handle_ioreq(env, req);
+        req.size = 1UL << buf_req->size;
+        req.count = 1;
+        req.data = buf_req->data;
+        req.state = STATE_IOREQ_READY;
+        req.dir  = buf_req->dir;
+        req.type = buf_req->type;
+        qw = req.size == 8;
+        if (qw) {
+            req.data |= ((uint64_t)buf_req->addr) << 16;
+            buf_req = &buffered_io_page->buf_ioreq[(buffered_io_page->read_pointer+1) %
+                                               IOREQ_BUFFER_SLOT_NUM];
+            req.data |= ((uint64_t)buf_req->data) << 32;
+            req.data |= ((uint64_t)buf_req->addr) << 48;
+        }
+        else
+            req.addr = buf_req->addr;
+
+        __handle_ioreq(env, &req);
 
         mb();
-        buffered_io_page->read_pointer++;
+        buffered_io_page->read_pointer += qw ? 2 : 1;
     }
 }
 
diff -r 118a21c66fd5 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c	Mon Oct 22 21:06:11 2007 +0100
+++ b/tools/ioemu/xenstore.c	Wed Oct 24 17:31:57 2007 -0400
@@ -724,7 +724,7 @@ int xenstore_vm_write(int domid, char *k
 
     pasprintf(&buf, "%s/%s", path, key);
     rc = xs_write(xsh, XBT_NULL, buf, value, strlen(value));
-    if (rc) {
+    if (rc == 0) {
         fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
         goto out;
     }
diff -r 118a21c66fd5 xen/arch/x86/hvm/Makefile
--- a/xen/arch/x86/hvm/Makefile	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/arch/x86/hvm/Makefile	Wed Oct 24 17:31:57 2007 -0400
@@ -17,3 +17,4 @@ obj-y += vlapic.o
 obj-y += vlapic.o
 obj-y += vpic.o
 obj-y += save.o
+obj-y += stdvga.o
diff -r 118a21c66fd5 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/arch/x86/hvm/hvm.c	Wed Oct 24 17:31:57 2007 -0400
@@ -238,6 +238,8 @@ int hvm_domain_initialise(struct domain 
     if ( rc != 0 )
         return rc;
 
+    stdvga_init(d);
+
     hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
     hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
 
@@ -253,6 +255,7 @@ void hvm_domain_relinquish_resources(str
     rtc_deinit(d);
     pmtimer_deinit(d);
     hpet_deinit(d);
+    stdvga_deinit(d);
 }
 
 void hvm_domain_destroy(struct domain *d)
diff -r 118a21c66fd5 xen/arch/x86/hvm/intercept.c
--- a/xen/arch/x86/hvm/intercept.c	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/arch/x86/hvm/intercept.c	Wed Oct 24 17:31:57 2007 -0400
@@ -45,20 +45,6 @@ static struct hvm_mmio_handler *hvm_mmio
     &vioapic_mmio_handler
 };
 
-struct hvm_buffered_io_range {
-    unsigned long start_addr;
-    unsigned long length;
-};
-
-#define HVM_BUFFERED_IO_RANGE_NR 1
-
-static struct hvm_buffered_io_range buffered_stdvga_range = {0xA0000, 0x20000};
-static struct hvm_buffered_io_range
-*hvm_buffered_io_ranges[HVM_BUFFERED_IO_RANGE_NR] =
-{
-    &buffered_stdvga_range
-};
-
 static inline void hvm_mmio_access(struct vcpu *v,
                                    ioreq_t *p,
                                    hvm_mmio_read_t read_handler,
@@ -170,47 +156,68 @@ int hvm_buffered_io_send(ioreq_t *p)
     struct vcpu *v = current;
     struct hvm_ioreq_page *iorp = &v->domain->arch.hvm_domain.buf_ioreq;
     buffered_iopage_t *pg = iorp->va;
-
+    buf_ioreq_t bp;
+    /* Timeoffset sends 64b data, but no address.  Use two consecutive slots. */
+    int qw = 0;
+
+    /* Ensure buffered_iopage fits in a page */
+    BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
+
+    /* Return 0 for the cases we can't deal with. */
+    if (p->addr > 0xffffful || p->data_is_ptr || p->df || p->count != 1)
+        return 0;
+
+    bp.type = p->type;
+    bp.dir  = p->dir;
+    switch (p->size) {
+    case 1:
+        bp.size = 0;
+        break;
+    case 2:
+        bp.size = 1;
+        break;
+    case 4:
+        bp.size = 2;
+        break;
+    case 8:
+        bp.size = 3;
+        qw = 1;
+        gdprintk(XENLOG_INFO, "quadword ioreq type:%d data:%ld\n", p->type, p->data);
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "unexpected ioreq size:%ld\n", p->size);
+        return 0;
+    }
+    
+    bp.data = p->data;
+    bp.addr = qw ? ((p->data >> 16) & 0xfffful) : (p->addr & 0xffffful);
+    
     spin_lock(&iorp->lock);
 
-    if ( (pg->write_pointer - pg->read_pointer) == IOREQ_BUFFER_SLOT_NUM )
+    if ( (pg->write_pointer - pg->read_pointer) >= IOREQ_BUFFER_SLOT_NUM - (qw ? 1 : 0))
     {
         /* The queue is full: send the iopacket through the normal path. */
         spin_unlock(&iorp->lock);
         return 0;
     }
-
-    memcpy(&pg->ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM],
-           p, sizeof(ioreq_t));
+    
+    memcpy(&pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM],
+           &bp, sizeof(bp));
+    
+    if (qw) {
+        bp.data = p->data >> 32;
+        bp.addr = (p->data >> 48) & 0xfffful;
+        memcpy(&pg->buf_ioreq[(pg->write_pointer+1) % IOREQ_BUFFER_SLOT_NUM],
+               &bp, sizeof(bp));
+    }
 
     /* Make the ioreq_t visible /before/ write_pointer. */
     wmb();
-    pg->write_pointer++;
-
+    pg->write_pointer += qw ? 2 : 1;
+    
     spin_unlock(&iorp->lock);
-
+    
     return 1;
-}
-
-int hvm_buffered_io_intercept(ioreq_t *p)
-{
-    int i;
-
-    /* ignore READ ioreq_t! */
-    if ( p->dir == IOREQ_READ )
-        return 0;
-
-    for ( i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++ ) {
-        if ( p->addr >= hvm_buffered_io_ranges[i]->start_addr &&
-             p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr +
-                                     hvm_buffered_io_ranges[i]->length )
-            break;
-    }
-
-    if ( i == HVM_BUFFERED_IO_RANGE_NR )
-        return 0;
-
-    return hvm_buffered_io_send(p);
 }
 
 int hvm_mmio_intercept(ioreq_t *p)
@@ -253,7 +260,7 @@ int hvm_io_intercept(ioreq_t *p, int typ
         addr = handler->hdl_list[i].addr;
         size = handler->hdl_list[i].size;
         if (p->addr >= addr &&
-            p->addr <  addr + size)
+            p->addr + p->size <=  addr + size)
             return handler->hdl_list[i].action(p);
     }
     return 0;
diff -r 118a21c66fd5 xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/arch/x86/hvm/platform.c	Wed Oct 24 17:31:57 2007 -0400
@@ -944,7 +944,8 @@ void send_timeoffset_req(unsigned long t
     memset(p, 0, sizeof(*p));
 
     p->type = IOREQ_TYPE_TIMEOFFSET;
-    p->size = 4;
+    p->size = 8;
+    p->count = 1;
     p->dir = IOREQ_WRITE;
     p->data = timeoff;
 
diff -r 118a21c66fd5 xen/arch/x86/hvm/stdvga.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/stdvga.c	Wed Oct 24 17:34:54 2007 -0400
@@ -0,0 +1,712 @@
+/*
+ *  Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+ *
+ *  Portions have been modified by Virtual Iron Software, Inc.
+ *  (c) 2007. This file and the modifications can be redistributed and/or
+ *  modified under the terms and conditions of the GNU General Public
+ *  License, version 2.1 and not any later version of the GPL, as published
+ *  by the Free Software Foundation. 
+ *
+ *
+ *
+ *  This improves the performance of Standard VGA,
+ *  the mode used during Windows boot and by the Linux
+ *  splash screen.
+ *
+ *  It does so by buffering all the stdvga programmed output ops
+ *  and memory mapped ops (both reads and writes) that are sent to QEMU.
+ *
+ *  We maintain locally essential VGA state so we can respond
+ *  immediately to input and read ops without waiting for
+ *  QEMU.  We snoop output and write ops to keep our state
+ *  up-to-date.
+ *
+ *  PIO input ops are satisfied from cached state without
+ *  bothering QEMU.
+ *
+ *  PIO output and mmio ops are passed through to QEMU, including
+ *  mmio read ops.  This is necessary because mmio reads
+ *  can have side effects.
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/sched.h>
+#include <asm/hvm/support.h>
+/* Shadow-VRAM accessors: _a is an element index; its high bits pick one of the 64 vram_ptr[] pages, its low bits the element inside that page. */
+#define vram_b(_s, _a) (((uint8_t*) (_s)->vram_ptr[((_a)>>12)&0x3f])[(_a)&0xfff])
+#define vram_w(_s, _a) (((uint16_t*)(_s)->vram_ptr[((_a)>>11)&0x3f])[(_a)&0x7ff])
+#define vram_l(_s, _a) (((uint32_t*)(_s)->vram_ptr[((_a)>>10)&0x3f])[(_a)&0x3ff])
+
+#ifdef STDVGA_STATS
+#define UPDATE_STATS(x) x
+#else
+#define UPDATE_STATS(x)
+#endif
+/* mask16[n] expands the four low bits of n to bytes: bit i set makes byte i of the result 0xff. */
+#define PAT(x) (x)
+static const uint32_t mask16[16] = {
+    PAT(0x00000000),
+    PAT(0x000000ff),
+    PAT(0x0000ff00),
+    PAT(0x0000ffff),
+    PAT(0x00ff0000),
+    PAT(0x00ff00ff),
+    PAT(0x00ffff00),
+    PAT(0x00ffffff),
+    PAT(0xff000000),
+    PAT(0xff0000ff),
+    PAT(0xff00ff00),
+    PAT(0xff00ffff),
+    PAT(0xffff0000),
+    PAT(0xffff00ff),
+    PAT(0xffffff00),
+    PAT(0xffffffff),
+};
+
+/* force some bits to zero: AND-masks stdvga_outb() applies to sequencer register writes (index 6 is handled separately there) */
+const uint8_t sr_mask[8] = {
+    (uint8_t)~0xfc,
+    (uint8_t)~0xc2,
+    (uint8_t)~0xf0,
+    (uint8_t)~0xc0,
+    (uint8_t)~0xf1,
+    (uint8_t)~0xff,
+    (uint8_t)~0xff,
+    (uint8_t)~0x00,
+};
+/* AND-masks for graphics register writes; entries 0x09-0x0f are not listed and are therefore zero. */
+const uint8_t gr_mask[16] = {
+    (uint8_t)~0xf0, /* 0x00 */
+    (uint8_t)~0xf0, /* 0x01 */
+    (uint8_t)~0xf0, /* 0x02 */
+    (uint8_t)~0xe0, /* 0x03 */
+    (uint8_t)~0xfc, /* 0x04 */
+    (uint8_t)~0x84, /* 0x05 */
+    (uint8_t)~0xf0, /* 0x06 */
+    (uint8_t)~0xf0, /* 0x07 */
+    (uint8_t)~0x00, /* 0x08 */
+};
+
+/* Return the locally cached byte behind one sequencer/graphics port; other ports log a warning and read as 0. */
+static uint64_t stdvga_inb(uint64_t addr)
+{
+    struct hvm_hw_stdvga *vga = &current->domain->arch.hvm_domain.stdvga;
+    uint8_t byte = 0;
+
+    if (addr == 0x3c4) {
+        /* sequencer address register */
+        byte = vga->sr_index;
+    }
+    else if (addr == 0x3c5) {
+        /* sequencer data register: an index beyond sr[] reads as 0 */
+        if (vga->sr_index < sizeof(vga->sr))
+            byte = vga->sr[vga->sr_index];
+    }
+    else if (addr == 0x3ce) {
+        /* graphics address register */
+        byte = vga->gr_index;
+    }
+    else if (addr == 0x3cf) {
+        /* graphics data register */
+        byte = vga->gr[vga->gr_index];
+    }
+    else
+        gdprintk(XENLOG_WARNING, "unexpected io addr 0x%04x\n", (int)addr);
+    return byte;
+}
+
+/*
+ * Satisfy a port read from the locally cached sequencer and graphics
+ * registers.
+ *
+ * p: the request; p->addr is the first port, p->size the access width.
+ *
+ * Returns the bytes at ports p->addr .. p->addr + p->size - 1 packed
+ * little-endian, i.e. the lowest port in the least significant byte.
+ *
+ * Ports that stdvga_inb() does not know read as 0.  A width other than
+ * 1, 2, 4 or 8 logs a warning and yields 0.
+ */
+static uint64_t stdvga_in(ioreq_t *p)
+{
+    uint64_t value = 0;
+    unsigned int lane;
+
+    switch (p->size) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+        /* Ports are visited in ascending order, one byte lane each. */
+        for (lane = 0; lane < p->size; lane++) {
+            uint64_t byte = stdvga_inb(p->addr + lane);
+
+            value |= byte << (8 * lane);
+        }
+        break;
+
+    default:
+        gdprintk(XENLOG_WARNING, "invalid io size:%d\n", (int)p->size);
+        break;
+    }
+
+    return value;
+}
+
+static void stdvga_outb(uint64_t addr, uint8_t val)
+{
+    /* Bookkeep (via snooping) the sequencer and graphics registers, then
+     * recompute whether the guest is in standard VGA mode. */
+    struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+    int prev_stdvga = s->stdvga;    /* remembered to detect a mode change below */
+
+    switch (addr) {
+    case 0x3c4:                 /* sequencer address register */
+        s->sr_index = val;
+        break;
+
+    case 0x3c5:                 /* sequencer data register */
+        switch (s->sr_index) {
+        case 0x00 ... 0x05:
+        case 0x07:
+            s->sr[s->sr_index] = val & sr_mask[s->sr_index];
+            break;
+        case 0x06:
+            s->sr[s->sr_index] = ((val & 0x17) == 0x12) ? 0x12 : 0x0f;
+            break;
+        default:
+            if (s->sr_index < sizeof(s->sr))    /* writes beyond sr[] are dropped */
+                s->sr[s->sr_index] = val;
+            break;
+        }
+        break;
+
+    case 0x3ce:                 /* graphics address register */
+        s->gr_index = val;
+        break;
+
+    case 0x3cf:                 /* graphics data register */
+        if (s->gr_index < sizeof(gr_mask)) {
+            s->gr[s->gr_index] = val & gr_mask[s->gr_index];
+        }
+        else if (s->gr_index == 0xff && s->vram_ptr != NULL) {  /* NOTE(review): vram_ptr is an array member, so "!= NULL" is always true */
+            uint32_t addr;
+            for (addr = 0xa0000; addr < 0xa4000; addr += 2)
+                vram_w(s, addr) = (val << 8) | s->gr[0xfe];
+        }
+        else
+            s->gr[s->gr_index] = val;
+        break;
+    }
+
+    /* When in standard vga mode, emulate here all writes to the vram buffer
+     * so we can immediately satisfy reads without waiting for qemu. */
+    s->stdvga =
+        s->sr[0x07] == 0 &&          /* standard vga mode */
+        s->gr[6] == 0x05;            /* misc graphics register w/ MemoryMapSelect=1  0xa0000-0xaffff (64K region) and AlphaDis=1 */
+
+    if (!prev_stdvga && s->stdvga) {
+        s->cache = 1;       /* (re)start caching video buffer */
+        gdprintk(XENLOG_INFO, "entering stdvga and caching modes\n");
+    }
+    else
+    if (prev_stdvga && !s->stdvga)
+        gdprintk(XENLOG_INFO, "leaving  stdvga\n");
+}
+
+/*
+ * Snoop a port write of the given width into the cached register state.
+ *
+ * addr: first port written.
+ * data: value written; byte n of it goes to port addr + n.
+ * size: access width in bytes; anything but 1, 2, 4 or 8 only logs a
+ *       warning.
+ *
+ * Every byte is handed to stdvga_outb(), lowest port first, so the
+ * standard-VGA mode tracking done there sees the bytes in that order.
+ */
+static void stdvga_outv(uint64_t addr, uint64_t data, uint32_t size)
+{
+    uint32_t lane;
+
+    switch (size) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+        /* 8 * lane never exceeds 56, so the 64-bit shift is always
+         * well defined. */
+        for (lane = 0; lane < size; lane++) {
+            uint8_t byte = data >> (8 * lane);
+
+            stdvga_outb(addr + lane, byte);
+        }
+        break;
+
+    default:
+        gdprintk(XENLOG_WARNING, "invalid io size:%d\n", size);
+        break;
+    }
+}
+
+/* Snoop a port write: feed the value, or each element of a guest buffer, to stdvga_outv(). */
+static void stdvga_out(ioreq_t *p)
+{
+    uint64_t port = p->addr, src = p->data, elem;
+    int n, step = p->df ? -1 : 1;
+
+    if (!p->data_is_ptr) {
+        stdvga_outv(p->addr, p->data, p->size);
+        return;
+    }
+    for (n = 0; n < p->count; n++, src += step * p->size, port += step * p->size) {
+        hvm_copy_from_guest_phys(&elem, src, p->size);
+        stdvga_outv(port, elem, p->size);
+    }
+}
+/* Port I/O intercept for the sequencer (0x3c4/0x3c5) and graphics (0x3ce/0x3cf) registers.
+ * Returns 1 if the request was queued by hvm_buffered_io_send(), 0 otherwise. */
+int stdvga_intercept_pio(ioreq_t *p)
+{
+    struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+    int buf = 0;
+
+    if (p->size > 8) {
+        gdprintk(XENLOG_WARNING, "stdvga bad access size %d\n", (int)p->size);
+        return 0;
+    }
+
+    spin_lock(&s->lock);
+    if ( p->dir == IOREQ_READ ) {
+        if (p->size != 1)
+            gdprintk(XENLOG_WARNING, "unexpected io size:%d\n", (int)p->size);
+        /* With data_is_ptr set, p->data holds the guest address of the buffer and
+         * the request is never queued, so p->data must not be overwritten. */
+        if (!p->data_is_ptr &&
+            !(p->addr == 0x3c5 && s->sr_index >= sizeof(sr_mask)) &&
+            !(p->addr == 0x3cf && s->gr_index >= sizeof(gr_mask))) {
+            p->data = stdvga_in(p);
+            buf = 1;
+        }
+    }
+    else {
+        stdvga_out(p);
+        buf = 1;
+    }
+    if (buf && hvm_buffered_io_send(p)) {
+        UPDATE_STATS(p->dir == IOREQ_READ ? s->stats.nr_pio_buffered_rd++ : s->stats.nr_pio_buffered_wr++);
+        spin_unlock(&s->lock);
+        return 1;
+    }
+    UPDATE_STATS(p->dir == IOREQ_READ ? s->stats.nr_pio_unbuffered_rd++ : s->stats.nr_pio_unbuffered_wr++);
+    spin_unlock(&s->lock);
+    return 0;
+}
+
+#define GET_PLANE(data, p) (((data) >> ((p) * 8)) & 0xff)  /* byte lane p of data */
+/* Read one byte of the VGA window from the shadow VRAM, using the cached sequencer/graphics mode bits. */
+static uint8_t stdvga_mem_readb(uint64_t addr)
+{
+    struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+    int plane;
+    uint32_t ret;
+
+    addr &= 0x1ffff;            /* keep the low 17 address bits */
+    if (addr >= 0x10000)
+        return 0xff;            /* offsets of 64K and above are not emulated here */
+
+    if (s->sr[4] & 0x08) {
+        /* chain 4 mode : simplest access */
+        ret = vram_b(s, addr);
+    } else if (s->gr[5] & 0x10) {
+        /* odd/even mode (aka text mode mapping) */
+        plane = (s->gr[4] & 2) | (addr & 1);
+        ret = vram_b(s, ((addr & ~1) << 1) | plane);
+    } else {
+        /* standard VGA latched access */
+        s->latch = vram_l(s, addr);     /* every read on this path reloads the latch */
+
+        if (!(s->gr[5] & 0x08)) {
+            /* read mode 0 */
+            plane = s->gr[4];
+            ret = GET_PLANE(s->latch, plane);
+        } else {
+            /* read mode 1 */
+            ret = (s->latch ^ mask16[s->gr[2]]) & mask16[s->gr[7]];
+            ret |= ret >> 16;
+            ret |= ret >> 8;
+            ret = (~ret) & 0xff;
+        }
+    }
+    return ret;
+}
+
+/*
+ * Read size bytes from the shadowed VGA window starting at addr and pack
+ * them little-endian (the byte at addr ends up in bits 0-7).
+ *
+ * Only widths 1, 2 and 4 are supported; any other size logs a warning and
+ * returns 0.
+ */
+static uint32_t stdvga_mem_read(uint32_t addr, uint32_t size)
+{
+    uint32_t data = 0, i;
+
+    switch (size) {
+    case 1:
+    case 2:
+    case 4:
+        /* Widen before shifting: the uint8_t result would otherwise be
+         * promoted to int, and shifting a byte >= 0x80 left by 24 overflows
+         * a signed int. */
+        for (i = 0; i < size; i++)
+            data |= (uint32_t)stdvga_mem_readb(addr + i) << (8 * i);
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "invalid io size:%d\n", size);
+    }
+    return data;
+}
+/* Apply one guest byte write to the shadow VRAM, using the cached sequencer/graphics write settings. */
+static void stdvga_mem_writeb(uint64_t addr, uint32_t val)
+{
+    struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+    int plane, write_mode, b, func_select, mask;
+    uint32_t write_mask, bit_mask, set_mask;
+
+    addr &= 0x1ffff;            /* keep the low 17 address bits */
+    if (addr >= 0x10000)
+        return;                 /* offsets of 64K and above are ignored here */
+
+    if (s->sr[4] & 0x08) {
+        /* chain 4 mode : simplest access */
+        plane = addr & 3;
+        mask = (1 << plane);
+        if (s->sr[2] & mask) {
+            vram_b(s, addr) = val;
+        }
+    } else if (s->gr[5] & 0x10) {
+        /* odd/even mode (aka text mode mapping) */
+        plane = (s->gr[4] & 2) | (addr & 1);
+        mask = (1 << plane);
+        if (s->sr[2] & mask) {
+            addr = ((addr & ~1) << 1) | plane;
+            vram_b(s, addr) = val;
+        }
+    } else {
+        write_mode = s->gr[5] & 3;
+        switch(write_mode) {
+        default:
+        case 0:
+            /* rotate */
+            b = s->gr[3] & 7;
+            val = ((val >> b) | (val << (8 - b))) & 0xff;
+            val |= val << 8;
+            val |= val << 16;
+
+            /* apply set/reset mask */
+            set_mask = mask16[s->gr[1]];
+            val = (val & ~set_mask) | (mask16[s->gr[0]] & set_mask);
+            bit_mask = s->gr[8];
+            break;
+        case 1:
+            val = s->latch;
+            goto do_write;      /* the latch is written as-is: no logical op, no bit mask */
+        case 2:
+            val = mask16[val & 0x0f];
+            bit_mask = s->gr[8];
+            break;
+        case 3:
+            /* rotate */
+            b = s->gr[3] & 7;
+            val = (val >> b) | (val << (8 - b));
+
+            bit_mask = s->gr[8] & val;
+            val = mask16[s->gr[0]];
+            break;
+        }
+
+        /* apply logical operation */
+        func_select = s->gr[3] >> 3;
+        switch(func_select) {
+        case 0:
+        default:
+            /* nothing to do */
+            break;
+        case 1:
+            /* and */
+            val &= s->latch;
+            break;
+        case 2:
+            /* or */
+            val |= s->latch;
+            break;
+        case 3:
+            /* xor */
+            val ^= s->latch;
+            break;
+        }
+
+        /* apply bit mask */
+        bit_mask |= bit_mask << 8;
+        bit_mask |= bit_mask << 16;
+        val = (val & bit_mask) | (s->latch & ~bit_mask);
+
+    do_write:
+        /* mask data according to sr[2] */
+        mask = s->sr[2];
+        write_mask = mask16[mask];
+        vram_l(s, addr) =
+            (vram_l(s, addr) & ~write_mask) |
+            (val & write_mask);
+    }
+}
+
+/*
+ * Apply a guest write of size bytes to the shadowed VGA window.
+ *
+ * addr: address of the first byte.
+ * data: value written; byte n of it is stored at addr + n.
+ * size: 1, 2 or 4; any other width only logs a warning.
+ */
+static void stdvga_mem_write(uint32_t addr, uint32_t data, uint32_t size)
+{
+    uint32_t n;
+
+    switch (size) {
+    case 1:
+    case 2:
+    case 4:
+        /* Bytes go out lowest address first. */
+        for (n = 0; n < size; n++)
+            stdvga_mem_writeb(addr + n, (data >> (8 * n)) & 0xff);
+        break;
+
+    default:
+        gdprintk(XENLOG_WARNING, "invalid io size:%d\n", size);
+    }
+}
+
+static uint32_t read_data;
+
+/*
+ * Emulate an IOREQ_TYPE_COPY access to the shadowed VGA window.
+ *
+ * s: per-domain stdvga state (not used directly here).
+ * p: the request.  With data_is_ptr set, p->data is the guest-physical
+ *    address of the data buffer; otherwise it carries the value itself.
+ *
+ * Always returns 1.
+ */
+static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p)
+{
+    int i;
+    int sign = p->df ? -1 : 1;
+    uint32_t addr = p->addr;
+    /* Both are 64 bits wide on purpose: data may be a guest-physical address
+     * above 4GB, and tmp is the target/source of copies of up to 8 bytes. */
+    uint64_t data = p->data, tmp = 0;
+
+    if (p->data_is_ptr) {
+        for (i = 0; i < p->count; i++) {
+            if (p->dir == IOREQ_READ) {
+                tmp = stdvga_mem_read(addr, p->size);
+                hvm_copy_to_guest_phys(data, &tmp, p->size);
+            }
+            else {
+                hvm_copy_from_guest_phys(&tmp, data, p->size);
+                stdvga_mem_write(addr, tmp, p->size);
+            }
+            data += sign * p->size;
+            addr += sign * p->size;
+        }
+    }
+    else {
+        for (i = 0; i < p->count; i++) {
+            if (p->dir == IOREQ_READ)
+                p->data = stdvga_mem_read(addr, p->size);
+            else
+                stdvga_mem_write(addr, p->data, p->size);
+            addr += sign * p->size;
+        }
+    }
+
+    read_data = p->data;
+    return 1;
+}
+/* Binary operators for the read-modify-write request types, and the table mmio_op() indexes by p->type. */
+static uint32_t op_and(uint32_t lhs, uint32_t rhs) { return lhs & rhs; }
+static uint32_t op_or (uint32_t lhs, uint32_t rhs) { return lhs | rhs; }
+static uint32_t op_xor(uint32_t lhs, uint32_t rhs) { return lhs ^ rhs; }
+static uint32_t op_add(uint32_t lhs, uint32_t rhs) { return lhs + rhs; }
+static uint32_t op_sub(uint32_t lhs, uint32_t rhs) { return lhs - rhs; }
+static uint32_t (*op_array[])(uint32_t, uint32_t) = {
+    [IOREQ_TYPE_AND] = op_and,
+    [IOREQ_TYPE_OR ] = op_or,
+    [IOREQ_TYPE_XOR] = op_xor,
+    [IOREQ_TYPE_ADD] = op_add,
+    [IOREQ_TYPE_SUB] = op_sub,
+};
+
+/* Apply a read-modify-write request (AND/OR/XOR/ADD/SUB) to the shadow VRAM.
+ * Always returns 0: p->data must stay untouched because QEMU still needs it,
+ * so these operations are never buffered. */
+static int mmio_op(struct hvm_hw_stdvga *s, ioreq_t *p)
+{
+    uint32_t before = stdvga_mem_read(p->addr, p->size);
+
+    if (p->dir == IOREQ_WRITE)
+        stdvga_mem_write(p->addr, (op_array[p->type])(before, p->data), p->size);
+    return 0;
+}
+
+/*
+ * Intercept for accesses to the VGA window registered in stdvga_init().
+ *
+ * While the guest is in standard VGA mode and caching is on, the access
+ * is first replayed on the shadow VRAM.  Returns 1 if the request was then
+ * queued by hvm_buffered_io_send(), 0 otherwise (including every request
+ * the shadow path did not handle).
+ */
+int stdvga_intercept_mmio(ioreq_t *p)
+{
+    struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+    int queued = 0;
+
+    if (p->size > 8) {
+        gdprintk(XENLOG_WARNING, "invalid mmio size %d\n", (int)p->size);
+        return 0;
+    }
+
+    spin_lock(&s->lock);
+    if (s->stdvga && s->cache) {
+        if (p->type == IOREQ_TYPE_COPY)
+            queued = mmio_move(s, p);
+        else if (p->type == IOREQ_TYPE_AND || p->type == IOREQ_TYPE_OR ||
+                 p->type == IOREQ_TYPE_XOR || p->type == IOREQ_TYPE_ADD ||
+                 p->type == IOREQ_TYPE_SUB)
+            queued = mmio_op(s, p);
+        else {
+            gdprintk(XENLOG_ERR, "unsupported mmio request type:%d "
+                     "addr:0x%04x data:0x%04x size:%d count:%d state:%d isptr:%d dir:%d df:%d\n",
+                     p->type,
+                     (int)p->addr, (int)p->data, (int)p->size, (int)p->count, p->state,
+                     p->data_is_ptr, p->dir, p->df);
+            s->cache = 0;    /* disables the shadow path until stdvga mode is re-entered */
+        }
+    }
+
+    if (queued && hvm_buffered_io_send(p)) {
+        UPDATE_STATS(p->dir == IOREQ_READ ? s->stats.nr_mmio_buffered_rd++ : s->stats.nr_mmio_buffered_wr++);
+        spin_unlock(&s->lock);
+        return 1;
+    }
+    UPDATE_STATS(p->dir == IOREQ_READ ? s->stats.nr_mmio_unbuffered_rd++ : s->stats.nr_mmio_unbuffered_wr++);
+    spin_unlock(&s->lock);
+    return 0;
+}
+
+/* Reset d's stdvga state, allocate one zeroed page per vram_ptr[] slot and,
+ * only if every slot could be filled, register the port and mmio intercepts. */
+void stdvga_init(struct domain *d)
+{
+    struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga;
+    struct page_info *pg;
+    int n = 0;
+
+    memset(s, 0, sizeof(*s));
+    spin_lock_init(&s->lock);
+    while (n != ARRAY_SIZE(s->vram_ptr) && (pg = alloc_domheap_page(NULL)) != NULL) {
+        s->vram_ptr[n] = page_to_virt(pg);
+        memset(s->vram_ptr[n], 0, PAGE_SIZE);
+        n++;
+    }
+    if (n != ARRAY_SIZE(s->vram_ptr))
+        return;    /* pages obtained so far stay recorded in vram_ptr[] */
+    register_portio_handler(d, 0x3c4, 2, stdvga_intercept_pio); /* sequencer registers */
+    register_portio_handler(d, 0x3ce, 2, stdvga_intercept_pio); /* graphics registers */
+    register_buffered_io_handler(d, 0xa0000, 0x10000, stdvga_intercept_mmio); /* mmio */
+}
+
+/* Free every shadow VRAM page still recorded in d's vram_ptr[] table and clear its slot. */
+void stdvga_deinit(struct domain *d)
+{
+    struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga;
+    int slot;
+
+    for (slot = 0; slot != ARRAY_SIZE(s->vram_ptr); slot++) {
+        if (s->vram_ptr[slot] != NULL) {
+            free_domheap_page(virt_to_page(s->vram_ptr[slot]));
+            s->vram_ptr[slot] = NULL;
+        }
+    }
+}
+
+#ifdef STDVGA_STATS
+/* Print nr register bytes in rows of eight, each row prefixed "name[0xNN]". */
+static void stdvga_dump_regs(const char *name, const uint8_t *regs, int nr)
+{
+    int i;
+    for (i = 0; i != nr; i++) {
+        if (i % 8 == 0)
+            printk("    %s[0x%02x] ", name, i);
+        printk("%02x ", regs[i]);
+        if (i % 8 == 7)
+            printk("\n");
+    }
+    /* The loop ends a row only after its eighth byte, so a row is still open
+     * exactly when nr is not a multiple of 8 (testing against 7 was wrong). */
+    if (nr % 8 != 0)
+        printk("\n");
+}
+
+/* Keyhandler: dump the stdvga state and statistics of every HVM domain, then reset the statistics.
+ * key: the key that was pressed; it is only echoed in the first line of output. */
+static void stdvga_stats_dump(unsigned char key)
+{
+    struct domain *d;
+
+    printk("%s: key '%c' pressed\n", __FUNCTION__, key);
+
+    rcu_read_lock(&domlist_read_lock);
+
+    for_each_domain ( d )
+    {
+        struct hvm_hw_stdvga *s;
+
+        if ( !is_hvm_domain(d) )
+            continue;
+
+        s = &d->arch.hvm_domain.stdvga;
+        spin_lock(&s->lock);
+        printk("\n>>> Domain %d <<<\n", d->domain_id);
+        printk("    modes: stdvga:%d caching:%d\n", s->stdvga, s->cache);
+        printk("                       %8s %8s\n", "read", "write");
+        printk("    nr_mmio_buffered:  %8u %8u\n", s->stats.nr_mmio_buffered_rd, s->stats.nr_mmio_buffered_wr);
+        printk("    nr_mmio_unbuffered:%8u %8u\n", s->stats.nr_mmio_unbuffered_rd, s->stats.nr_mmio_unbuffered_wr);
+        printk("    nr_pio_buffered:   %8u %8u\n", s->stats.nr_pio_buffered_rd, s->stats.nr_pio_buffered_wr);
+        printk("    nr_pio_unbuffered: %8u %8u\n", s->stats.nr_pio_unbuffered_rd, s->stats.nr_pio_unbuffered_wr);
+
+        stdvga_dump_regs("sr", s->sr, sizeof(s->sr));
+        stdvga_dump_regs("gr", s->gr, sizeof(s->gr));
+        memset(&s->stats, 0, sizeof(s->stats));
+
+        spin_unlock(&s->lock);
+    }
+
+    rcu_read_unlock(&domlist_read_lock);
+}
+
+#include <xen/keyhandler.h>
+/* Initcall: bind the '<' key to stdvga_stats_dump(); always returns 0. */
+static int __init setup_stdvga_stats_dump(void)
+{
+    register_keyhandler('<', stdvga_stats_dump, "dump stdvga stats");
+    return 0;
+}
+
+__initcall(setup_stdvga_stats_dump);
+
+#endif
+
diff -r 118a21c66fd5 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/include/asm-x86/hvm/domain.h	Wed Oct 24 17:31:57 2007 -0400
@@ -51,6 +51,7 @@ struct hvm_domain {
     struct hvm_irq         irq;
     struct hvm_hw_vpic     vpic[2]; /* 0=master; 1=slave */
     struct hvm_vioapic    *vioapic;
+    struct hvm_hw_stdvga   stdvga;
 
     /* hvm_print_line() logging. */
     char                   pbuf[80];
diff -r 118a21c66fd5 xen/include/asm-x86/hvm/io.h
--- a/xen/include/asm-x86/hvm/io.h	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/include/asm-x86/hvm/io.h	Wed Oct 24 17:31:57 2007 -0400
@@ -80,10 +80,11 @@ struct hvm_io_op {
     struct cpu_user_regs    io_context; /* current context */
 };
 
-#define MAX_IO_HANDLER              9
+#define MAX_IO_HANDLER             12
 
 #define HVM_PORTIO                  0
 #define HVM_MMIO                    1
+#define HVM_BUFFERED_IO             2
 
 typedef int (*intercept_action_t)(ioreq_t *);
 typedef unsigned long (*hvm_mmio_read_t)(struct vcpu *v,
@@ -126,15 +127,26 @@ static inline int hvm_portio_intercept(i
     return hvm_io_intercept(p, HVM_PORTIO);
 }
 
+static inline int hvm_buffered_io_intercept(ioreq_t *p)
+{
+    return hvm_io_intercept(p, HVM_BUFFERED_IO);
+}
+
 extern int hvm_mmio_intercept(ioreq_t *p);
 extern int hvm_buffered_io_send(ioreq_t *p);
-extern int hvm_buffered_io_intercept(ioreq_t *p);
 
 static inline int register_portio_handler(
     struct domain *d, unsigned long addr,
     unsigned long size, intercept_action_t action)
 {
     return register_io_handler(d, addr, size, action, HVM_PORTIO);
+}
+
+static inline int register_buffered_io_handler(
+    struct domain *d, unsigned long addr,
+    unsigned long size, intercept_action_t action)
+{
+    return register_io_handler(d, addr, size, action, HVM_BUFFERED_IO);
 }
 
 #if defined(__i386__) || defined(__x86_64__)
@@ -154,5 +166,38 @@ extern void hvm_dpci_eoi(struct domain *
 extern void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq,
                          union vioapic_redir_entry *ent);
 
+
+#undef  STDVGA_STATS /* #define to enable stdvga statistics */
+#undef  STDVGA_CHECK /* debug: ensure cached value matches qemu value */
+
+struct hvm_hw_stdvga {
+    uint8_t sr_index;
+    uint8_t sr[0x18];
+    uint8_t gr_index;
+    uint8_t gr[256];
+    uint32_t latch;
+    int stdvga;
+    int cache;
+    uint8_t *vram_ptr[64];  /* shadow of 0xa0000-0xaffff */
+    spinlock_t lock;
+    
+#ifdef STDVGA_STATS
+    struct {
+        uint32_t nr_mmio_buffered_rd;
+        uint32_t nr_mmio_buffered_wr;
+        uint32_t nr_mmio_unbuffered_rd;
+        uint32_t nr_mmio_unbuffered_wr;
+        uint32_t nr_pio_buffered_rd;
+        uint32_t nr_pio_buffered_wr;
+        uint32_t nr_pio_unbuffered_rd;
+        uint32_t nr_pio_unbuffered_wr;
+    } stats;
+#endif
+};
+
+extern void stdvga_init(struct domain *d);
+extern void stdvga_deinit(struct domain *d);
+extern void stdvga_check_cached_value(ioreq_t *p);
+
 #endif /* __ASM_X86_HVM_IO_H__ */
 
diff -r 118a21c66fd5 xen/include/public/hvm/ioreq.h
--- a/xen/include/public/hvm/ioreq.h	Mon Oct 22 21:06:11 2007 +0100
+++ b/xen/include/public/hvm/ioreq.h	Wed Oct 24 17:31:57 2007 -0400
@@ -77,13 +77,26 @@ struct shared_iopage {
 };
 typedef struct shared_iopage shared_iopage_t;
 
-#define IOREQ_BUFFER_SLOT_NUM     80
+#pragma pack(push,2)
+
+struct buf_ioreq {
+    uint8_t  type;   /*  I/O type                    */
+    uint8_t  dir:1;  /*  1=read, 0=write             */
+    uint8_t  size:2; /*  0=>1, 1=>2, 3=>8. If 8 then use two contig buf_ioreqs */
+    uint32_t addr:20; /*  physical address or high-order data */
+    uint16_t data;   /*  (low order) data            */
+};
+typedef struct buf_ioreq buf_ioreq_t;
+
+#define IOREQ_BUFFER_SLOT_NUM     672
 struct buffered_iopage {
-    unsigned int    read_pointer;
-    unsigned int    write_pointer;
-    ioreq_t         ioreq[IOREQ_BUFFER_SLOT_NUM];
+    volatile unsigned int read_pointer;
+    volatile unsigned int write_pointer;
+    buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
 }; /* NB. Size of this structure must be no greater than one page. */
 typedef struct buffered_iopage buffered_iopage_t;
+
+#pragma pack(pop)
 
 #if defined(__ia64__)
 struct pio_buffer {

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-24 21:36 [PATCH] Std VGA Performance Ben Guthro
@ 2007-10-25 14:14 ` Keir Fraser
  2007-10-25 15:28   ` Robert Phillips
  2007-10-29 18:48 ` Alex Williamson
  1 sibling, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2007-10-25 14:14 UTC (permalink / raw)
  To: Ben Guthro, xen-devel; +Cc: Robert Phillips

On 24/10/07 22:36, "Ben Guthro" <bguthro@virtualiron.com> wrote:

> This patch improves the performance of Standard VGA,
> the mode used during Windows boot and by the Linux
> splash screen.
> 
> It does so by buffering all the stdvga programmed output ops
> and memory mapped ops (both reads and writes) that are sent to QEMU.

How much benefit comes from immediate servicing of PIO input ops versus the
massive increase in buffered-io slots? Removing the former optimisation
would certainly make the patch a lot smaller!

What happens across save/restore? The hypervisor's state cache will go away,
won't it? I suppose it's okay if the guest is in SVGA LFB mode at that point
(actually, that's another thing - do you correctly handle hand-off between
VGA and SVGA modes), but I don't know that we want to rely on that.

 -- Keir

> We maintain locally essential VGA state so we can respond
> immediately to input and read ops without waiting for
> QEMU.  We snoop output and write ops to keep our state
> up-to-date.
> 
> PIO input ops are satisfied from cached state without
> bothering QEMU.
> 
> PIO output and mmio ops are passed through to QEMU, including
> mmio read ops.  This is necessary because mmio reads
> can have side effects.
> 
> I have changed the format of the buffered_iopage.
> It used to contain 80 elements of type ioreq_t (48 bytes each).
> Now it contains 672 elements of type buf_ioreq_t (6 bytes each).
> Being able to pipeline 8 times as many ops improves
> VGA performance by a factor of 8.
> 
> I changed hvm_buffered_io_intercept to use the same
> registration and callback mechanism as hvm_portio_intercept
> rather than the hacky hardcoding it used before.
> 
> In platform.c, I fixed send_timeoffset_req() to sets its
> ioreq size to 8 (rather than 4), and its count to 1 (which
> was missing).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-25 14:14 ` Keir Fraser
@ 2007-10-25 15:28   ` Robert Phillips
  2007-10-25 15:39     ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Robert Phillips @ 2007-10-25 15:28 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1.1: Type: text/plain, Size: 3888 bytes --]

Good questions, Keir.  Answers below:

On 10/25/07, Keir Fraser <Keir.Fraser@cl.cam.ac.uk> wrote:
>
> On 24/10/07 22:36, "Ben Guthro" <bguthro@virtualiron.com> wrote:
>
> > This patch improves the performance of Standard VGA,
> > the mode used during Windows boot and by the Linux
> > splash screen.
> >
> > It does so by buffering all the stdvga programmed output ops
> > and memory mapped ops (both reads and writes) that are sent to QEMU.
>
> How much benefit comes from immediate servicing of PIO input ops versus
> the
> massive increase in buffered-io slots? Removing the former optimisation
> would certainly make the patch a lot smaller!

Subjectively, the performance improvement appears substantial.  We have
tested the code with the stdvga emulation and with and without the increased
number of slots. With more slots the screen painting goes from being fast to
very fast.

As you've noticed, the increase in number of slots is compensated by the
decrease in slot size (so there is no increase in memory use) at the cost of
packing (and unpacking) ioreqs as they are written to (and read from) the
buffer.

What happens across save/restore? The hypervisor's state cache will go away,
> won't it? I suppose it's okay if the guest is in SVGA LFB mode at that
> point
> (actually, that's another thing - do you correctly handle hand-off between
> VGA and SVGA modes), but I don't know that we want to rely on that.

This hasn't been a problem in practice.  The guest quickly switches from
VGA to SVGA mode causing the stdvga code to be largely inactive,
and we have only seen it switch back when the guest blue-screens.
The stdvga code detects that transition correctly and paints the blue-screen
quickly.

After a restore, the code assumes it is not in standard VGA mode so is
largely inactive.
That conservative assumption might not be optimal but it is correct.

I don't believe the failure to save/restore the stdvga cache will prove
problematic but
if it becomes so I will add corrective code.

One might ask (and we did) what is the point of all this VGA emulation code
when it is only
active during the boot process (or during blue-screen painting).

The answer is that one wants the user's first experience with Xen to be
positive;
as watching an excruciatingly slow Windows boot screen or Linux splash panel
is not.

-- rsp

-- Keir
>
> > We maintain locally essential VGA state so we can respond
> > immediately to input and read ops without waiting for
> > QEMU.  We snoop output and write ops to keep our state
> > up-to-date.
> >
> > PIO input ops are satisfied from cached state without
> > bothering QEMU.
> >
> > PIO output and mmio ops are passed through to QEMU, including
> > mmio read ops.  This is necessary because mmio reads
> > can have side effects.
> >
> > I have changed the format of the buffered_iopage.
> > It used to contain 80 elements of type ioreq_t (48 bytes each).
> > Now it contains 672 elements of type buf_ioreq_t (6 bytes each).
> > Being able to pipeline 8 times as many ops improves
> > VGA performance by a factor of 8.
> >
> > I changed hvm_buffered_io_intercept to use the same
> > registration and callback mechanism as hvm_portio_intercept
> > rather than the hacky hardcoding it used before.
> >
> > In platform.c, I fixed send_timeoffset_req() to sets its
> > ioreq size to 8 (rather than 4), and its count to 1 (which
> > was missing).
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

-- 
--------------------------------------------------------------------
Robert S. Phillips                          Virtual Iron Software
rphillips@virtualiron.com                Tower 1, Floor 2
978-849-1220                                 900 Chelmsford Street
                                                    Lowell, MA 01851

[-- Attachment #1.2: Type: text/html, Size: 5681 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-25 15:28   ` Robert Phillips
@ 2007-10-25 15:39     ` Keir Fraser
  2007-10-25 17:31       ` Robert Phillips
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2007-10-25 15:39 UTC (permalink / raw)
  To: Robert Phillips; +Cc: xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 1001 bytes --]

On 25/10/07 16:28, "Robert Phillips" <rsp.vi.xen@gmail.com> wrote:

>> How much benefit comes from immediate servicing of PIO input ops versus the
>> massive increase in buffered-io slots? Removing the former optimisation
>> would certainly make the patch a lot smaller!
> 
> Subjectively, the performance improvement appears substantial.  We have tested
> the code with the stdvga emulation and with and without the increased number
> of slots. With more slots the screen painting goes from being fast to very
> fast.
> 
> As you've noticed, the increase in number of slots is compensated by the
> decrease in slot size (so there is no increase in memory use) at the cost of
> packing (and unpacking) ioreqs as they are written to (and read from) the
> buffer.

I guess what I'm really interested in is the performance /with/ the
increased number of slots and with versus without the stdvga emulation.
Since it's the stdvga emulation that really adds the complexity.

 -- Keir


[-- Attachment #1.2: Type: text/html, Size: 1554 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-25 15:39     ` Keir Fraser
@ 2007-10-25 17:31       ` Robert Phillips
  0 siblings, 0 replies; 12+ messages in thread
From: Robert Phillips @ 2007-10-25 17:31 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 2162 bytes --]

The performance is poor with increased slots and without emulation.

As presented the emulation code treats the buffered-io slots as an
asynchronous queue.  The stdvga emulator pushes ioreqs into the queue but
need not wait for any response because it can satisfy read requests
locally.  (The only time it must wait is when the queue becomes full.)

Without the emulation, the code must block on each read (of which there are
many) waiting for QEMU to provide an answer.  This really slows things down
and renders the buffer largely useless.  I don't believe it ever gets full;
there are never enough consecutive writes to fill it.

With both increased slots and emulation, the performance feels so very much
better.  Like taking a stone out of your shoe.  :-)

-- rsp

On 10/25/07, Keir Fraser <Keir.Fraser@cl.cam.ac.uk> wrote:
>
>  On 25/10/07 16:28, "Robert Phillips" <rsp.vi.xen@gmail.com> wrote:
>
> How much benefit comes from immediate servicing of PIO input ops versus
> the
> massive increase in buffered-io slots? Removing the former optimisation
> would certainly make the patch a lot smaller!
>
>
> Subjectively, the performance improvement appears substantial.  We have
> tested the code with the stdvga emulation and with and without the increased
> number of slots. With more slots the screen painting goes from being fast to
> very fast.
>
> As you've noticed, the increase in number of slots is compensated by the
> decrease in slot size (so there is no increase in memory use) at the cost of
> packing (and unpacking) ioreqs as they are written to (and read from) the
> buffer.
>
>
> I guess what I'm really interested in is the performance /with/ the
> increased number of slots and with versus without the stdvga emulation.
> Since it's the stdvga emulation that really adds the complexity.
>
>  -- Keir
>



-- 
--------------------------------------------------------------------
Robert S. Phillips                          Virtual Iron Software
rphillips@virtualiron.com                Tower 1, Floor 2
978-849-1220                                 900 Chelmsford Street
                                                    Lowell, MA 01851

[-- Attachment #1.2: Type: text/html, Size: 3802 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-24 21:36 [PATCH] Std VGA Performance Ben Guthro
  2007-10-25 14:14 ` Keir Fraser
@ 2007-10-29 18:48 ` Alex Williamson
  2007-10-29 19:17   ` Keir Fraser
  1 sibling, 1 reply; 12+ messages in thread
From: Alex Williamson @ 2007-10-29 18:48 UTC (permalink / raw)
  To: Ben Guthro; +Cc: xen-devel, Robert Phillips, xen-ia64-devel

[-- Attachment #1: Type: text/plain, Size: 3699 bytes --]

On Wed, 2007-10-24 at 17:36 -0400, Ben Guthro wrote:
> This patch improves the performance of Standard VGA,
> the mode used during Windows boot and by the Linux
> splash screen.

Hi,

   ia64 uses VGA too.  I've been able to regain some functionality with
the patch below, but the VGA modes used by our firmware still have
significant issues (once we boot to Linux userspace, VGA text mode gets
readable).  It seems like perhaps we've lost support for some basic text
VGA modes.  I haven't tried to understand the changes in qemu yet, but
are we sacrificing compatibility for performance?  Patch and screen shot
below.  Thanks,

	Alex

PS - for xen-ia64-devel, both the Open Source and Intel GFW have issues
with EFI text mode w/ this patch (use EFI shell to see it on Intel GFW).

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
---

diff -r 4034317507de xen/arch/ia64/vmx/mmio.c
--- a/xen/arch/ia64/vmx/mmio.c	Mon Oct 29 16:49:02 2007 +0000
+++ b/xen/arch/ia64/vmx/mmio.c	Mon Oct 29 12:29:18 2007 -0600
@@ -56,10 +56,12 @@ static int hvm_buffered_io_intercept(ior
 {
     struct vcpu *v = current;
     spinlock_t  *buffered_io_lock;
-    buffered_iopage_t *buffered_iopage =
+    buffered_iopage_t *pg =
         (buffered_iopage_t *)(v->domain->arch.hvm_domain.buffered_io_va);
-    unsigned long tmp_write_pointer = 0;
     int i;
+    buf_ioreq_t bp;
+    /* Timeoffset sends 64b data, but no address.  Use two consecutive slots. */
+    int qw = 0;
 
     /* ignore READ ioreq_t! */
     if ( p->dir == IOREQ_READ )
@@ -75,11 +77,41 @@ static int hvm_buffered_io_intercept(ior
     if ( i == HVM_BUFFERED_IO_RANGE_NR )
         return 0;
 
+    /* Return 0 for the cases we can't deal with. */
+    if ( p->addr > 0xffffful || p->data_is_ptr || p->df || p->count != 1 )
+        return 0;
+
+    bp.type = p->type;
+    bp.dir = p->dir;
+    switch (p->size) {
+    case 1:
+        bp.size = 0;
+        break;
+    case 2:
+        bp.size = 1;
+        break;
+    case 4:
+        bp.size = 2;
+        break;
+    case 8:
+        bp.size = 3;
+        qw = 1;
+        gdprintk(XENLOG_INFO, "quadword ioreq type:%d data:%"PRIx64"\n",
+                 p->type, p->data);
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "unexpected ioreq size:%"PRId64"\n", p->size);
+        return 0;
+    }
+
+    bp.data = p->data;
+    bp.addr = qw ? ((p->data >> 16) & 0xfffful) : (p->addr & 0xffffful);
+
     buffered_io_lock = &v->domain->arch.hvm_domain.buffered_io_lock;
     spin_lock(buffered_io_lock);
 
-    if ( buffered_iopage->write_pointer - buffered_iopage->read_pointer ==
-         (unsigned long)IOREQ_BUFFER_SLOT_NUM ) {
+    if ( pg->write_pointer - pg->read_pointer >=
+         (unsigned long)IOREQ_BUFFER_SLOT_NUM - (qw ? 1 : 0) ) {
         /* the queue is full.
          * send the iopacket through the normal path.
          * NOTE: The arithimetic operation could handle the situation for
@@ -89,13 +121,19 @@ static int hvm_buffered_io_intercept(ior
         return 0;
     }
 
-    tmp_write_pointer = buffered_iopage->write_pointer % IOREQ_BUFFER_SLOT_NUM;
-
-    memcpy(&buffered_iopage->ioreq[tmp_write_pointer], p, sizeof(ioreq_t));
+    memcpy(&pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM],
+           &bp, sizeof(bp));
+
+    if (qw) {
+        bp.data = p->data >> 32;
+        bp.addr = (p->data >> 48) & 0xfffful;
+        memcpy(&pg->buf_ioreq[(pg->write_pointer + 1) % IOREQ_BUFFER_SLOT_NUM],
+               &bp, sizeof(bp));
+    }
 
     /*make the ioreq_t visible before write_pointer*/
     wmb();
-    buffered_iopage->write_pointer++;
+    pg->write_pointer += qw ? 2 : 1;
 
     spin_unlock(buffered_io_lock);
 


[-- Attachment #2: corrupted_gfx.png --]
[-- Type: image/png, Size: 2772 bytes --]

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Std VGA Performance
  2007-10-29 18:48 ` Alex Williamson
@ 2007-10-29 19:17   ` Keir Fraser
  2007-10-30 16:19     ` [PATCH] " Alex Williamson
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2007-10-29 19:17 UTC (permalink / raw)
  To: Alex Williamson, Ben Guthro; +Cc: xen-devel, Robert Phillips, xen-ia64-devel

On 29/10/07 18:48, "Alex Williamson" <alex.williamson@hp.com> wrote:

> On Wed, 2007-10-24 at 17:36 -0400, Ben Guthro wrote:
>> This patch improves the performance of Standard VGA,
>> the mode used during Windows boot and by the Linux
>> splash screen.
> 
> Hi,
> 
>    ia64 uses VGA too.  I've been able to regain some functionality with
> the patch below, but the VGA modes used by our firmware still have
> significant issues (once we boot to Linux userspace, VGA text mode gets
> readable).  It seems like perhaps we've lost support for some basic text
> VGA modes.  I haven't tried to understand the changes in qemu yet, but
> are we sacrificing compatibility for performance?  Patch and screen shot
> below.  Thanks,

All that's changed for ia64 is the definition of the buffered ioreq
structure, which has become more densely packed. All the rest of the
acceleration is (currently) x86-specific. So this shouldn't be too hard to
track down...

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] Re: [PATCH] Std VGA Performance
  2007-10-29 19:17   ` Keir Fraser
@ 2007-10-30 16:19     ` Alex Williamson
  2007-10-30 16:24       ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Williamson @ 2007-10-30 16:19 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Robert Phillips, Ben Guthro, xen-ia64-devel

On Mon, 2007-10-29 at 19:17 +0000, Keir Fraser wrote:
> 
> All that's changed for ia64 is the definition of the buffered ioreq
> structure, which has become more densely packed. All the rest of the
> acceleration is (currently) x86-specific. So this shouldn't be too hard to
> track down...

   Yes, you're right, but easy to overlook, and I'm not sure how it
works on x86.  I copied the x86 code for filling in the buffered ioreq,
but failed to notice that it attempts to store 4 bytes of data into a 2
byte field...  The comment for the size entry in buf_ioreq could be
interpreted that only 1, 2, and 8 bytes are expected, but I definitely
see 4 bytes on occasion.  I'd guess x86 has a bug here that's simply not
exposed because of the 16bit code that's probably being used to
initialize VGA.  I also question the 8 byte support, which is why I
skipped it in the patch below.  Wouldn't an 8 byte MMIO access that
isn't a timeoffset be possible?  Keir, please apply this to the staging
tree.  Thanks,

	Alex

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
--

diff -r 4034317507de xen/arch/ia64/vmx/mmio.c
--- a/xen/arch/ia64/vmx/mmio.c	Mon Oct 29 16:49:02 2007 +0000
+++ b/xen/arch/ia64/vmx/mmio.c	Tue Oct 30 10:03:42 2007 -0600
@@ -55,53 +55,68 @@ static int hvm_buffered_io_intercept(ior
 static int hvm_buffered_io_intercept(ioreq_t *p)
 {
     struct vcpu *v = current;
-    spinlock_t  *buffered_io_lock;
-    buffered_iopage_t *buffered_iopage =
+    buffered_iopage_t *pg =
         (buffered_iopage_t *)(v->domain->arch.hvm_domain.buffered_io_va);
-    unsigned long tmp_write_pointer = 0;
+    buf_ioreq_t bp;
     int i;
 
+    /* Ensure buffered_iopage fits in a page */
+    BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
+
     /* ignore READ ioreq_t! */
-    if ( p->dir == IOREQ_READ )
-        return 0;
-
-    for ( i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++ ) {
-        if ( p->addr >= hvm_buffered_io_ranges[i]->start_addr &&
-             p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr +
-                                     hvm_buffered_io_ranges[i]->length )
+    if (p->dir == IOREQ_READ)
+        return 0;
+
+    for (i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++) {
+        if (p->addr >= hvm_buffered_io_ranges[i]->start_addr &&
+            p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr +
+                                    hvm_buffered_io_ranges[i]->length)
             break;
     }
 
-    if ( i == HVM_BUFFERED_IO_RANGE_NR )
-        return 0;
-
-    buffered_io_lock = &v->domain->arch.hvm_domain.buffered_io_lock;
-    spin_lock(buffered_io_lock);
-
-    if ( buffered_iopage->write_pointer - buffered_iopage->read_pointer ==
-         (unsigned long)IOREQ_BUFFER_SLOT_NUM ) {
+    if (i == HVM_BUFFERED_IO_RANGE_NR)
+        return 0;
+
+    bp.type = p->type;
+    bp.dir = p->dir;
+    switch (p->size) {
+    case 1:
+        bp.size = 0;
+        break;
+    case 2:
+        bp.size = 1;
+        break;
+    default:
+	/* Could use quad word semantics, but it only appears
+	 * to be useful for timeoffset data. */
+        return 0;
+    }
+    bp.data = (uint16_t)p->data;
+    bp.addr = (uint32_t)p->addr;
+
+    spin_lock(&v->domain->arch.hvm_domain.buffered_io_lock);
+
+    if (pg->write_pointer - pg->read_pointer == IOREQ_BUFFER_SLOT_NUM) {
         /* the queue is full.
          * send the iopacket through the normal path.
          * NOTE: The arithimetic operation could handle the situation for
          * write_pointer overflow.
          */
-        spin_unlock(buffered_io_lock);
-        return 0;
-    }
-
-    tmp_write_pointer = buffered_iopage->write_pointer % IOREQ_BUFFER_SLOT_NUM;
-
-    memcpy(&buffered_iopage->ioreq[tmp_write_pointer], p, sizeof(ioreq_t));
-
-    /*make the ioreq_t visible before write_pointer*/
+        spin_unlock(&v->domain->arch.hvm_domain.buffered_io_lock);
+        return 0;
+    }
+
+    memcpy(&pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM],
+           &bp, sizeof(bp));
+
+    /* Make the ioreq_t visible before write_pointer */
     wmb();
-    buffered_iopage->write_pointer++;
-
-    spin_unlock(buffered_io_lock);
+    pg->write_pointer++;
+
+    spin_unlock(&v->domain->arch.hvm_domain.buffered_io_lock);
 
     return 1;
 }
-
 
 static void low_mmio_access(VCPU *vcpu, u64 pa, u64 *val, size_t s, int dir)
 {
@@ -110,32 +125,36 @@ static void low_mmio_access(VCPU *vcpu, 
     ioreq_t *p;
 
     vio = get_vio(v->domain, v->vcpu_id);
-    if (vio == 0) {
-        panic_domain(NULL,"bad shared page: %lx", (unsigned long)vio);
-    }
+    if (!vio)
+        panic_domain(NULL, "bad shared page");
+
     p = &vio->vp_ioreq;
+
     p->addr = pa;
     p->size = s;
     p->count = 1;
+    if (dir == IOREQ_WRITE)
+        p->data = *val;
+    else
+        p->data = 0;
+    p->data_is_ptr = 0;
     p->dir = dir;
-    if (dir==IOREQ_WRITE)     // write;
-        p->data = *val;
-    else if (dir == IOREQ_READ)
-        p->data = 0;          // clear all bits
-    p->data_is_ptr = 0;
+    p->df = 0;
     p->type = 1;
-    p->df = 0;
 
     p->io_count++;
+
     if (hvm_buffered_io_intercept(p)) {
         p->state = STATE_IORESP_READY;
         vmx_io_assist(v);
-        return;
-    } else 
-        vmx_send_assist_req(v);
-    if (dir == IOREQ_READ) { // read
+        if (dir != IOREQ_READ)
+            return;
+    }
+
+    vmx_send_assist_req(v);
+    if (dir == IOREQ_READ)
         *val = p->data;
-    }
+
     return;
 }
 
@@ -227,16 +246,18 @@ static void legacy_io_access(VCPU *vcpu,
     ioreq_t *p;
 
     vio = get_vio(v->domain, v->vcpu_id);
-    if (vio == 0) {
-        panic_domain(NULL,"bad shared page\n");
-    }
+    if (!vio)
+        panic_domain(NULL, "bad shared page\n");
+
     p = &vio->vp_ioreq;
-    p->addr = TO_LEGACY_IO(pa&0x3ffffffUL);
+    p->addr = TO_LEGACY_IO(pa & 0x3ffffffUL);
     p->size = s;
     p->count = 1;
     p->dir = dir;
-    if (dir == IOREQ_WRITE)     // write;
+    if (dir == IOREQ_WRITE)
         p->data = *val;
+    else
+        p->data = 0;
     p->data_is_ptr = 0;
     p->type = 0;
     p->df = 0;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Re: [PATCH] Std VGA Performance
  2007-10-30 16:19     ` [PATCH] " Alex Williamson
@ 2007-10-30 16:24       ` Keir Fraser
  2007-10-30 16:40         ` Alex Williamson
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2007-10-30 16:24 UTC (permalink / raw)
  To: Alex Williamson; +Cc: xen-devel, Robert Phillips, Ben Guthro, xen-ia64-devel


Yeah, hopefully that's a bug in the comment. I would expect 4-byte accesses
to be possible and be handled. As for 8-byte accesses, they can certainly
happen, why not? Unlikely at start of day, but once we're in x86/64 mode
there's no reason why not.

 -- Keir

On 30/10/07 16:19, "Alex Williamson" <alex.williamson@hp.com> wrote:

>    Yes, you're right, but easy to overlook, and I'm not sure how it
> works on x86.  I copied the x86 code for filling in the buffered ioreq,
> but failed to notice that it attempts to store 4 bytes of data into a 2
> byte field...  The comment for the size entry in buf_ioreq could be
> interpreted that only 1, 2, and 8 bytes are expected, but I definitely
> see 4 bytes on occasion.  I'd guess x86 has a bug here that's simply not
> exposed because of the 16bit code that's probably being used to
> initialize VGA.  I also question the 8 byte support, which is why I
> skipped it in the patch below.  Wouldn't an 8 byte MMIO access that
> isn't a timeoffset be possible?  Keir, please apply this to the staging
> tree.  Thanks,

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Re: [PATCH] Std VGA Performance
  2007-10-30 16:24       ` Keir Fraser
@ 2007-10-30 16:40         ` Alex Williamson
  2007-10-30 17:02           ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Williamson @ 2007-10-30 16:40 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Robert Phillips, Ben Guthro, xen-ia64-devel


On Tue, 2007-10-30 at 16:24 +0000, Keir Fraser wrote:
> Yeah, hopefully that's a bug in the comment. I would expect 4-byte accesses
> to be possible and be handled. As for 8-byte accesses, they can certainly
> happen, why not? Unlikely at start of day, but once we're in x86/64 mode
> there's no reason why not.

   Right, and it seems that the "quadword" handling is quite specific to
timeoffset.  It takes advantage of the fact that there's no address
and stuffs some of the data in there.  So, I think both 4 & 8 byte
buffered mmio is likely broken right now on x86.  Thanks,

	Alex

-- 
Alex Williamson                             HP Open Source & Linux Org.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Re: [PATCH] Std VGA Performance
  2007-10-30 16:40         ` Alex Williamson
@ 2007-10-30 17:02           ` Keir Fraser
  2007-10-31 19:28             ` Robert Phillips
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2007-10-30 17:02 UTC (permalink / raw)
  To: Alex Williamson; +Cc: xen-devel, Robert Phillips, Ben Guthro, xen-ia64-devel

On 30/10/07 16:40, "Alex Williamson" <alex.williamson@hp.com> wrote:

> On Tue, 2007-10-30 at 16:24 +0000, Keir Fraser wrote:
>> Yeah, hopefully that's a bug in the comment. I would expect 4-byte accesses
>> to be possible and be handled. As for 8-byte accesses, they can certainly
>> happen, why not? Unlikely at start of day, but once we're in x86/64 mode
>> there's no reason why not.
> 
>    Right, and it seems that the "quadword" handling is quite specific to
> timeoffset.  It takes advantage of the fact that that there's no address
> and stuff some of the data in there.  So, I think both 4 & 8 byte
> buffered mmio is likely broken right now on x86.  Thanks,

I guess we'll see how testing goes over the next little while. If the
bufioreq changes prove to be broken we can back them out before 3.2.0, or
better yet fix them ;-).

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Re: [PATCH] Std VGA Performance
  2007-10-30 17:02           ` Keir Fraser
@ 2007-10-31 19:28             ` Robert Phillips
  0 siblings, 0 replies; 12+ messages in thread
From: Robert Phillips @ 2007-10-31 19:28 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Ben Guthro, Alex Williamson, xen-ia64-devel


[-- Attachment #1.1: Type: text/plain, Size: 1081 bytes --]

Alex is correct;  there is a bug with size=32 operations.  The fix is
simple.  We'll submit an updated patch very soon.
-- rsp

On 10/30/07, Keir Fraser <Keir.Fraser@cl.cam.ac.uk> wrote:
>
> On 30/10/07 16:40, "Alex Williamson" <alex.williamson@hp.com> wrote:
>
> > On Tue, 2007-10-30 at 16:24 +0000, Keir Fraser wrote:
> >> Yeah, hopefully that's a bug in the comment. I would expect 4-byte
> accesses
> >> to be possible and be handled. As for 8-byte accesses, they can
> certainly
> >> happen, why not? Unlikely at start of day, but once we're in x86/64
> mode
> >> there's no reason why not.
> >
> >    Right, and it seems that the "quadword" handling is quite specific to
> > > timeoffset.  It takes advantage of the fact that there's no address
> > > and stuffs some of the data in there.  So, I think both 4 & 8 byte
> > buffered mmio is likely broken right now on x86.  Thanks,
>
> I guess we'll see how testing goes over the next little while. If the
> bufioreq changes prove to be broken we can back them out before 3.2.0, or
> better yet fix them ;-).
>
> -- Keir
>
>
>

[-- Attachment #1.2: Type: text/html, Size: 1737 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2007-10-31 19:28 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-24 21:36 [PATCH] Std VGA Performance Ben Guthro
2007-10-25 14:14 ` Keir Fraser
2007-10-25 15:28   ` Robert Phillips
2007-10-25 15:39     ` Keir Fraser
2007-10-25 17:31       ` Robert Phillips
2007-10-29 18:48 ` Alex Williamson
2007-10-29 19:17   ` Keir Fraser
2007-10-30 16:19     ` [PATCH] " Alex Williamson
2007-10-30 16:24       ` Keir Fraser
2007-10-30 16:40         ` Alex Williamson
2007-10-30 17:02           ` Keir Fraser
2007-10-31 19:28             ` Robert Phillips

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.