* [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment @ 2010-07-11 18:09 Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 1/5] qemu_ram_map/unmap: Allow pre-allocated space to be mapped Alex Williamson ` (5 more replies) 0 siblings, 6 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst The following series implements QEMU device assignment using the proposed VFIO/UIOMMU kernel interfaces. See the last patch for further vfio description. I've tested this on the v2 VFIO patch, with a number of fixes hacked in along the way. I'll update when Tom releases a new version of VFIO. Hopefully this will help demonstrate the usefulness of such an interface. Thanks, Alex --- Alex Williamson (5): VFIO based device assignment APIC/IOAPIC EOI callback RAM API: Make use of it for x86 PC Minimal RAM API support qemu_ram_map/unmap: Allow pre-allocated space to be mapped Makefile.target | 2 cpu-common.h | 5 exec.c | 56 ++ hw/apic.c | 18 + hw/apic.h | 4 hw/ioapic.c | 29 + hw/linux-vfio.h | 200 ++++++++ hw/pc.c | 12 - hw/pc.h | 12 - hw/vfio.c | 1295 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio.h | 90 ++++ memory.c | 77 +++ memory.h | 23 + 13 files changed, 1812 insertions(+), 11 deletions(-) create mode 100644 hw/linux-vfio.h create mode 100644 hw/vfio.c create mode 100644 hw/vfio.h create mode 100644 memory.c create mode 100644 memory.h ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] [RFC PATCH 1/5] qemu_ram_map/unmap: Allow pre-allocated space to be mapped 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson @ 2010-07-11 18:09 ` Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support Alex Williamson ` (4 subsequent siblings) 5 siblings, 0 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst For assigned devices, we want to map the PCI BARs into the VM address space. This is just like mapping an option ROM except the host backing is an mmap area instead of a chunk of vmalloc memory. This allows registration and removal of such areas. Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- cpu-common.h | 3 +++ exec.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 0 deletions(-) diff --git a/cpu-common.h b/cpu-common.h index 71e7933..8d03f4e 100644 --- a/cpu-common.h +++ b/cpu-common.h @@ -40,7 +40,10 @@ static inline void cpu_register_physical_memory(target_phys_addr_t start_addr, } ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr); +ram_addr_t qemu_ram_map(DeviceState *dev, const char *name, + ram_addr_t size, void *host); ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size); +void qemu_ram_unmap(ram_addr_t addr); void qemu_ram_free(ram_addr_t addr); /* This should only be used for ram local to a device. 
*/ void *qemu_get_ram_ptr(ram_addr_t addr); diff --git a/exec.c b/exec.c index 5420f56..a98b4e9 100644 --- a/exec.c +++ b/exec.c @@ -2808,6 +2808,49 @@ static ram_addr_t last_ram_offset(void) return last; } +ram_addr_t qemu_ram_map(DeviceState *dev, const char *name, + ram_addr_t size, void *host) +{ + RAMBlock *new_block, *block; + + size = TARGET_PAGE_ALIGN(size); + new_block = qemu_mallocz(sizeof(*new_block)); + + if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { + char *id = dev->parent_bus->info->get_dev_path(dev); + if (id) { + snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id); + qemu_free(id); + } + } + pstrcat(new_block->idstr, sizeof(new_block->idstr), name); + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(block->idstr, new_block->idstr)) { + fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", + new_block->idstr); + abort(); + } + } + + new_block->host = host; + + new_block->offset = find_ram_offset(size); + new_block->length = size; + + QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); + + ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty, + last_ram_offset() >> TARGET_PAGE_BITS); + memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS), + 0xff, size >> TARGET_PAGE_BITS); + + if (kvm_enabled()) + kvm_setup_guest_memory(new_block->host, size); + + return new_block->offset; +} + ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) { RAMBlock *new_block, *block; @@ -2874,6 +2917,19 @@ ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) return new_block->offset; } +void qemu_ram_unmap(ram_addr_t addr) +{ + RAMBlock *block; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (addr == block->offset) { + QLIST_REMOVE(block, next); + qemu_free(block); + return; + } + } +} + void qemu_ram_free(ram_addr_t addr) { RAMBlock *block; ^ permalink raw reply related [flat|nested] 32+ messages in thread
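The essence of patch 1 is bookkeeping: qemu_ram_map() takes a host pointer that something else already allocated (here, an mmap of a PCI BAR) and gives it an offset in the flat ram_addr_t space, while qemu_ram_unmap() only unlinks the metadata and leaves the host memory to its owner. A minimal standalone model of that bookkeeping, with simplified names and a fixed page size rather than the real QEMU definitions:

```c
#include <assert.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
typedef unsigned long ram_addr_t;

typedef struct RAMBlock {
    void *host;              /* pre-allocated host memory (e.g. an mmap'd BAR) */
    ram_addr_t offset;       /* offset in the flat ram_addr_t space */
    ram_addr_t length;
    struct RAMBlock *next;
} RAMBlock;

static RAMBlock *ram_blocks;

/* Like the patch: page-align the size and place the block past the
 * end of the last existing block. */
ram_addr_t ram_map(void *host, ram_addr_t size)
{
    RAMBlock *b, *new_block = calloc(1, sizeof(*new_block));
    ram_addr_t last = 0;

    size = (size + PAGE_SIZE - 1) & ~(ram_addr_t)(PAGE_SIZE - 1);
    for (b = ram_blocks; b; b = b->next) {
        if (b->offset + b->length > last)
            last = b->offset + b->length;
    }
    new_block->host = host;
    new_block->offset = last;
    new_block->length = size;
    new_block->next = ram_blocks;
    ram_blocks = new_block;
    return new_block->offset;
}

/* Unmap only frees the metadata; the host memory belongs to the caller.
 * This is the qemu_ram_unmap() vs. qemu_ram_free() distinction. */
void ram_unmap(ram_addr_t addr)
{
    RAMBlock **p, *b;
    for (p = &ram_blocks; (b = *p) != NULL; p = &b->next) {
        if (b->offset == addr) {
            *p = b->next;
            free(b);
            return;
        }
    }
}

/* Translate a ram_addr_t back to a host pointer, as qemu_get_ram_ptr() does. */
void *ram_ptr(ram_addr_t addr)
{
    RAMBlock *b;
    for (b = ram_blocks; b; b = b->next) {
        if (addr >= b->offset && addr < b->offset + b->length)
            return (char *)b->host + (addr - b->offset);
    }
    return NULL;
}
```

The point of the model is that the same RAMBlock list serves both alloc'd and mapped memory; only the provenance of `host` differs.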
* [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 1/5] qemu_ram_map/unmap: Allow pre-allocated space to be mapped Alex Williamson @ 2010-07-11 18:09 ` Alex Williamson 2010-07-11 18:18 ` [Qemu-devel] " Alex Williamson 2010-07-11 18:20 ` Avi Kivity 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 3/5] RAM API: Make use of it for x86 PC Alex Williamson ` (3 subsequent siblings) 5 siblings, 2 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst This adds a minimum chunk of Anthony's RAM API support so that we can identify actual VM RAM versus all the other things that make use of qemu_ram_alloc. Signed-off-by: Alex Williamson <alex.williamson@gmail.com> --- Makefile.target | 1 + cpu-common.h | 2 + memory.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ memory.h | 23 ++++++++++++++++ 4 files changed, 103 insertions(+), 0 deletions(-) create mode 100644 memory.c create mode 100644 memory.h diff --git a/Makefile.target b/Makefile.target index 7489910..0c1b916 100644 --- a/Makefile.target +++ b/Makefile.target @@ -173,6 +173,7 @@ obj-$(CONFIG_VIRTFS) += virtio-9p.o obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o +obj-y += memory.o LIBS+=-lz QEMU_CFLAGS += $(VNC_TLS_CFLAGS) diff --git a/cpu-common.h b/cpu-common.h index 8d03f4e..7e76f9d 100644 --- a/cpu-common.h +++ b/cpu-common.h @@ -23,6 +23,8 @@ /* address in the RAM (different from a physical address) */ typedef unsigned long ram_addr_t; +#include "memory.h" + /* memory API */ typedef void CPUWriteMemoryFunc(void *opaque, target_phys_addr_t addr, uint32_t value); diff --git a/memory.c b/memory.c new file mode 100644 index 0000000..f5f9273 --- /dev/null +++ b/memory.c @@ -0,0 +1,77 @@ +/* + * virtual page mapping and translated 
block handling + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ +#include "memory.h" + +QemuRamSlots ram_slots = { .slots = QLIST_HEAD_INITIALIZER(ram_slots) }; + +static QemuRamSlot *qemu_ram_find_slot(target_phys_addr_t start_addr, + ram_addr_t size) +{ + QemuRamSlot *slot; + + QLIST_FOREACH(slot, &ram_slots.slots, next) { + if (slot->start_addr == start_addr && slot->size == size) { + return slot; + } + + if ((start_addr - slot->start_addr < slot->size) || + (start_addr + size - slot->start_addr < slot->size)) + abort(); + } + + return NULL; +} + +void qemu_ram_register(target_phys_addr_t start_addr, ram_addr_t size, + ram_addr_t phys_offset) +{ + QemuRamSlot *slot; + + if (!size) + return; + + slot = qemu_ram_find_slot(start_addr, size); + assert(slot == NULL); + + slot = qemu_mallocz(sizeof(QemuRamSlot)); + + slot->start_addr = start_addr; + slot->size = size; + slot->offset = phys_offset; + + QLIST_INSERT_HEAD(&ram_slots.slots, slot, next); + + cpu_register_physical_memory(slot->start_addr, slot->size, slot->offset); +} + +void qemu_ram_unregister(target_phys_addr_t start_addr, ram_addr_t size) +{ + QemuRamSlot *slot; + + if (!size) + return; + + slot = qemu_ram_find_slot(start_addr, size); + assert(slot != NULL); + + QLIST_REMOVE(slot, next); + cpu_register_physical_memory(start_addr, size, 
IO_MEM_UNASSIGNED); + + return; +} diff --git a/memory.h b/memory.h new file mode 100644 index 0000000..91e552e --- /dev/null +++ b/memory.h @@ -0,0 +1,23 @@ +#ifndef QEMU_MEMORY_H +#define QEMU_MEMORY_H + +#include "qemu-common.h" +#include "cpu-common.h" + +typedef struct QemuRamSlot { + target_phys_addr_t start_addr; + ram_addr_t size; + ram_addr_t offset; + void *host; + QLIST_ENTRY(QemuRamSlot) next; +} QemuRamSlot; + +typedef struct QemuRamSlots { + QLIST_HEAD(slots, QemuRamSlot) slots; +} QemuRamSlots; +extern QemuRamSlots ram_slots; + +void qemu_ram_register(target_phys_addr_t start_addr, ram_addr_t size, + ram_addr_t phys_offset); +void qemu_ram_unregister(target_phys_addr_t start_addr, ram_addr_t size); +#endif ^ permalink raw reply related [flat|nested] 32+ messages in thread
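The slot registry in this patch looks up slots by exact (start, size) match and treats a partially overlapping registration as a fatal error. A self-contained sketch of those semantics, using a plain singly linked list instead of QEMU's QLIST macros and stand-in typedefs for the QEMU address types:

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t hwaddr_t;       /* stand-in for target_phys_addr_t */

typedef struct RamSlot {
    hwaddr_t start_addr;
    uint64_t size;
    uint64_t offset;             /* ram_addr_t backing offset */
    struct RamSlot *next;
} RamSlot;

static RamSlot *ram_slots;

static int ranges_overlap(hwaddr_t a, uint64_t asz, hwaddr_t b, uint64_t bsz)
{
    return a < b + bsz && b < a + asz;
}

/* Exact match returns the slot; a partial overlap is a programming
 * error, mirroring the abort() in qemu_ram_find_slot(). */
RamSlot *find_slot(hwaddr_t start, uint64_t size)
{
    RamSlot *s;
    for (s = ram_slots; s; s = s->next) {
        if (s->start_addr == start && s->size == size)
            return s;
        if (ranges_overlap(start, size, s->start_addr, s->size))
            abort();
    }
    return NULL;
}

void ram_slot_register(hwaddr_t start, uint64_t size, uint64_t offset)
{
    RamSlot *s;
    if (!size)
        return;
    assert(find_slot(start, size) == NULL);   /* no double registration */
    s = calloc(1, sizeof(*s));
    s->start_addr = start;
    s->size = size;
    s->offset = offset;
    s->next = ram_slots;
    ram_slots = s;
    /* real code also calls cpu_register_physical_memory(start, size, offset) */
}

void ram_slot_unregister(hwaddr_t start, uint64_t size)
{
    RamSlot **p, *s;
    if (!size)
        return;
    for (p = &ram_slots; (s = *p) != NULL; p = &s->next) {
        if (s->start_addr == start && s->size == size) {
            *p = s->next;
            free(s);
            /* real code re-registers the range as IO_MEM_UNASSIGNED */
            return;
        }
    }
    abort();   /* unknown slot, like the assert in the patch */
}
```

This keeps the API's contract visible: whoever registered a slot must unregister it with the identical (start, size) pair.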
* [Qemu-devel] Re: [RFC PATCH 2/5] Minimal RAM API support 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support Alex Williamson @ 2010-07-11 18:18 ` Alex Williamson 2010-07-11 18:20 ` Avi Kivity 1 sibling, 0 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:18 UTC (permalink / raw) To: kvm; +Cc: chrisw, mst, qemu-devel, pugs On Sun, 2010-07-11 at 12:09 -0600, Alex Williamson wrote: > This adds a minimum chunk of Anthony's RAM API support so that we > can identify actual VM RAM versus all the other things that make > use of qemu_ram_alloc. > > Signed-off-by: Alex Williamson <alex.williamson@gmail.com> Oops, I'm making up email address now ;) Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > --- > > Makefile.target | 1 + > cpu-common.h | 2 + > memory.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > memory.h | 23 ++++++++++++++++ > 4 files changed, 103 insertions(+), 0 deletions(-) > create mode 100644 memory.c > create mode 100644 memory.h > > diff --git a/Makefile.target b/Makefile.target > index 7489910..0c1b916 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -173,6 +173,7 @@ obj-$(CONFIG_VIRTFS) += virtio-9p.o > obj-y += rwhandler.o > obj-$(CONFIG_KVM) += kvm.o kvm-all.o > obj-$(CONFIG_NO_KVM) += kvm-stub.o > +obj-y += memory.o > LIBS+=-lz > > QEMU_CFLAGS += $(VNC_TLS_CFLAGS) > diff --git a/cpu-common.h b/cpu-common.h > index 8d03f4e..7e76f9d 100644 > --- a/cpu-common.h > +++ b/cpu-common.h > @@ -23,6 +23,8 @@ > /* address in the RAM (different from a physical address) */ > typedef unsigned long ram_addr_t; > > +#include "memory.h" > + > /* memory API */ > > typedef void CPUWriteMemoryFunc(void *opaque, target_phys_addr_t addr, uint32_t value); > diff --git a/memory.c b/memory.c > new file mode 100644 > index 0000000..f5f9273 > --- /dev/null > +++ b/memory.c > @@ -0,0 +1,77 @@ > +/* > + * virtual page mapping and translated block handling > + * > + * Copyright (c) 2003 Fabrice Bellard > 
+ * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, see <http://www.gnu.org/licenses/>. > + */ > +#include "memory.h" > + > +QemuRamSlots ram_slots = { .slots = QLIST_HEAD_INITIALIZER(ram_slots) }; > + > +static QemuRamSlot *qemu_ram_find_slot(target_phys_addr_t start_addr, > + ram_addr_t size) > +{ > + QemuRamSlot *slot; > + > + QLIST_FOREACH(slot, &ram_slots.slots, next) { > + if (slot->start_addr == start_addr && slot->size == size) { > + return slot; > + } > + > + if ((start_addr - slot->start_addr < slot->size) || > + (start_addr + size - slot->start_addr < slot->size)) > + abort(); > + } > + > + return NULL; > +} > + > +void qemu_ram_register(target_phys_addr_t start_addr, ram_addr_t size, > + ram_addr_t phys_offset) > +{ > + QemuRamSlot *slot; > + > + if (!size) > + return; > + > + slot = qemu_ram_find_slot(start_addr, size); > + assert(slot == NULL); > + > + slot = qemu_mallocz(sizeof(QemuRamSlot)); > + > + slot->start_addr = start_addr; > + slot->size = size; > + slot->offset = phys_offset; > + > + QLIST_INSERT_HEAD(&ram_slots.slots, slot, next); > + > + cpu_register_physical_memory(slot->start_addr, slot->size, slot->offset); > +} > + > +void qemu_ram_unregister(target_phys_addr_t start_addr, ram_addr_t size) > +{ > + QemuRamSlot *slot; > + > + if (!size) > + return; > + > + slot = qemu_ram_find_slot(start_addr, size); > + assert(slot != NULL); > + > 
+ QLIST_REMOVE(slot, next); > + cpu_register_physical_memory(start_addr, size, IO_MEM_UNASSIGNED); > + > + return; > +} > diff --git a/memory.h b/memory.h > new file mode 100644 > index 0000000..91e552e > --- /dev/null > +++ b/memory.h > @@ -0,0 +1,23 @@ > +#ifndef QEMU_MEMORY_H > +#define QEMU_MEMORY_H > + > +#include "qemu-common.h" > +#include "cpu-common.h" > + > +typedef struct QemuRamSlot { > + target_phys_addr_t start_addr; > + ram_addr_t size; > + ram_addr_t offset; > + void *host; > + QLIST_ENTRY(QemuRamSlot) next; > +} QemuRamSlot; > + > +typedef struct QemuRamSlots { > + QLIST_HEAD(slots, QemuRamSlot) slots; > +} QemuRamSlots; > +extern QemuRamSlots ram_slots; > + > +void qemu_ram_register(target_phys_addr_t start_addr, ram_addr_t size, > + ram_addr_t phys_offset); > +void qemu_ram_unregister(target_phys_addr_t start_addr, ram_addr_t size); > +#endif > ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 2/5] Minimal RAM API support 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support Alex Williamson 2010-07-11 18:18 ` [Qemu-devel] " Alex Williamson @ 2010-07-11 18:20 ` Avi Kivity 2010-07-11 18:24 ` Alex Williamson 1 sibling, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:20 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:09 PM, Alex Williamson wrote: > This adds a minimum chunk of Anthony's RAM API support so that we > can identify actual VM RAM versus all the other things that make > use of qemu_ram_alloc. > > typedef void CPUWriteMemoryFunc(void *opaque, target_phys_addr_t addr, uint32_t value); > diff --git a/memory.c b/memory.c > new file mode 100644 > index 0000000..f5f9273 > --- /dev/null > +++ b/memory.c > @@ -0,0 +1,77 @@ > +/* > + * virtual page mapping and translated block handling > + * > + * Copyright (c) 2003 Fabrice Bellard > Really? Looks like new code. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 2/5] Minimal RAM API support 2010-07-11 18:20 ` Avi Kivity @ 2010-07-11 18:24 ` Alex Williamson 2010-07-11 18:29 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:24 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, mst, qemu-devel, kvm, pugs On Sun, 2010-07-11 at 21:20 +0300, Avi Kivity wrote: > On 07/11/2010 09:09 PM, Alex Williamson wrote: > > This adds a minimum chunk of Anthony's RAM API support so that we > > can identify actual VM RAM versus all the other things that make > > use of qemu_ram_alloc. > > > > typedef void CPUWriteMemoryFunc(void *opaque, target_phys_addr_t addr, uint32_t value); > > diff --git a/memory.c b/memory.c > > new file mode 100644 > > index 0000000..f5f9273 > > --- /dev/null > > +++ b/memory.c > > @@ -0,0 +1,77 @@ > > +/* > > + * virtual page mapping and translated block handling > > + * > > + * Copyright (c) 2003 Fabrice Bellard > > > > Really? Looks like new code. Yeah, I just stole the copyright Anthony used for this file. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 2/5] Minimal RAM API support 2010-07-11 18:24 ` Alex Williamson @ 2010-07-11 18:29 ` Avi Kivity 0 siblings, 0 replies; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:29 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:24 PM, Alex Williamson wrote: > > >> Really? Looks like new code. >> > Yeah, I just stole the copyright Anthony used for this file. > > Traditionally, one steals the code and updates the attributions, not the other way round. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] [RFC PATCH 3/5] RAM API: Make use of it for x86 PC 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 1/5] qemu_ram_map/unmap: Allow pre-allocated space to be mapped Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support Alex Williamson @ 2010-07-11 18:09 ` Alex Williamson 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 4/5] APIC/IOAPIC EOI callback Alex Williamson ` (2 subsequent siblings) 5 siblings, 0 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst Register the actual VM RAM using the new API Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- hw/pc.c | 12 ++++++------ 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/pc.c b/hw/pc.c index a96187f..b9ce6ae 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -911,13 +911,13 @@ void pc_memory_init(ram_addr_t ram_size, /* allocate RAM */ ram_addr = qemu_ram_alloc(NULL, "pc.ram", below_4g_mem_size + above_4g_mem_size); - cpu_register_physical_memory(0, 0xa0000, ram_addr); - cpu_register_physical_memory(0x100000, - below_4g_mem_size - 0x100000, - ram_addr + 0x100000); + + qemu_ram_register(0, 0xa0000, ram_addr); + qemu_ram_register(0x100000, below_4g_mem_size - 0x100000, + ram_addr + 0x100000); #if TARGET_PHYS_ADDR_BITS > 32 - cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size, - ram_addr + below_4g_mem_size); + qemu_ram_register(0x100000000ULL, above_4g_mem_size, + ram_addr + below_4g_mem_size); #endif /* BIOS load */ ^ permalink raw reply related [flat|nested] 32+ messages in thread
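The pc.c hunk above registers guest RAM as up to three slots, skipping the VGA hole at 0xa0000-0x100000 and splitting around the 4G boundary. A sketch of that layout computation; the below/above-4G split point is an assumption here (pc_memory_init receives the two sizes from its caller, so the 0xe0000000 cap is illustrative, not taken from the patch):

```c
#include <stdint.h>

#define VGA_HOLE_START  0xa0000ULL       /* conventional memory ends here */
#define LOW_RAM_END     0x100000ULL      /* 1MB: RAM resumes after the hole */
#define BELOW_4G_MAX    0xe0000000ULL    /* assumed split point, ~3.5G */

struct ram_layout {
    uint64_t below_4g, above_4g;
    uint64_t slots[3][3];   /* {guest physical start, size, ram_addr offset} */
    int nslots;
};

void pc_ram_layout(uint64_t ram_size, struct ram_layout *l)
{
    l->below_4g = ram_size > BELOW_4G_MAX ? BELOW_4G_MAX : ram_size;
    l->above_4g = ram_size - l->below_4g;
    l->nslots = 0;

    /* 0 .. 0xa0000: memory below the VGA hole */
    l->slots[l->nslots][0] = 0;
    l->slots[l->nslots][1] = VGA_HOLE_START;
    l->slots[l->nslots][2] = 0;
    l->nslots++;

    /* 1MB .. below_4g: the 0xa0000-0x100000 hole itself is never RAM */
    l->slots[l->nslots][0] = LOW_RAM_END;
    l->slots[l->nslots][1] = l->below_4g - LOW_RAM_END;
    l->slots[l->nslots][2] = LOW_RAM_END;
    l->nslots++;

    /* 4G .. 4G+above_4g: backed by the ram_addr range after below_4g */
    if (l->above_4g) {
        l->slots[l->nslots][0] = 1ULL << 32;
        l->slots[l->nslots][1] = l->above_4g;
        l->slots[l->nslots][2] = l->below_4g;
        l->nslots++;
    }
}
```

Each resulting triple corresponds to one qemu_ram_register() call in the patch, all backed by the single "pc.ram" allocation.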
* [Qemu-devel] [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson ` (2 preceding siblings ...) 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 3/5] RAM API: Make use of it for x86 PC Alex Williamson @ 2010-07-11 18:09 ` Alex Williamson 2010-07-11 18:14 ` [Qemu-devel] " Avi Kivity 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment Alex Williamson 2010-07-11 18:17 ` [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO " Avi Kivity 5 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst For device assignment, we need to know when the VM writes an end of interrupt to the APIC, which allows us to de-assert the interrupt line and clear the DisINTx bit. Add a new wrapper for ioapic generated interrupts with a callback on eoi and create an interface for drivers to be notified on eoi. Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- hw/apic.c | 18 ++++++++++++++++-- hw/apic.h | 4 ++++ hw/ioapic.c | 29 +++++++++++++++++++++++++++-- hw/pc.h | 12 +++++++++++- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/hw/apic.c b/hw/apic.c index d686b51..8f512df 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -21,6 +21,7 @@ #include "qemu-timer.h" #include "host-utils.h" #include "sysbus.h" +#include "pc.h" //#define DEBUG_APIC //#define DEBUG_COALESCING @@ -119,6 +120,7 @@ struct APICState { int wait_for_sipi; }; +static uint8_t vector_to_gsi_map[256] = { 0xff }; static APICState *local_apics[MAX_APICS + 1]; static int apic_irq_delivered; @@ -308,6 +310,15 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, trigger_mode); } +void apic_deliver_ioapic_irq(uint8_t dest, uint8_t dest_mode, + uint8_t delivery_mode, uint8_t vector_num, + uint8_t polarity, uint8_t trigger_mode, int gsi) +{ + vector_to_gsi_map[vector_num] = gsi; + apic_deliver_irq(dest, dest_mode, 
delivery_mode, + vector_num, polarity, trigger_mode); +} + void cpu_set_apic_base(DeviceState *d, uint64_t val) { APICState *s = DO_UPCAST(APICState, busdev.qdev, d); @@ -432,8 +443,11 @@ static void apic_eoi(APICState *s) if (isrv < 0) return; reset_bit(s->isr, isrv); - /* XXX: send the EOI packet to the APIC bus to allow the I/O APIC to - set the remote IRR bit for level triggered interrupts. */ + + if (vector_to_gsi_map[isrv] != 0xff) { + ioapic_eoi(vector_to_gsi_map[isrv]); + vector_to_gsi_map[isrv] = 0xff; + } apic_update_irq(s); } diff --git a/hw/apic.h b/hw/apic.h index 8a0c9d0..59d0e37 100644 --- a/hw/apic.h +++ b/hw/apic.h @@ -8,6 +8,10 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode, uint8_t vector_num, uint8_t polarity, uint8_t trigger_mode); +void apic_deliver_ioapic_irq(uint8_t dest, uint8_t dest_mode, + uint8_t delivery_mode, + uint8_t vector_num, uint8_t polarity, + uint8_t trigger_mode, int gsi); int apic_accept_pic_intr(DeviceState *s); void apic_deliver_pic_intr(DeviceState *s, int level); int apic_get_interrupt(DeviceState *s); diff --git a/hw/ioapic.c b/hw/ioapic.c index 5ae21e9..1e2fc2e 100644 --- a/hw/ioapic.c +++ b/hw/ioapic.c @@ -26,6 +26,7 @@ #include "qemu-timer.h" #include "host-utils.h" #include "sysbus.h" +#include "qlist.h" //#define DEBUG_IOAPIC @@ -61,6 +62,30 @@ struct IOAPICState { uint64_t ioredtbl[IOAPIC_NUM_PINS]; }; +static QLIST_HEAD(ioapic_eoi_client_list, + ioapic_eoi_client) ioapic_eoi_client_list = + QLIST_HEAD_INITIALIZER(ioapic_eoi_client_list); + +void ioapic_register_eoi_client(ioapic_eoi_client *client) +{ + QLIST_INSERT_HEAD(&ioapic_eoi_client_list, client, list); +} + +void ioapic_unregister_eoi_client(ioapic_eoi_client *client) +{ + QLIST_REMOVE(client, list); +} + +void ioapic_eoi(int gsi) +{ + ioapic_eoi_client *client; + QLIST_FOREACH(client, &ioapic_eoi_client_list, list) { + if (client->irq == gsi) { + client->eoi(client); + } + } +} + static void ioapic_service(IOAPICState *s) 
{ uint8_t i; @@ -90,8 +115,8 @@ static void ioapic_service(IOAPICState *s) else vector = entry & 0xff; - apic_deliver_irq(dest, dest_mode, delivery_mode, - vector, polarity, trig_mode); + apic_deliver_ioapic_irq(dest, dest_mode, delivery_mode, + vector, polarity, trig_mode, i); } } } diff --git a/hw/pc.h b/hw/pc.h index 63b0249..a88019f 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -48,8 +48,18 @@ typedef struct isa_irq_state { void isa_irq_handler(void *opaque, int n, int level); -/* i8254.c */ +struct ioapic_eoi_client; +typedef struct ioapic_eoi_client ioapic_eoi_client; +struct ioapic_eoi_client { + void (*eoi)(struct ioapic_eoi_client *client); + int irq; + QLIST_ENTRY(ioapic_eoi_client) list; +}; +void ioapic_register_eoi_client(ioapic_eoi_client *client); +void ioapic_unregister_eoi_client(ioapic_eoi_client *client); +void ioapic_eoi(int gsi); +/* i8254.c */ #define PIT_FREQ 1193182 typedef struct PITState PITState; ^ permalink raw reply related [flat|nested] 32+ messages in thread
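The interface added in patch 4 is a per-GSI callback list: a driver registers an ioapic_eoi_client, and the APIC's EOI path (after mapping the vector back to a GSI via vector_to_gsi_map[]) fires every client registered for that GSI. A minimal standalone version of that dispatch, with a plain singly linked list replacing QEMU's QLIST and a hypothetical counting device as the example client:

```c
#include <stddef.h>

typedef struct eoi_client {
    void (*eoi)(struct eoi_client *client);
    int irq;                     /* GSI this client cares about */
    struct eoi_client *next;
} eoi_client;

static eoi_client *eoi_clients;

void eoi_client_register(eoi_client *c)
{
    c->next = eoi_clients;
    eoi_clients = c;
}

void eoi_client_unregister(eoi_client *c)
{
    eoi_client **p;
    for (p = &eoi_clients; *p; p = &(*p)->next) {
        if (*p == c) {
            *p = c->next;
            return;
        }
    }
}

/* Called from the APIC EOI path once the vector has been translated
 * back to the GSI that raised it. */
void ioapic_eoi(int gsi)
{
    eoi_client *c;
    for (c = eoi_clients; c; c = c->next) {
        if (c->irq == gsi)
            c->eoi(c);
    }
}

/* Hypothetical client: a device that counts EOIs so it knows when it
 * may de-assert its INTx line and clear DisINTx. */
typedef struct {
    eoi_client client;       /* must be first: recovered by cast below */
    int eoi_count;
} test_device;

void test_device_eoi(eoi_client *c)
{
    ((test_device *)c)->eoi_count++;   /* container_of-style recovery */
}
```

Embedding the eoi_client in the device state, as here, is what lets the callback find its owner without a separate opaque pointer.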
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 4/5] APIC/IOAPIC EOI callback Alex Williamson @ 2010-07-11 18:14 ` Avi Kivity 2010-07-11 18:26 ` Alex Williamson 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:14 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:09 PM, Alex Williamson wrote: > For device assignment, we need to know when the VM writes an end > of interrupt to the APIC, which allows us to de-assert the interrupt > line and clear the DisINTx bit. Add a new wrapper for ioapic > generated interrupts with a callback on eoi and create an interface > for drivers to be notified on eoi. > You aren't going to get this with kvm's in-kernel irqchip, so we need a new interface there. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:14 ` [Qemu-devel] " Avi Kivity @ 2010-07-11 18:26 ` Alex Williamson 2010-07-11 18:30 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:26 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, mst, qemu-devel, kvm, pugs On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > On 07/11/2010 09:09 PM, Alex Williamson wrote: > > For device assignment, we need to know when the VM writes an end > > of interrupt to the APIC, which allows us to de-assert the interrupt > > line and clear the DisINTx bit. Add a new wrapper for ioapic > > generated interrupts with a callback on eoi and create an interface > > for drivers to be notified on eoi. > > > > You aren't going to get this with kvm's in-kernel irqchip, so we need a > new interface there. Registering an eventfd for the eoi seems like a reasonable alternative. I also need to figure out how to avoid bouncing the vfio interrupt events through qemu, but it's a functional start. Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:26 ` Alex Williamson @ 2010-07-11 18:30 ` Avi Kivity 2010-07-11 18:54 ` Michael S. Tsirkin 2010-07-12 6:33 ` Avi Kivity 0 siblings, 2 replies; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:30 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:26 PM, Alex Williamson wrote: > On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > >> On 07/11/2010 09:09 PM, Alex Williamson wrote: >> >>> For device assignment, we need to know when the VM writes an end >>> of interrupt to the APIC, which allows us to de-assert the interrupt >>> line and clear the DisINTx bit. Add a new wrapper for ioapic >>> generated interrupts with a callback on eoi and create an interface >>> for drivers to be notified on eoi. >>> >>> >> You aren't going to get this with kvm's in-kernel irqchip, so we need a >> new interface there. >> > Registering an eventfd for the eoi seems like a reasonable alternative. > I'm worried about that racing (with what?) > I also need to figure out how to avoid bouncing the vfio interrupt > events through qemu, but it's a functional start. Thanks, > I thought the scheduler has/wants to have something that moves the irq to whatever thread it wakes up. With irqfd, it would flow naturally. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:30 ` Avi Kivity @ 2010-07-11 18:54 ` Michael S. Tsirkin 2010-07-11 19:21 ` Alex Williamson 2010-07-12 6:33 ` Avi Kivity 1 sibling, 1 reply; 32+ messages in thread From: Michael S. Tsirkin @ 2010-07-11 18:54 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, Alex Williamson, qemu-devel, kvm, pugs On Sun, Jul 11, 2010 at 09:30:59PM +0300, Avi Kivity wrote: > On 07/11/2010 09:26 PM, Alex Williamson wrote: > >On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > >>On 07/11/2010 09:09 PM, Alex Williamson wrote: > >>>For device assignment, we need to know when the VM writes an end > >>>of interrupt to the APIC, which allows us to de-assert the interrupt > >>>line and clear the DisINTx bit. Add a new wrapper for ioapic > >>>generated interrupts with a callback on eoi and create an interface > >>>for drivers to be notified on eoi. > >>> > >>You aren't going to get this with kvm's in-kernel irqchip, so we need a > >>new interface there. > >Registering an eventfd for the eoi seems like a reasonable alternative. > > I'm worried about that racing (with what?) With device asserting the interrupt? Need to make sure that all possible scenarious work well: device asserts interrupt driver clears interrupt device asserts interrupt eoi device asserts interrupt driver clears interrupt eoi device asserts interrupt etc Not that I see issues, these are things we need to check. > >I also need to figure out how to avoid bouncing the vfio interrupt > >events through qemu, but it's a functional start. Thanks, > > I thought the scheduler has/wants to have something that moves the > irq to whatever thread it wakes up. With irqfd, it would flow > naturally. > > -- > I have a truly marvellous patch that fixes the bug which this > signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:54 ` Michael S. Tsirkin @ 2010-07-11 19:21 ` Alex Williamson 2010-07-11 19:23 ` Michael S. Tsirkin 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 19:21 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, 2010-07-11 at 21:54 +0300, Michael S. Tsirkin wrote: > On Sun, Jul 11, 2010 at 09:30:59PM +0300, Avi Kivity wrote: > > On 07/11/2010 09:26 PM, Alex Williamson wrote: > > >On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > > >>On 07/11/2010 09:09 PM, Alex Williamson wrote: > > >>>For device assignment, we need to know when the VM writes an end > > >>>of interrupt to the APIC, which allows us to de-assert the interrupt > > >>>line and clear the DisINTx bit. Add a new wrapper for ioapic > > >>>generated interrupts with a callback on eoi and create an interface > > >>>for drivers to be notified on eoi. > > >>> > > >>You aren't going to get this with kvm's in-kernel irqchip, so we need a > > >>new interface there. > > >Registering an eventfd for the eoi seems like a reasonable alternative. > > > > I'm worried about that racing (with what?) > > With device asserting the interrupt? > Need to make sure that all possible scenarious work well: > > device asserts interrupt > driver clears interrupt > device asserts interrupt > eoi > > device asserts interrupt > driver clears interrupt > eoi > device asserts interrupt > > etc > > Not that I see issues, these are things we need to check. I think those are all protected by host and qemu vfio drivers managing DisINTx. The way I understand it to work now is: device asserts interrupt interrupt lands in host vfio driver host vfio sets DisINTx on the device host vfio sends eventfd eventfd lands in qemu vfio, does a qemu_set_irq ... 
guest processes guest writes eoi to apic, lands back in qemu vfio driver qemu vfio deasserts qemu interrupt qemu vfio clears DisINTx So I don't think there's a race as long as ordering is sane for toggling DisINTx. Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
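The sequence Alex walks through can be written down as a small state machine: the host masks DisINTx before signalling the eventfd, so a device that re-fires before the guest's EOI simply cannot raise a second INTx. This is an illustrative model of that ordering argument, not VFIO code:

```c
struct intx_state {
    int disintx;      /* device's INTx disable bit (set by host vfio) */
    int line;         /* virtual interrupt line into the guest */
    int pending;      /* eventfd signalled, not yet injected by qemu */
};

/* Device fires: only reaches the host handler if INTx isn't masked. */
void device_assert(struct intx_state *s)
{
    if (s->disintx)
        return;            /* masked: the device can't raise INTx */
    s->disintx = 1;        /* host vfio masks before signalling... */
    s->pending = 1;        /* ...then signals the eventfd */
}

/* qemu vfio: eventfd fired, inject into the guest. */
void qemu_inject(struct intx_state *s)
{
    if (s->pending) {
        s->pending = 0;
        s->line = 1;       /* qemu_set_irq(..., 1) */
    }
}

/* Guest EOI lands back in qemu vfio: de-assert, then unmask. */
void guest_eoi(struct intx_state *s)
{
    s->line = 0;           /* qemu_set_irq(..., 0) */
    s->disintx = 0;        /* device may interrupt again */
}
```

Under this model, each of Michael's interleavings collapses to "at most one interrupt in flight between mask and EOI", which is the invariant the ordering of DisINTx updates is meant to preserve.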
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 19:21 ` Alex Williamson @ 2010-07-11 19:23 ` Michael S. Tsirkin 2010-07-11 20:03 ` Alex Williamson 0 siblings, 1 reply; 32+ messages in thread From: Michael S. Tsirkin @ 2010-07-11 19:23 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, Jul 11, 2010 at 01:21:18PM -0600, Alex Williamson wrote: > On Sun, 2010-07-11 at 21:54 +0300, Michael S. Tsirkin wrote: > > On Sun, Jul 11, 2010 at 09:30:59PM +0300, Avi Kivity wrote: > > > On 07/11/2010 09:26 PM, Alex Williamson wrote: > > > >On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > > > >>On 07/11/2010 09:09 PM, Alex Williamson wrote: > > > >>>For device assignment, we need to know when the VM writes an end > > > >>>of interrupt to the APIC, which allows us to de-assert the interrupt > > > >>>line and clear the DisINTx bit. Add a new wrapper for ioapic > > > >>>generated interrupts with a callback on eoi and create an interface > > > >>>for drivers to be notified on eoi. > > > >>> > > > >>You aren't going to get this with kvm's in-kernel irqchip, so we need a > > > >>new interface there. > > > >Registering an eventfd for the eoi seems like a reasonable alternative. > > > > > > I'm worried about that racing (with what?) > > > > With device asserting the interrupt? > > Need to make sure that all possible scenarious work well: > > > > device asserts interrupt > > driver clears interrupt > > device asserts interrupt > > eoi > > > > device asserts interrupt > > driver clears interrupt > > eoi > > device asserts interrupt > > > > etc > > > > Not that I see issues, these are things we need to check. > > I think those are all protected by host and qemu vfio drivers managing > DisINTx. The way I understand it to work now is: > > device asserts interrupt > interrupt lands in host vfio driver > host vfio sets DisINTx on the device > host vfio sends eventfd > eventfd lands in qemu vfio, does a qemu_set_irq > ... 
guest processes > guest writes eoi to apic, lands back in qemu vfio driver > qemu vfio deasserts qemu interrupt > qemu vfio clears DisINTx > > So I don't think there's a race as long as ordering is sane for toggling > DisINTx. Thanks, > > Alex > What about threaded interrupts? I think (correct me if I am wrong) that they work like this: device asserts interrupt guest disables interrupt eoi guest enables interrupt driver clears interrupt device asserts interrupt If so, your code will clear DisINTx immediately which will always get us another host interrupt: performance will be hurt. I am also not sure we'll not lose interrupts. It seems we need to track interrupt disable/enable as well, and only clear DisINTx after eoi with interrupts enabled. Not sure what is the interface for this. -- MST ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 19:23 ` Michael S. Tsirkin @ 2010-07-11 20:03 ` Alex Williamson 2010-07-11 20:05 ` Michael S. Tsirkin 2010-07-11 20:12 ` Michael S. Tsirkin 0 siblings, 2 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 20:03 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, 2010-07-11 at 22:23 +0300, Michael S. Tsirkin wrote: > On Sun, Jul 11, 2010 at 01:21:18PM -0600, Alex Williamson wrote: > > On Sun, 2010-07-11 at 21:54 +0300, Michael S. Tsirkin wrote: > > > On Sun, Jul 11, 2010 at 09:30:59PM +0300, Avi Kivity wrote: > > > > On 07/11/2010 09:26 PM, Alex Williamson wrote: > > > > >On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > > > > >>On 07/11/2010 09:09 PM, Alex Williamson wrote: > > > > >>>For device assignment, we need to know when the VM writes an end > > > > >>>of interrupt to the APIC, which allows us to de-assert the interrupt > > > > >>>line and clear the DisINTx bit. Add a new wrapper for ioapic > > > > >>>generated interrupts with a callback on eoi and create an interface > > > > >>>for drivers to be notified on eoi. > > > > >>> > > > > >>You aren't going to get this with kvm's in-kernel irqchip, so we need a > > > > >>new interface there. > > > > >Registering an eventfd for the eoi seems like a reasonable alternative. > > > > > > > > I'm worried about that racing (with what?) > > > > > > With device asserting the interrupt? > > > Need to make sure that all possible scenarious work well: > > > > > > device asserts interrupt > > > driver clears interrupt > > > device asserts interrupt > > > eoi > > > > > > device asserts interrupt > > > driver clears interrupt > > > eoi > > > device asserts interrupt > > > > > > etc > > > > > > Not that I see issues, these are things we need to check. > > > > I think those are all protected by host and qemu vfio drivers managing > > DisINTx. 
The way I understand it to work now is: > > > > device asserts interrupt > > interrupt lands in host vfio driver > > host vfio sets DisINTx on the device > > host vfio sends eventfd > > eventfd lands in qemu vfio, does a qemu_set_irq > > ... guest processes > > guest writes eoi to apic, lands back in qemu vfio driver > > qemu vfio deasserts qemu interrupt > > qemu vfio clears DisINTx > > > > So I don't think there's a race as long as ordering is sane for toggling > > DisINTx. Thanks, > > > > Alex > > > > What about threaded interrupts? I think (correct me if I am wrong) > that they work like this: > > device asserts interrupt > guest disables interrupt Is this the guest manipulating DisINTx itself? I suppose it could be a device dependent disable as well. > eoi > guest enables interrupt > driver clears interrupt These two are hopefully reversed or else the driver is expecting to clear and potentially reassert interrupts anyway. > device asserts interrupt > > If so, your code will clear DisINTx immediately which > will always get us another host interrupt: > performance will be hurt. I am also not sure > we'll not lose interrupts. Level interrupts are lossy afaik, if it gets cleared but an interrupt condition still exists, it should be reasserted. > It seems we need to track interrupt disable/enable as well, and only > clear DisINTx after eoi with interrupts enabled. Not sure what is the > interface for this. If a driver uses device dependent code to disable interrupts, there's no issue, we'll clear DisINTx, but the device still won't generate an interrupt until the dependent code is re-enabled by the guest (assuming there's no cross talk between DisINTx and device dependent components). For the case that a guest driver disables via DisINTx, it seems easy to trap and track that. 
So we get: device asserts interrupt guest disables interrupt (trapped, qemu-vfio sets intx.guest_disabled = 1) eoi (qemu-vfio deasserts qemu interrupts, but because of above doesn't clear DisINTx) guest enables interrupt (allowed to pass through, intx.guest_disabled = 0) driver clears interrupt device asserts interrupt I've already got an intx.pending bit, so I think this just changes the eoi to: vdev->intx.pending = 0; qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); if (!vdev->intx.guest_disabled) { vfio_unmask_intx(vdev); } Writing the command register DisINTx bit then just gets some kind of: if (cmd & PCI_COMMAND_INTX_DISABLE && intx.pending) { intx.guest_disabled = 1; cmd &= ~PCI_COMMAND_INTX_DISABLE; } else if (!(cmd & PCI_COMMAND_INTX_DISABLE) && intx.guest_disabled) { intx.guest_disabled = 0; } ... allow write That work? Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 20:03 ` Alex Williamson @ 2010-07-11 20:05 ` Michael S. Tsirkin 2010-07-11 20:12 ` Michael S. Tsirkin 1 sibling, 0 replies; 32+ messages in thread From: Michael S. Tsirkin @ 2010-07-11 20:05 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, Jul 11, 2010 at 02:03:34PM -0600, Alex Williamson wrote: > On Sun, 2010-07-11 at 22:23 +0300, Michael S. Tsirkin wrote: > > On Sun, Jul 11, 2010 at 01:21:18PM -0600, Alex Williamson wrote: > > > On Sun, 2010-07-11 at 21:54 +0300, Michael S. Tsirkin wrote: > > > > On Sun, Jul 11, 2010 at 09:30:59PM +0300, Avi Kivity wrote: > > > > > On 07/11/2010 09:26 PM, Alex Williamson wrote: > > > > > >On Sun, 2010-07-11 at 21:14 +0300, Avi Kivity wrote: > > > > > >>On 07/11/2010 09:09 PM, Alex Williamson wrote: > > > > > >>>For device assignment, we need to know when the VM writes an end > > > > > >>>of interrupt to the APIC, which allows us to de-assert the interrupt > > > > > >>>line and clear the DisINTx bit. Add a new wrapper for ioapic > > > > > >>>generated interrupts with a callback on eoi and create an interface > > > > > >>>for drivers to be notified on eoi. > > > > > >>> > > > > > >>You aren't going to get this with kvm's in-kernel irqchip, so we need a > > > > > >>new interface there. > > > > > >Registering an eventfd for the eoi seems like a reasonable alternative. > > > > > > > > > > I'm worried about that racing (with what?) > > > > > > > > With device asserting the interrupt? > > > > Need to make sure that all possible scenarious work well: > > > > > > > > device asserts interrupt > > > > driver clears interrupt > > > > device asserts interrupt > > > > eoi > > > > > > > > device asserts interrupt > > > > driver clears interrupt > > > > eoi > > > > device asserts interrupt > > > > > > > > etc > > > > > > > > Not that I see issues, these are things we need to check. 
> > > > > > I think those are all protected by host and qemu vfio drivers managing > > > DisINTx. The way I understand it to work now is: > > > > > > device asserts interrupt > > > interrupt lands in host vfio driver > > > host vfio sets DisINTx on the device > > > host vfio sends eventfd > > > eventfd lands in qemu vfio, does a qemu_set_irq > > > ... guest processes > > > guest writes eoi to apic, lands back in qemu vfio driver > > > qemu vfio deasserts qemu interrupt > > > qemu vfio clears DisINTx > > > > > > So I don't think there's a race as long as ordering is sane for toggling > > > DisINTx. Thanks, > > > > > > Alex > > > > > > > What about threaded interrupts? I think (correct me if I am wrong) > > that they work like this: > > > > device asserts interrupt > > guest disables interrupt > > Is this the guest manipulating DisINTx itself? I suppose it could be a > device dependent disable as well. It can manipulate it, so we need to virtualize it, but that's a separate issue. > > eoi > > guest enables interrupt > > driver clears interrupt > > These two are hopefully reversed or else the driver is expecting to > clear and potentially reassert interrupts anyway. Yes. Sorry. > > device asserts interrupt > > > > If so, your code will clear DisINTx immediately which > > will always get us another host interrupt: > > performance will be hurt. I am also not sure > > we'll not lose interrupts. > > Level interrupts are lossy afaik, if it gets cleared but an interrupt > condition still exists, it should be reasserted. Yes but I mean we won't interrupt the guest. So it will stay disabled forever. > > It seems we need to track interrupt disable/enable as well, and only > > clear DisINTx after eoi with interrupts enabled. Not sure what is the > > interface for this. 
> > If a driver uses device dependent code to disable interrupts, > there's no > issue, we'll clear DisINTx, but the device still won't generate an > interrupt until the dependent code is re-enabled by the guest (assuming > there's no cross talk between DisINTx and device dependent components). > > For the case that a guest driver disables via DisINTx, it seems easy to > trap and track that. So we get: > > device asserts interrupt > guest disables interrupt > (trapped, qemu-vfio sets intx.guest_disabled = 1) > eoi > (qemu-vfio deasserts qemu interrupts, but because of above doesn't clear DisINTx) > guest enables interrupt > (allowed to pass through, intx.guest_disabled = 0) > driver clears interrupt > device asserts interrupt > > I've already got an intx.pending bit, so I think this just changes the eoi to: > > vdev->intx.pending = 0; > qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); > if (!vdev->intx.guest_disabled) { > vfio_unmask_intx(vdev); > } > > Writing the command register DisINTx bit then just gets some kind of: > > if (cmd & PCI_COMMAND_INTX_DISABLE && intx.pending) { > intx.guest_disabled = 1; > cmd &= ~PCI_COMMAND_INTX_DISABLE; > } else if (!(cmd & PCI_COMMAND_INTX_DISABLE) && intx.guest_disabled) { > intx.guest_disabled = 0; > } > ... allow write > > That work? Thanks, > > Alex No, I mean guest OS disables the specific interrupt with disable_irq. -- MST ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 20:03 ` Alex Williamson 2010-07-11 20:05 ` Michael S. Tsirkin @ 2010-07-11 20:12 ` Michael S. Tsirkin 2010-07-11 21:59 ` Alex Williamson 1 sibling, 1 reply; 32+ messages in thread From: Michael S. Tsirkin @ 2010-07-11 20:12 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, Jul 11, 2010 at 02:03:34PM -0600, Alex Williamson wrote: > > What about threaded interrupts? Just to make the point, imagine a nested virt situation which uses current kvm device assignment in guest. Look at the interrupt handler we have there. -- MST ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 20:12 ` Michael S. Tsirkin @ 2010-07-11 21:59 ` Alex Williamson 0 siblings, 0 replies; 32+ messages in thread From: Alex Williamson @ 2010-07-11 21:59 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: chrisw, pugs, Avi Kivity, kvm, qemu-devel On Sun, 2010-07-11 at 23:12 +0300, Michael S. Tsirkin wrote: > On Sun, Jul 11, 2010 at 02:03:34PM -0600, Alex Williamson wrote: > > > What about threaded interrupts? > > Just to make the point, imagine a nested virt situation > which uses current kvm device assignment in guest. > Look at the interrupt handler we have there. Is the problem you're worried about this: guest masks qemu ioapic rte device interrupt host vfio DisINTx+ qemu vfio calls qemu_set_irq ... In that case, the qemu ioapic irr bit is only toggled by qemu_set_irq for level triggered interrupts, so the interrupt will be asserted in the guest when it gets unmasked and we'll get the eoi. I can't figure out where your other scenario can leave the DisINTx+: device asserts interrupt a) DisINTx+ via host vfio guest disables interrupt b) DisINTx+ via guest, already set eoi c) DisINTx- via qemu vfio driver clears interrupt guest enables interrupt d) DisINTx- via guest device asserts interrupt So between c) & d) we're potentially getting more interrupts than we want, but I can't see anywhere that we drop DisINTx. If you have a scenario, let me know. Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-11 18:30 ` Avi Kivity 2010-07-11 18:54 ` Michael S. Tsirkin @ 2010-07-12 6:33 ` Avi Kivity 2010-07-12 9:05 ` Gleb Natapov 1 sibling, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-12 6:33 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:30 PM, Avi Kivity wrote: >> Registering an eventfd for the eoi seems like a reasonable alternative. > > I'm worried about that racing (with what?) I don't think there's a problem. First, the EOI message is itself asynchronous. While the write to the local APIC is synchronous, effects on the rest of the system are effected using an APIC message, which travels asynchronously. Second, a component that needs timely information doesn't have to wait; it can read the eventfd and be sure it has seen all EOIs up to now. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-12 6:33 ` Avi Kivity @ 2010-07-12 9:05 ` Gleb Natapov 2010-07-12 9:13 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Gleb Natapov @ 2010-07-12 9:05 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, kvm, mst, qemu-devel, Alex Williamson, pugs On Mon, Jul 12, 2010 at 09:33:12AM +0300, Avi Kivity wrote: > On 07/11/2010 09:30 PM, Avi Kivity wrote: > >>Registering an eventfd for the eoi seems like a reasonable alternative. > > > >I'm worried about that racing (with what?) > > I don't think there's a problem. > > First, the EOI message is itself asynchronous. While the write to > the local APIC is synchronous, effects on the rest of the system are > effected using an APIC message, which travels asynchronously. > > Second, a component that needs timely information doesn't have to > wait; it can read the eventfd and be sure it has seen all EOIs up to > now. > I remember we already discussed the use of eventfd for reporting EOI and decided against it, but I don't remember why. :( Was it because if we are going to export EOI to userspace anyway we want to be able to use it for RTC timedrift fixing and for that we need to know what CPU called EOI and eventfd can't provide that? -- Gleb. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 4/5] APIC/IOAPIC EOI callback 2010-07-12 9:05 ` Gleb Natapov @ 2010-07-12 9:13 ` Avi Kivity 0 siblings, 0 replies; 32+ messages in thread From: Avi Kivity @ 2010-07-12 9:13 UTC (permalink / raw) To: Gleb Natapov; +Cc: chrisw, kvm, mst, qemu-devel, Alex Williamson, pugs On 07/12/2010 12:05 PM, Gleb Natapov wrote: > On Mon, Jul 12, 2010 at 09:33:12AM +0300, Avi Kivity wrote: > >> On 07/11/2010 09:30 PM, Avi Kivity wrote: >> >>>> Registering an eventfd for the eoi seems like a reasonable alternative. >>>> >>> I'm worried about that racing (with what?) >>> >> I don't think there's a problem. >> >> First, the EOI message is itself asynchronous. While the write to >> the local APIC is synchronous, effects on the rest of the system are >> effected using an APIC message, which travels asynchronously. >> >> Second, a component that needs timely information doesn't have to >> wait; it can read the eventfd and be sure it has seen all EOIs up to >> now. >> >> > I remember we already discussed the use of eventfd for reporting EOI and > decided against it, but I don't remember why. :( Was it because if we > are going to export EOI to userspace anyway we want to be able to use it > for RTC timedrift fixing and for that we need to know what CPU called > EOI and eventfd can't provide that? > IIRC it was the synchronity argument. But it's bogus: if the RTC wants to know whether an ack occured before it makes some decision, all it has to do is read() the eventfd and find out. Another issue is which cpu issued the ack. I suppose we can have per-vcpu eventfds, though that's ugly. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson ` (3 preceding siblings ...) 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 4/5] APIC/IOAPIC EOI callback Alex Williamson @ 2010-07-11 18:09 ` Alex Williamson 2010-07-11 18:27 ` [Qemu-devel] " Avi Kivity 2010-07-11 18:17 ` [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO " Avi Kivity 5 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:09 UTC (permalink / raw) To: kvm, qemu-devel; +Cc: chrisw, alex.williamson, pugs, mst This patch adds qemu device assignment support using the proposed VFIO/UIOMMU kernel interfaces. The existing KVM-only device assignment code makes use of various pci sysfs files for config space, MMIO BAR mapping, and misc other config items. It then jumps over to KVM-specific ioctls for enabling interrupts and assigning devices to IOMMU domains. Finally, IO-port support uses in/out directly. This is a messy model to support and causes numerous issues when we try to allow unprivileged users to access PCI devices. VFIO/UIOMMU reduces this to two interfaces, /dev/vfioX and /dev/uiommu. The VFIO device file provides all the necessary support for accessing PCI config space, read/write/mmap BARs (including IO-port space), configuring INTx/MSI/MSI-X interrupts and setting up DMA mapping. The UIOMMU interface allows iommu domains to be created, and via vfio, devices can be bound to a domain. This provides an easier model to support (IMHO) and removes the bindings that make current device assignment only usable for KVM enabled guests. Usage is similar to KVM device assignment. Rather than binding the device to the pci-stub driver, vfio devices need to be bound to the vfio driver. From there, it's a simple matter of specifying the device as: -device vfio,host=01:00.0 This example requires either root privileges or proper permissions on /dev/uiommu and /dev/vfioX. 
To support unprivileged operation, the options vfiofd= and uiommufd= are available. Depending on the usage of uiommufd, each guest device can be assigned to the same iommu domain, or to independent iommu domains. In the example above, each device is assigned to a separate iommu domain. As VFIO has no KVM dependencies, this patch works with or without -enable-kvm. I have successfully used a couple assigned devices in a guest without KVM support, however Michael Tsirkin warns that tcg may not provide atomic operations to memory visible to the passthrough device, which could result in failures for devices depending on such for synchronization. This patch is functional, but hasn't seen a lot of testing. I've tested 82576 PFs and VFs, an Intel HDA audio device, and UHCI and EHCI USB devices (this actually includes INTx/MSI/MSI-X, 4k aligned MMIO BARs, non-4k aligned MMIO BARs, and IO-Port BARs). Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- Makefile.target | 1 hw/linux-vfio.h | 200 ++++++++ hw/vfio.c | 1295 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw/vfio.h | 90 ++++ 4 files changed, 1586 insertions(+), 0 deletions(-) create mode 100644 hw/linux-vfio.h create mode 100644 hw/vfio.c create mode 100644 hw/vfio.h diff --git a/Makefile.target b/Makefile.target index 0c1b916..4936d96 100644 --- a/Makefile.target +++ b/Makefile.target @@ -197,6 +197,7 @@ obj-i386-y += vmmouse.o vmport.o hpet.o obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o obj-i386-y += debugcon.o multiboot.o obj-i386-y += pc_piix.o +obj-i386-y += vfio.o # shared objects obj-ppc-y = ppc.o diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h new file mode 100644 index 0000000..06bd3f3 --- /dev/null +++ b/hw/linux-vfio.h @@ -0,0 +1,200 @@ +/* + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 
+ * Author: Tom Lyon, pugs@cisco.com + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Portions derived from drivers/uio/uio.c: + * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de> + * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de> + * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com> + * + * Portions derived from drivers/uio/uio_pci_generic.c: + * Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + */ + +/* + * VFIO driver - allow mapping and use of certain PCI devices + * in unprivileged user processes. 
(If IOMMU is present) + * Especially useful for Virtual Function parts of SR-IOV devices + */ + +#ifdef __KERNEL__ + +struct vfio_dev { + struct device *dev; + struct pci_dev *pdev; + u8 *pci_config_map; + int pci_config_size; + char name[8]; + int devnum; + int pmaster; + void __iomem *bar[PCI_ROM_RESOURCE+1]; + spinlock_t irqlock; /* guards command register accesses */ + int listeners; + u32 locked_pages; + struct mutex lgate; /* listener gate */ + struct mutex dgate; /* dma op gate */ + struct mutex igate; /* intr op gate */ + struct msix_entry *msix; + int nvec; + struct uiommu_domain *udomain; + int cachec; + struct eventfd_ctx *ev_irq; + struct eventfd_ctx *ev_msi; + struct eventfd_ctx **ev_msix; + struct { + u8 intr; + u8 bardirty; + u8 rombar[4]; + u8 bar[6*4]; + u8 msi[24]; + } vinfo; +}; + +struct vfio_listener { + struct vfio_dev *vdev; + struct list_head dm_list; + struct mm_struct *mm; + struct mmu_notifier mmu_notifier; +}; + +/* + * Structure for keeping track of memory nailed down by the + * user for DMA + */ +struct dma_map_page { + struct list_head list; + struct page **pages; + dma_addr_t daddr; + unsigned long vaddr; + int npage; + int rdwr; +}; + +/* VFIO class infrastructure */ +struct vfio_class { + struct kref kref; + struct class *class; +}; +extern struct vfio_class *vfio_class; + +ssize_t vfio_io_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_mem_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_config_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); + +void vfio_disable_msi(struct vfio_dev *); +void vfio_disable_msix(struct vfio_dev *); +int vfio_enable_msi(struct vfio_dev *, int); +int vfio_enable_msix(struct vfio_dev *, int, void __user *); + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#endif +#ifndef PCI_STATUS_INTERRUPT +#define PCI_STATUS_INTERRUPT 0x08 +#endif + +struct vfio_dma_map; +void vfio_dma_unmapall(struct 
vfio_listener *); +int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *); +int vfio_dma_map_common(struct vfio_listener *, unsigned int, + struct vfio_dma_map *); +int vfio_domain_set(struct vfio_dev *, int); +void vfio_domain_unset(struct vfio_dev *); + +int vfio_class_init(void); +void vfio_class_destroy(void); +int vfio_dev_add_attributes(struct vfio_dev *); +extern struct idr vfio_idr; +extern struct mutex vfio_minor_lock; +int vfio_build_config_map(struct vfio_dev *); + +irqreturn_t vfio_interrupt(int, void *); + +#endif /* __KERNEL__ */ + +/* Kernel & User level defines for ioctls */ + +/* + * Structure for DMA mapping of user buffers + * vaddr, dmaaddr, and size must all be page aligned + * buffer may only be larger than 1 page if (a) there is + * an iommu in the system, or (b) buffer is part of a huge page + */ +struct vfio_dma_map { + __u64 vaddr; /* process virtual addr */ + __u64 dmaaddr; /* desired and/or returned dma address */ + __u64 size; /* size in bytes */ + __u64 flags; /* bool: 0 for r/o; 1 for r/w */ +#define VFIO_FLAG_WRITE 0x1 /* req writeable DMA mem */ +}; + +/* map user pages at specific dma address */ +/* requires previous VFIO_DOMAIN_SET */ +#define VFIO_DMA_MAP_IOVA _IOWR(';', 101, struct vfio_dma_map) + +/* unmap user pages */ +#define VFIO_DMA_UNMAP _IOW(';', 102, struct vfio_dma_map) + +/* set device DMA mask & master status */ +#define VFIO_DMA_MASK _IOW(';', 103, __u64) + +/* request IRQ interrupts; use given eventfd */ +#define VFIO_EVENTFD_IRQ _IOW(';', 104, int) + +/* request MSI interrupts; use given eventfd */ +#define VFIO_EVENTFD_MSI _IOW(';', 105, int) + +/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */ +#define VFIO_EVENTFDS_MSIX _IOW(';', 106, int) + +/* Get length of a BAR */ +#define VFIO_BAR_LEN _IOWR(';', 107, __u32) + +/* Set the IOMMU domain - arg is fd from uiommu driver */ +#define VFIO_DOMAIN_SET _IOW(';', 108, int) + +/* Unset the IOMMU domain */ +#define VFIO_DOMAIN_UNSET 
_IO(';', 109) + +/* + * Reads, writes, and mmaps determine which PCI BAR (or config space) + * from the high level bits of the file offset + */ +#define VFIO_PCI_BAR0_RESOURCE 0x0 +#define VFIO_PCI_BAR1_RESOURCE 0x1 +#define VFIO_PCI_BAR2_RESOURCE 0x2 +#define VFIO_PCI_BAR3_RESOURCE 0x3 +#define VFIO_PCI_BAR4_RESOURCE 0x4 +#define VFIO_PCI_BAR5_RESOURCE 0x5 +#define VFIO_PCI_ROM_RESOURCE 0x6 +#define VFIO_PCI_CONFIG_RESOURCE 0xF +#define VFIO_PCI_SPACE_SHIFT 32 +#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE) + +static inline int vfio_offset_to_pci_space(__u64 off) +{ + return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF; +} + +static inline __u64 vfio_pci_space_to_offset(int sp) +{ + return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT; +} diff --git a/hw/vfio.c b/hw/vfio.c new file mode 100644 index 0000000..d9ff3d8 --- /dev/null +++ b/hw/vfio.c @@ -0,0 +1,1295 @@ +/* + * vfio based device assignment support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. 
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include <dirent.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/io.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "event_notifier.h" +#include "hw.h" +#include "memory.h" +#include "monitor.h" +#include "pc.h" +#include "qemu-error.h" +#include "vfio.h" +#include <pci/header.h> +#include <pci/types.h> +#include <linux/types.h> +#include "linux-vfio.h" + +//#define DEBUG_VFIO +#ifdef DEBUG_VFIO +#define DPRINTF(fmt, ...) \ + do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len); +/* + * Generic + */ +static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap) +{ + int id; + int max_cap = 48; + int pos = PCI_CAPABILITY_LIST; + int status; + + status = pdev->config[PCI_STATUS]; + if ((status & PCI_STATUS_CAP_LIST) == 0) { + return 0; + } + + while (max_cap--) { + pos = pdev->config[pos]; + if (pos < 0x40) { + break; + } + + pos &= ~3; + id = pdev->config[pos + PCI_CAP_LIST_ID]; + + if (id == 0xff) { + break; + } + if (id == cap) { + return pos; + } + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + const char *p = str; + int n, seg, bus, dev, func; + char field[5]; + + if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') { + return -1; + } + + seg = strtol(field, NULL, 16); + p += n + 1; + + 
if (sscanf(p, "%4[^:]%n", field, &n) != 1) { + return -1; + } + + if (p[n] == ':') { + bus = strtol(field, NULL, 16); + p += n + 1; + } else { + bus = seg; + seg = 0; + } + + if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') { + return -1; + } + + dev = strtol(field, NULL, 16); + p += n + 1; + + if (!qemu_isdigit(*p)) { + return -1; + } + + func = *p - '0'; + + ptr->seg = seg; + ptr->bus = bus; + ptr->dev = dev; + ptr->func = func; + return 0; +} + +static int print_hostaddr(DeviceState *qdev, Property *prop, + char *dest, size_t len) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + + return snprintf(dest, len, "%04x:%02x:%02x.%x", + ptr->seg, ptr->bus, ptr->dev, ptr->func); +} + +/* + * MSI-X + */ +static uint32_t msix_mmio_read(VFIODevice *vdev, + target_phys_addr_t addr, int len) +{ + unsigned int offset = addr & 0xfff; + uint32_t val = 0; + + memcpy(&val, (void *)&((uint8_t *)vdev->msix.table)[offset], len); + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x) = 0x%x\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, len, val); + return val; +} + +static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 4); +} + +static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 2); +} + +static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr) +{ + return msix_mmio_read(opaque, addr, 1); +} + +static CPUReadMemoryFunc *msix_mmio_reads[] = { + msix_mmio_readb, msix_mmio_readw, msix_mmio_readl +}; + +static void msix_mmio_write(VFIODevice *vdev, target_phys_addr_t addr, + uint32_t val, int len) +{ + unsigned int offset = addr & 0xfff; + + memcpy((void *)&((uint8_t *)vdev->msix.table)[offset], &val, len); + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%lx, 0x%x, 0x%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len); + + if ((offset & 0xf) == 0xc && 
vdev->msix.enabled) { + uint64_t off = vdev->msix.bar_offset + offset + + vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + + vdev->msix.bar); + if (pwrite(vdev->vfiofd, &val, len, off) != len) { + fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n"); + } + } +} + +static void msix_mmio_writel(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 4); +} + +static void msix_mmio_writew(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 2); +} + +static void msix_mmio_writeb(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_write(opaque, addr, val, 1); +} + +static CPUWriteMemoryFunc *msix_mmio_writes[] = { + msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel +}; + +static void vfio_msix_interrupt(void *opaque) +{ + MSIXEvent *event = opaque; + uint64_t addr; + uint32_t data; + + if (!event_notifier_test_and_clear(&event->notifier)) { + return; + } + + addr = le32_to_cpu(event->entry->upper_addr); + addr = (addr << 32) | le32_to_cpu(event->entry->addr); + data = le32_to_cpu(event->entry->data); + DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, data, addr); + stl_phys(addr, data); +} + +static void vfio_enable_msix(VFIODevice *vdev) +{ + int i, vectors, *fds; + uint64_t off = vdev->msix.bar_offset + + vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + + vdev->msix.bar); + + /* Hmm, it's probably possible for a driver to set up less than + * the full table of vectors... right? 
+ */ + for (i = 0; i < vdev->msix.table_len; i++) { + if (!vdev->msix.table[i].addr) { + break; + } + } + + vectors = i; + if (!vectors) { + fprintf(stderr, "vfio: Error: no MSIX vectors enabled\n"); + return; + } + + vdev->msix.events = qemu_mallocz(vectors * sizeof(MSIXEvent)); + vdev->msix.num_events = vectors; + fds = qemu_malloc((vectors + 1) * sizeof(int)); + fds[0] = vectors; + + for (i = 0; i < vectors; i++) { + vdev->msix.events[i].entry = &vdev->msix.table[i]; + if (event_notifier_init(&vdev->msix.events[i].notifier, 0)) + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + + fds[i + 1] = event_notifier_get_fd(&vdev->msix.events[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL, + &vdev->msix.events[i]); + } + + if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) { + fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + qemu_free(fds); + + for (i = 0; i < vectors; i++) { + MSIXTableEntry *te = &vdev->msix.table[i]; + if (pwrite(vdev->vfiofd, &te->ctrl, sizeof(te->ctrl), + off + (i * sizeof(MSIXTableEntry)) + + offsetof(MSIXTableEntry, ctrl)) != sizeof(te->ctrl)) { + fprintf(stderr, "vfio: Error: Failed to update MSIX table ctrl\n"); + } + } + vdev->msix.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +static void vfio_disable_msix(VFIODevice *vdev) +{ + uint32_t vectors = 0; + int i; + + if (!vdev->msix.enabled) { + return; + } + + for (i = 0; i < vdev->msix.num_events; i++) { + int fd = event_notifier_get_fd(&vdev->msix.events[i].notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msix.events[i].notifier); + } + + ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors); + qemu_free(vdev->msix.events); + vdev->msix.events = NULL; + vdev->msix.num_events = 0; + vdev->msix.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, 
vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * MSI + */ +static void vfio_msi_interrupt(void *opaque) +{ + MSIEvent *event = opaque; + uint64_t addr; + uint32_t data; + + if (!event_notifier_test_and_clear(&event->notifier)) { + return; + } + + if (event->upper_addr) { + addr = pci_get_long(event->upper_addr); + addr = (addr << 32) | pci_get_long(event->addr); + } else { + addr = pci_get_long(event->addr); + } + data = pci_get_word(event->data); + DPRINTF("%s: 0x%x -> 0x%lx\n", __FUNCTION__, data, addr); + stl_phys(addr, data); +} + +static void vfio_enable_msi(VFIODevice *vdev) +{ + int i, vectors, *fds; + uint16_t ctrl = vfio_pci_read_config(&vdev->pdev, + vdev->msi.pos + PCI_MSI_FLAGS, + sizeof(ctrl)); + ctrl = le32_to_cpu(ctrl); + vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4); + + if (vectors > 32) { + fprintf(stderr, "vfio: Error: Invalid configured MSI vectors %d\n", + vectors); + return; + } + + vdev->msi.events = qemu_mallocz(vectors * sizeof(MSIEvent)); + vdev->msi.num_events = vectors; + fds = qemu_malloc((vectors + 1) * sizeof(int)); + fds[0] = vectors; + + for (i = 0; i < vectors; i++) { + vdev->msi.events[i].addr = vdev->pdev.config + + vdev->msi.pos + PCI_MSI_ADDRESS_LO; + if (ctrl & PCI_MSI_FLAGS_64BIT) { + vdev->msi.events[i].upper_addr = vdev->pdev.config + + vdev->msi.pos + + PCI_MSI_ADDRESS_HI; + vdev->msi.events[i].data = vdev->pdev.config + + vdev->msi.pos + PCI_MSI_DATA_64; + } else { + vdev->msi.events[i].upper_addr = NULL; + vdev->msi.events[i].data = vdev->pdev.config + + vdev->msi.pos + PCI_MSI_DATA_32; + } + + if (event_notifier_init(&vdev->msi.events[i].notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + } + fds[i + 1] = event_notifier_get_fd(&vdev->msi.events[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL, + &vdev->msi.events[i]); + } + + /* FIXME: current vfio only supports 1 MSI */ + if (vectors > 1) { + fprintf(stderr, "vfio: Error: only 
support 1 MSI vector, want %d\n", + vectors); + abort(); + } + if (ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &fds[1])) { + fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + qemu_free(fds); + vdev->msi.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +static void vfio_disable_msi(VFIODevice *vdev) +{ + uint32_t vectors = -1; + int i; + + if (!vdev->msi.enabled) { + return; + } + + for (i = 0; i < vdev->msi.num_events; i++) { + int fd = event_notifier_get_fd(&vdev->msi.events[i].notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msi.events[i].notifier); + } + + ioctl(vdev->vfiofd, VFIO_EVENTFD_MSI, &vectors); + qemu_free(vdev->msi.events); + vdev->msi.events = NULL; + vdev->msi.num_events = 0; + vdev->msi.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * INTx + */ +static void vfio_unmask_intx(VFIODevice *vdev) +{ + uint16_t cmd; + + cmd = vfio_pci_read_config(&vdev->pdev, PCI_COMMAND, sizeof(cmd)); + cmd = le16_to_cpu(cmd); + cmd &= ~PCI_COMMAND_INTX_DISABLE; + cmd = cpu_to_le16(cmd); + vfio_pci_write_config(&vdev->pdev, PCI_COMMAND, cmd, sizeof(cmd)); +} + +static void vfio_intx_interrupt(void *opaque) +{ + VFIODevice *vdev = opaque; + + if (!event_notifier_test_and_clear(&vdev->intx.notifier)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, 'A' + vdev->intx.pin); + + vdev->intx.pending = 1; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1); +} + +static void vfio_eoi(ioapic_eoi_client *client) +{ + VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client); + + if (!vdev->intx.pending) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", + __FUNCTION__, 
vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); + + vdev->intx.pending = 0; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); + vfio_unmask_intx(vdev); +} + +static int vfio_enable_intx(VFIODevice *vdev) +{ + int fd; + + if (!(vdev->intx.pin = vfio_pci_read_config(&vdev->pdev, + PCI_INTERRUPT_PIN, 1))) { + return 0; + } + + vdev->intx.pin--; /* Pin A (1) -> irq[0] */ + vdev->intx.eoi_client.eoi = vfio_eoi; + vdev->intx.eoi_client.irq = pci_get_byte(vdev->pdev.config + + PCI_INTERRUPT_LINE); + ioapic_register_eoi_client(&vdev->intx.eoi_client); + + if (event_notifier_init(&vdev->intx.notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + return -1; + } + + fd = event_notifier_get_fd(&vdev->intx.notifier); + qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); + if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) { + fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n", + strerror(errno)); + return -1; + } + vfio_unmask_intx(vdev); + vdev->intx.enabled = 1; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); + + return 0; +} + +static void vfio_disable_intx(VFIODevice *vdev) +{ + int fd; + + if (!vdev->intx.enabled) { + return; + } + + ioapic_unregister_eoi_client(&vdev->intx.eoi_client); + fd = event_notifier_get_fd(&vdev->intx.notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->intx.notifier); + fd = -1; + ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd); + vdev->intx.enabled = 0; + DPRINTF("%s(%04x:%02x:%02x.%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func); +} + +/* + * IO Port/MMIO + */ +static void vfio_resource_write(PCIResource *res, uint32_t addr, + uint32_t val, int len) +{ + size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); + + if (pwrite(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: 
%s\n", + __FUNCTION__, addr, val, len, strerror(errno)); + } + DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n", + __FUNCTION__, res->bar, addr, val, len); +} + +static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 1); +} + +static void vfio_resource_writew(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 2); +} + +static void vfio_resource_writel(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 4); +} + +static CPUWriteMemoryFunc * const vfio_resource_writes[] = { + &vfio_resource_writeb, + &vfio_resource_writew, + &vfio_resource_writel +}; + +static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 1); +} + +static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 2); +} + +static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 4); +} + +static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len) +{ + size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); + uint32_t val; + + if (pread(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, %d) failed: %s\n", + __FUNCTION__, addr, len, strerror(errno)); + return 0xffffffffU; + } + DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n", + __FUNCTION__, res->bar, addr, len, val); + return val; +} + +static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 1) & 0xff; +} + +static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 2) & 0xffff; +} + +static uint32_t vfio_resource_readl(void 
*opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 4); +} + +static CPUReadMemoryFunc * const vfio_resource_reads[] = { + &vfio_resource_readb, + &vfio_resource_readw, + &vfio_resource_readl +}; + +static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 1) & 0xff; +} + +static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 2) & 0xffff; +} + +static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 4); +} + +static void vfio_ioport_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, res); + register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, res); + register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, res); + register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, res); + register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, res); + register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, res); +} + +static void vfio_iomem_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + if (e_size == 
0) { + return; + } + + if (e_size != res->size) { + fprintf(stderr, "vfio: Error: partial BAR map?\n"); + abort(); + } + + if (res->msix) { + if (res->msix_offset > 0) { + cpu_register_physical_memory(e_phys, res->msix_offset, + res->memory_index[0]); + } + + DPRINTF("Overlaying MSI-X table page\n"); + cpu_register_physical_memory(e_phys + res->msix_offset, + TARGET_PAGE_SIZE, vdev->msix.index); + + if (res->size > res->msix_offset + 0x1000) { + cpu_register_physical_memory(e_phys + res->msix_offset + 0x1000, + res->size - res->msix_offset - 0x1000, + res->memory_index[1]); + } + } else { + if (!res->slow) { + cpu_register_physical_memory(e_phys, e_size, res->memory_index[0]); + } else { + cpu_register_physical_memory(e_phys, e_size, res->io_mem); + } + } +} + +/* + * PCI config space + */ +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint32_t val = 0; + + if (pread(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, addr, len, strerror(errno)); + return -1; + } + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, addr, len, val); + return pci_default_read_config(pdev, addr, len); +} + +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + if (pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len, strerror(errno)); + } + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + 
vdev->host.func, addr, val, len); + + if (vdev->msix.pos && (addr == vdev->msix.pos + PCI_MSIX_FLAGS)) { + if (vdev->msix.enabled) { + if (!(val & PCI_MSIX_FLAGS_ENABLE)) { + vfio_disable_msix(vdev); + } + } else { + if (val & PCI_MSIX_FLAGS_ENABLE) { + vfio_enable_msix(vdev); + } + } + } + + if (vdev->msi.pos && (addr == vdev->msi.pos + PCI_MSI_FLAGS)) { + if (vdev->msi.enabled) { + if (!(val & PCI_MSI_FLAGS_ENABLE)) { + vfio_disable_msi(vdev); + } + } else { + if (val & PCI_MSI_FLAGS_ENABLE) { + vfio_enable_msi(vdev); + } + } + } + + if (addr == PCI_INTERRUPT_LINE) { + if (len != 1) { + fprintf(stderr, "vfio: fixme: INTERRUPT_LINE written as %d bytes\n", + len); + } + vdev->intx.eoi_client.irq = val; + } + + pci_default_write_config(pdev, addr, val, len); +} + +/* + * DMA + */ +static int vfio_do_map_iommu(VFIODevice *vdev, int map) +{ + QemuRamSlot *slot; + + QLIST_FOREACH(slot, &ram_slots.slots, next) { + struct vfio_dma_map dma_map; + + dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(slot->offset); + dma_map.dmaaddr = slot->start_addr; + dma_map.size = slot->size; + dma_map.flags = VFIO_FLAG_WRITE; + + if (map) { + if (ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map)) + return -1; + } else { + ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map); + } + } + return 0; +} + +static int vfio_map_iommu(VFIODevice *vdev) +{ + return vfio_do_map_iommu(vdev, 1); +} + +static void vfio_unmap_iommu(VFIODevice *vdev) +{ + vfio_do_map_iommu(vdev, 0); +} + +/* + * Interrupt setup + */ +static int vfio_setup_msi(VFIODevice *vdev) +{ + int pos; + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) { + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @%d\n", vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, pos); + vdev->msi.pos = pos; + } + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) { + uint16_t ctrl; + uint32_t table, pba, len; + + if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl), + VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + 
return -1; + } + + if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF + + pos + PCI_MSIX_TABLE) != sizeof(table)) { + return -1; + } + + if (pread(vdev->vfiofd, &pba, sizeof(pba), + VFIO_PCI_CONFIG_OFF + pos + PCI_MSIX_PBA) != sizeof(pba)) { + return -1; + } + + ctrl = le16_to_cpu(ctrl); + table = le32_to_cpu(table); + pba = le32_to_cpu(pba); + + vdev->msix.pos = pos; + vdev->msix.table_len = (ctrl & PCI_MSIX_TABSIZE) + 1; + vdev->msix.bar = table & PCI_MSIX_BIR; + vdev->msix.bar_offset = table & ~PCI_MSIX_BIR; + vdev->resources[vdev->msix.bar].msix = 1; + vdev->resources[vdev->msix.bar].msix_offset = vdev->msix.bar_offset; + + DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @%d, BAR %d, offset 0x%x\n", + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, pos, vdev->msix.bar, vdev->msix.bar_offset); + + if ((pba & PCI_MSIX_BIR) == vdev->msix.bar && + ((pba & ~0xfff) == vdev->msix.bar_offset)) { + fprintf(stderr, "vfio: Error: MSIX Table & PBA reside in the same " + "page, not yet supported\n"); + return -1; + } + + /* + * Check if the BAR containing the MSIX table is 4k aligned, if + * so we can avoid slow mapping messiness. This shouldn't fail + * for devices that follow the spec recommendations for sizing + * and placement. 
*/ + len = vdev->msix.bar; + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n"); + return -1; + } + if (!len || len & 0xfff) { + fprintf(stderr, "vfio: MSIX BAR not 4k aligned\n"); + return -1; + } + + vdev->msix.table = mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); + if (vdev->msix.table == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to allocate MSIX table page\n"); + return -1; + } + + memset(vdev->msix.table, 0, 0x1000); + vdev->msix.index = cpu_register_io_memory(msix_mmio_reads, + msix_mmio_writes, vdev); + } + return 0; +} + +static void vfio_teardown_msi(VFIODevice *vdev) +{ + if (vdev->msix.table) { + munmap(vdev->msix.table, 0x1000); + } + if (vdev->msix.index) { + cpu_unregister_io_memory(vdev->msix.index); + } +} + +/* + * Resource setup + */ +static int vfio_setup_resources(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + uint32_t len, bar; + PCIResource *res; + uint8_t offset; + int ret, space; + + res = &vdev->resources[i]; + res->vfiofd = vdev->vfiofd; + res->bar = len = i; + + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i); + return -1; + } + if (!len) { + continue; + } + + offset = PCI_BASE_ADDRESS_0 + (4 * i); + ret = pread(vdev->vfiofd, &bar, sizeof(bar), + VFIO_PCI_CONFIG_OFF + offset); + if (ret != sizeof(bar)) { + fprintf(stderr, "vfio: Failed to read BAR %d\n", i); + return -1; + } + bar = le32_to_cpu(bar); + space = bar & PCI_BASE_ADDRESS_SPACE; + + if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) { + int off = VFIO_PCI_BAR0_RESOURCE + i; + int flags = PROT_READ; + char name[32]; + + res->mem = 1; + res->size = len; + + if (i != PCI_ROM_SLOT) { + flags |= PROT_WRITE; + } + + if (vdev->pdev.qdev.info->vmsd) { + snprintf(name, sizeof(name), "%s.bar%d", + vdev->pdev.qdev.info->vmsd->name, i); + } else { + snprintf(name, sizeof(name), "%s.bar%d", + 
vdev->pdev.qdev.info->name, i); + } + + if (res->msix) { + if (res->msix_offset) { + char *c = &name[strlen(name)]; + + res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags, + MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".0", sizeof(name)); + res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, + name, res->msix_offset, + res->r_virtbase[0]); + *c = 0; + } + if (len > res->msix_offset + 0x1000) { + char *c = &name[strlen(name)]; + + res->r_virtbase[1] = mmap(NULL, + len - res->msix_offset - 0x1000, + flags, MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off) + + res->msix_offset + 0x1000); + + if (res->r_virtbase[1] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".1", sizeof(name)); + res->memory_index[1] = qemu_ram_map(&vdev->pdev.qdev, name, + len - 0x1000 - + res->msix_offset, + res->r_virtbase[1]); + *c = 0; + } + } else { + res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED, + vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + res->memory_index[0] = qemu_ram_map(&vdev->pdev.qdev, name, + len, res->r_virtbase[0]); + if (i == PCI_ROM_SLOT) { + res->memory_index[0] |= IO_MEM_ROM; + } + } + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? 
+ PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) { + i++; + } + } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) { + res->mem = 1; + res->size = len; + res->slow = 1; + + DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, i); + + if (i == PCI_ROM_SLOT) { + res->io_mem = cpu_register_io_memory(vfio_resource_reads, + NULL, res); + } else { + res->io_mem = cpu_register_io_memory(vfio_resource_reads, + vfio_resource_writes, res); + } + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? + PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + } else if (space == PCI_BASE_ADDRESS_SPACE_IO) { + res->size = len; + pci_register_bar(&vdev->pdev, i, res->size, + PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map); + } + res->valid = 1; + } + return 0; +} + +static void vfio_unmap_resources(VFIODevice *vdev) +{ + int i; + PCIResource *res = vdev->resources; + + for (i = 0; i < PCI_NUM_REGIONS; i++, res++) { + if (res->valid && res->mem) { + if (res->msix) { + if (res->msix_offset) { + cpu_register_physical_memory(res->e_phys, res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[0]); + munmap(res->r_virtbase[0], res->msix_offset); + } + if (res->size > res->msix_offset + 0x1000) { + cpu_register_physical_memory(res->e_phys + 0x1000 + + res->msix_offset, + res->e_size - 0x1000 - + res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[1]); + munmap(res->r_virtbase[1], + res->size - 0x1000 - res->msix_offset); + } + } else { + if (!res->slow) { + cpu_register_physical_memory(res->e_phys, res->e_size, + IO_MEM_UNASSIGNED); + qemu_ram_unmap(res->memory_index[0]); + munmap(res->r_virtbase[0], res->size); + } else { + cpu_unregister_io_memory(res->io_mem); + } + } + } + } +} + +/* + * General setup + */ 
+static int get_vfio_fd(VFIODevice *vdev) +{ + if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) { + if (qemu_isdigit(vdev->vfiofd_name[0])) { + vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0); + return 0; + } else { + vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name); + if (vdev->vfiofd < 0) { + fprintf(stderr, "%s: (%s) unknown\n", __func__, + vdev->vfiofd_name); + return -1; + } + return 0; + } + } else { + char vfio_dir[64], vfio_dev[16]; + DIR *dir; + struct dirent *de; + + sprintf(vfio_dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/", + vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + dir = opendir(vfio_dir); + if (!dir) { + error_report("vfio: error: Driver not attached\n"); + return -1; + } + + while ((de = readdir(dir))) { + if (de->d_name[0] == '.') + continue; + if (!strncmp(de->d_name, "vfio", 4)) + break; + } + + if (!de) { + error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir); + return -1; + } + + sprintf(vfio_dev, "/dev/%s", de->d_name); + vdev->vfiofd = open(vfio_dev, O_RDWR); + if (vdev->vfiofd < 0) { + error_report("pci-assign: vfio: Failed to open %s: %s\n", + vfio_dev, strerror(errno)); + return -1; + } + return 0; + } +} + +static int get_uiommu_fd(VFIODevice *vdev) +{ + if (vdev->uiommufd_name && strlen(vdev->uiommufd_name) > 0) { + if (qemu_isdigit(vdev->uiommufd_name[0])) { + vdev->uiommufd = strtol(vdev->uiommufd_name, NULL, 0); + return 0; + } else { + vdev->uiommufd = monitor_get_fd(cur_mon, vdev->uiommufd_name); + if (vdev->uiommufd < 0) { + fprintf(stderr, "%s: (%s) unknown\n", __func__, + vdev->uiommufd_name); + return -1; + } + return 0; + } + } else { + vdev->uiommufd = open("/dev/uiommu", O_RDONLY); + if (vdev->uiommufd < 0) { + return -1; + } + vdev->uiommufd_name = NULL; /* easier test later */ + return 0; + } +} + +static int vfio_initfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + char sys[64]; + struct stat st; + int ret; + + /* Check 
that the host device exists */ + sprintf(sys, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", + vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func); + if (stat(sys, &st) < 0) { + error_report("vfio: error: no such host device " + "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + return -1; + } + + if (get_uiommu_fd(vdev)) + return -1; + + if (get_vfio_fd(vdev)) + goto out_close_uiommu; + + if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) + goto out_close_vfiofd; + + /* Get a copy of config space */ + ret = pread(vdev->vfiofd, vdev->pdev.config, + pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF); + if (ret < pci_config_size(&vdev->pdev)) { + fprintf(stderr, "vfio: Failed to read device config space\n"); + goto out_unset_domain; + } + + if (vfio_setup_msi(vdev)) + goto out_unset_domain; + + if (vfio_setup_resources(vdev)) + goto out_disable_msix; + + if (vfio_map_iommu(vdev)) + goto out_unmap_resources; + + if (vfio_enable_intx(vdev)) + goto out_unmap_iommu; + + return 0; + +out_unmap_iommu: + vfio_unmap_iommu(vdev); +out_unmap_resources: + vfio_unmap_resources(vdev); +out_disable_msix: + vfio_teardown_msi(vdev); +out_unset_domain: + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); +out_close_vfiofd: + close(vdev->vfiofd); +out_close_uiommu: + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return -1; +} + +static int vfio_exitfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + vfio_disable_intx(vdev); + vfio_disable_msi(vdev); + vfio_disable_msix(vdev); + vfio_unmap_iommu(vdev); + vfio_unmap_resources(vdev); + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); + close(vdev->vfiofd); + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return 0; +} + +static PropertyInfo qdev_prop_hostaddr = { + .name = "pci-hostaddr", + .type = -1, + .size = sizeof(PCIHostDevice), + .parse = parse_hostaddr, + .print = print_hostaddr, +}; + +static PCIDeviceInfo vfio_info = { + .qdev.name = 
"vfio", + .qdev.desc = "pass through host pci devices to the guest via vfio", + .qdev.size = sizeof(VFIODevice), + .init = vfio_initfn, + .exit = vfio_exitfn, + .config_read = vfio_pci_read_config, + .config_write = vfio_pci_write_config, + .qdev.props = (Property[]) { + DEFINE_PROP("host", VFIODevice, host, + qdev_prop_hostaddr, PCIHostDevice), + DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), + DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name), + DEFINE_PROP_END_OF_LIST(), + }, +}; + +static void vfio_register_devices(void) +{ + pci_qdev_register(&vfio_info); +} + +device_init(vfio_register_devices) diff --git a/hw/vfio.h b/hw/vfio.h new file mode 100644 index 0000000..9d05ae1 --- /dev/null +++ b/hw/vfio.h @@ -0,0 +1,90 @@ +#ifndef __VFIO_H__ +#define __VFIO_H__ + +#include "qemu-common.h" +#include "qemu-queue.h" +#include "pci.h" + +typedef struct PCIHostDevice { + uint16_t seg; + uint8_t bus; + uint8_t dev:5; + uint8_t func:3; +} PCIHostDevice; + +typedef struct PCIResource { + uint8_t valid:1; + uint8_t mem:1; + uint8_t msix:1; + uint8_t bar:3; /* see vfio_resource_read/write */ + uint8_t slow:1; /* use read/write rather than mmap */ + uint64_t size; + ram_addr_t memory_index[2]; /* cpu_register_physical_memory() index */ + void *r_virtbase[2]; /* mmapped address */ + int io_mem; /* cpu_register_io_memory index */ + pcibus_t e_phys; /* emulated base address */ + pcibus_t e_size; /* emulated size of region in bytes */ + uint32_t msix_offset; + int vfiofd; /* see vfio_resource_read/write */ +} PCIResource; + +typedef struct INTx { + uint8_t enabled:1; + uint8_t pending:1; + uint8_t pin:3; + EventNotifier notifier; + ioapic_eoi_client eoi_client; +} INTx; + +typedef struct MSIEvent { + EventNotifier notifier; + uint8_t *addr; + uint8_t *upper_addr; + uint8_t *data; +} MSIEvent; + +typedef struct MSI { + uint8_t enabled:1; + uint8_t pos; + int num_events; + MSIEvent *events; +} MSI; + +typedef struct __attribute__((packed)) MSIXTableEntry { + 
uint32_t addr; + uint32_t upper_addr; + uint32_t data; + uint32_t ctrl; +} MSIXTableEntry; + +typedef struct MSIXEvent { + EventNotifier notifier; + MSIXTableEntry *entry; +} MSIXEvent; + +typedef struct MSIX { + uint8_t enabled:1; + uint8_t bar:3; + uint8_t pos; + uint16_t table_len; + uint32_t bar_offset; + MSIXTableEntry *table; + int num_events; + MSIXEvent *events; + int index; +} MSIX; + +typedef struct VFIODevice { + PCIDevice pdev; + PCIHostDevice host; + PCIResource resources[PCI_NUM_REGIONS]; + INTx intx; + MSI msi; + MSIX msix; + int vfiofd; + int uiommufd; + char *vfiofd_name; + char *uiommufd_name; +} VFIODevice; + +#endif /* __VFIO_H__ */ ^ permalink raw reply related [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 5/5] VFIO based device assignment 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment Alex Williamson @ 2010-07-11 18:27 ` Avi Kivity 2010-07-11 19:38 ` Alex Williamson 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:27 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:09 PM, Alex Williamson wrote: > This patch adds qemu device assignment support using the proposed > VFIO/UIOMMU kernel interfaces. The existing KVM-only device assignment > code makes use of various pci sysfs files for config space, MMIO BAR > mapping, and misc other config items. It then jumps over to KVM-specific > ioctls for enabling interrupts and assigning devices to IOMMU domains. > Finally, IO-port support uses in/out directly. This is a messy model > to support and causes numerous issues when we try to allow unprivileged > users to access PCI devices. > > VFIO/UIOMMU reduces this to two interfaces, /dev/vfioX and /dev/uiommu. > The VFIO device file provides all the necessary support for accessing > PCI config space, read/write/mmap BARs (including IO-port space), > configuring INTx/MSI/MSI-X interrupts and setting up DMA mapping. The > UIOMMU interface allows iommu domains to be created, and via vfio, > devices can be bound to a domain. This provides an easier model to > support (IMHO) and removes the bindings that make current device > assignment only usable for KVM enabled guests. > > Usage is similar to KVM device assignment. Rather than binding the > device to the pci-stub driver, vfio devices need to be bound to the > vfio driver. From there, it's a simple matter of specifying the > device as: > > -device vfio,host=01:00.0 > > This example requires either root privileges or proper permissions on > /dev/uiommu and /dev/vfioX. To support unprivileged operation, the > options vfiofd= and uiommufd= are available. 
Depending on the usage > of uiommufd, each guest device can be assigned to the same iommu > domain, or to independent iommu domains. In the example above, each > device is assigned to a separate iommu domain. > > As VFIO has no KVM dependencies, this patch works with or without > -enable-kvm. I have successfully used a couple assigned devices in a > guest without KVM support, however Michael Tsirkin warns that tcg > may not provide atomic operations to memory visible to the passthrough > device, which could result in failures for devices depending on such > for synchronization. > > This patch is functional, but hasn't seen a lot of testing. I've > tested 82576 PFs and VFs, an Intel HDA audio device, and UHCI and EHCI > USB devices (this actually includes INTx/MSI/MSI-X, 4k aligned MMIO > BARs, non-4k aligned MMIO BARs, and IO-Port BARs). > > Good stuff. I presume the iommu interface is responsible for page pinning. What about page attributes? There are two cases: - snoop capable iommu - can use write-backed RAM, but need to enable snoop. BARs still need to respect page attributes. - older mmu - need to respect guest memory type; probably cannot be done without kvm. If the guest maps a BAR or RAM using write-combine memory type, can we reflect that? This may provide a considerable performance benefit. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
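The bind-and-launch flow quoted above can be sketched as a shell sequence. This is a hedged illustration: the sysfs `unbind`/`new_id` files are the generic PCI driver-binding interface, but the driver name (`vfio`) and the example vendor/device IDs are assumptions taken from the patch description, and the function below only prints the commands (a dry run) rather than executing them:

```shell
# Dry-run sketch: print the commands that would rebind a PCI device
# (BDF like 0000:01:00.0, plus its vendor/device IDs) to the proposed
# vfio host driver.
vfio_bind_cmds() {
    bdf="$1" vendor="$2" device="$3"
    # Release the device from whatever host driver currently owns it
    echo "echo $bdf > /sys/bus/pci/devices/$bdf/driver/unbind"
    # Teach the vfio driver about this vendor/device pair so it binds
    echo "echo $vendor $device > /sys/bus/pci/drivers/vfio/new_id"
}

# Example IDs (8086:10c9 is an assumption, an 82576 VF-style device)
vfio_bind_cmds 0000:01:00.0 8086 10c9
# After binding, the guest invocation from the patch description:
echo "qemu-system-x86_64 ... -device vfio,host=01:00.0"
```

Run as root (or with permissions on the sysfs files and /dev/vfioX, as the patch notes) once the echoed commands look right for the target machine.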
* [Qemu-devel] Re: [RFC PATCH 5/5] VFIO based device assignment 2010-07-11 18:27 ` [Qemu-devel] " Avi Kivity @ 2010-07-11 19:38 ` Alex Williamson 2010-07-12 6:37 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 19:38 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, mst, qemu-devel, kvm, pugs On Sun, 2010-07-11 at 21:27 +0300, Avi Kivity wrote: > On 07/11/2010 09:09 PM, Alex Williamson wrote: > > This patch adds qemu device assignment support using the proposed > > VFIO/UIOMMU kernel interfaces. The existing KVM-only device assignment > > code makes use of various pci sysfs files for config space, MMIO BAR > > mapping, and misc other config items. It then jumps over to KVM-specific > > ioctls for enabling interrupts and assigning devices to IOMMU domains. > > Finally, IO-port support uses in/out directly. This is a messy model > > to support and causes numerous issues when we try to allow unprivileged > > users to access PCI devices. > > > > VFIO/UIOMMU reduces this to two interfaces, /dev/vfioX and /dev/uiommu. > > The VFIO device file provides all the necessary support for accessing > > PCI config space, read/write/mmap BARs (including IO-port space), > > configuring INTx/MSI/MSI-X interupts and setting up DMA mapping. The > > UIOMMU interface allows iommu domains to be created, and via vfio, > > devices can be bound to a domain. This provides an easier model to > > support (IMHO) and removes the bindings that make current device > > assignment only useable for KVM enabled guests. > > > > Usage is similar to KVM device assignment. Rather than binding the > > device to the pci-stub driver, vfio devices need to be bound to the > > vfio driver. From there, it's a simple matter of specifying the > > device as: > > > > -device vfio,host=01:00.0 > > > > This example requires either root privileges or proper permissions on > > /dev/uiommu and /dev/vfioX. 
To support unprivileged operation, the > > options vfiofd= and uiommufd= are available. Depending on the usage > > of uiommufd, each guest device can be assigned to the same iommu > > domain, or to independent iommu domains. In the example above, each > > device is assigned to a separate iommu domain. > > > > As VFIO has no KVM dependencies, this patch works with or without > > -enable-kvm. I have successfully used a couple assigned devices in a > > guest without KVM support, however Michael Tsirkin warns that tcg > > may not provide atomic operations to memory visible to the passthrough > > device, which could result in failures for devices depending on such > > for synchronization. > > > > This patch is functional, but hasn't seen a lot of testing. I've > > tested 82576 PFs and VFs, an Intel HDA audio device, and UHCI and EHCI > > USB devices (this actually includes INTx/MSI/MSI-X, 4k aligned MMIO > > BARs, non-4k aligned MMIO BARs, and IO-Port BARs). > > > > > > Good stuff. > > I presume the iommu interface is responsible for page pinning. Yes, when we do the VFIO_DMA_MAP_IOVA, the vfio driver registers each page, which seems to handle the pinning. > What > about page attributes? > > There are two cases: > > - snoop capable iommu - can use write-backed RAM, but need to enable > snoop. BARs still need to respect page attributes. > - older mmu - need to respect guest memory type; probably cannot be done > without kvm. > > If the guest maps a BAR or RAM using write-combine memory type, can we > reflect that? This may provide a considerable performance benefit. Do we do anything about this today in kvm device assignment? Maybe it's buried in the kernel side bits and I've missed it. I would expect that WC mappings in the guest carry through to host virtual mappings, but maybe we can only do that with kvm. The processor side mappings are independent of the iommu mappings since devices don't care about such things. 
Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 5/5] VFIO based device assignment 2010-07-11 19:38 ` Alex Williamson @ 2010-07-12 6:37 ` Avi Kivity 0 siblings, 0 replies; 32+ messages in thread From: Avi Kivity @ 2010-07-12 6:37 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 10:38 PM, Alex Williamson wrote: > >> What >> about page attributes? >> >> There are two cases: >> >> - snoop capable iommu - can use write-backed RAM, but need to enable >> snoop. BARs still need to respect page attributes. >> - older mmu - need to respect guest memory type; probably cannot be done >> without kvm. >> >> If the guest maps a BAR or RAM using write-combine memory type, can we >> reflect that? This may provide a considerable performance benefit. >> > Do we do anything about this today in kvm device assignment? Maybe it's > buried in the kernel side bits and I've missed it. I would expect that > WC mappings in the guest carry through to host virtual mappings, but > maybe we can only do that with kvm. Yes, see arch/x86/kvm/mmu.c, set_spte() calling ->get_mt_mask(). Strangely, it's qualified with tdp. Perhaps because of all of the scary errata regarding mismatching memory types for a page. > The processor side mappings are > independent of the iommu mappings since devices don't care about such > things. Thanks, > Yeah. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson ` (4 preceding siblings ...) 2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment Alex Williamson @ 2010-07-11 18:17 ` Avi Kivity 2010-07-11 18:37 ` Alex Williamson 5 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:17 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:09 PM, Alex Williamson wrote: > The following series implements QEMU device assignment using the > proposed VFIO/UIOMMU kernel interfaces. See the last patch for > further vfio description. I've tested this on the v2 VFIO patch, > with a number of fixes hacked in along the way. I'll update when > Tom releases a new version of VFIO. Hopefully this will provide > some support for the usefulness of such an interface. Thanks, > > What's the plan for supporting this alongside the existing kvm device assignment code? vfio will only exist in very new kernels, so we have to support the old code for a while to give people a chance to adjust (say 12-18 months). Ideally we'd have compatible command line syntax with qemu choosing vfio if available and falling back to kvm device assignment if not. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-11 18:17 ` [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO " Avi Kivity @ 2010-07-11 18:37 ` Alex Williamson 2010-07-11 18:43 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 18:37 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, mst, qemu-devel, kvm, pugs On Sun, 2010-07-11 at 21:17 +0300, Avi Kivity wrote: > On 07/11/2010 09:09 PM, Alex Williamson wrote: > > The following series implements QEMU device assignment using the > > proposed VFIO/UIOMMU kernel interfaces. See the last patch for > > further vfio description. I've tested this on the v2 VFIO patch, > > with a number of fixes hacked in along the way. I'll update when > > Tom releases a new version of VFIO. Hopefully this will provide > > some support for the usefulness of such an interface. Thanks, > > > > > > What's the plan for supporting this alongside the existing kvm device > assignment code? > > vfio will only exist in very new kernels, so we have to support the old > code for a while to give people a chance to adjust (say 12-18 months). I was thinking that vfio device assignment might be the qemu acceptable version of device assignment, and we can let kvm style device assignment live out its remaining time in the qemu-kvm tree, before it gets deprecated. > Ideally we'd have compatible command line syntax with qemu choosing vfio > if available and falling back to kvm device assignment if not. Ideally, yes, but I'm not sure how feasible that is. For the command line root user, the syntax is nearly the same (s/pci-assign/vfio/), but once we start trying to do libvirt based assignment, passing vfiofd & uiommufd, it will need to know the difference anyway. Thanks, Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-11 18:37 ` Alex Williamson @ 2010-07-11 18:43 ` Avi Kivity 2010-07-11 20:24 ` Alex Williamson 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-11 18:43 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 09:37 PM, Alex Williamson wrote: > On Sun, 2010-07-11 at 21:17 +0300, Avi Kivity wrote: > >> On 07/11/2010 09:09 PM, Alex Williamson wrote: >> >>> The following series implements QEMU device assignment using the >>> proposed VFIO/UIOMMU kernel interfaces. See the last patch for >>> further vfio description. I've tested this on the v2 VFIO patch, >>> with a number of fixes hacked in along the way. I'll update when >>> Tom releases a new version of VFIO. Hopefully this will provide >>> some support for the usefulness of such an interfaces. Thanks, >>> >>> >>> >> What's the plan for supporting this alongside the existing kvm device >> assignment code? >> >> vfio will only exist in very new kernels, so we have to support the old >> code for a while to give people chance to adjust (say 12-18 months). >> > I was thinking that vfio device assignment might be the qemu acceptable > version of device assignment, and we can let kvm style device assignment > live out it's remaining time in the qemu-kvm tree, before it gets > deprecated. > Definitely, the effort to make qemu-kvm device assignment mergable probably isn't worth it. >> Ideally we'd have compatible command line syntax with qemu choosing vfio >> if available and falling back to kvm device assignment if not. >> > Ideally, yes, but I'm not sure how how feasible that is. For the > command line root user, the syntax is nearly same (s/pci-assign/vfio/), > but once we start trying to do libvirt based assignment, passing vfiofd > & uiommufd, it will need to know the difference anyway. Thanks, > I see. 
Well, we can probably live with two separate invocations, users who assign devices should be savvy or use libvirt. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-11 18:43 ` Avi Kivity @ 2010-07-11 20:24 ` Alex Williamson 2010-07-12 6:29 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Alex Williamson @ 2010-07-11 20:24 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, mst, qemu-devel, kvm, pugs On Sun, 2010-07-11 at 21:43 +0300, Avi Kivity wrote: > On 07/11/2010 09:37 PM, Alex Williamson wrote: > > On Sun, 2010-07-11 at 21:17 +0300, Avi Kivity wrote: > > > >> On 07/11/2010 09:09 PM, Alex Williamson wrote: > >> > >>> The following series implements QEMU device assignment using the > >>> proposed VFIO/UIOMMU kernel interfaces. See the last patch for > >>> further vfio description. I've tested this on the v2 VFIO patch, > >>> with a number of fixes hacked in along the way. I'll update when > >>> Tom releases a new version of VFIO. Hopefully this will provide > >>> some support for the usefulness of such an interfaces. Thanks, > >>> > >>> > >>> > >> What's the plan for supporting this alongside the existing kvm device > >> assignment code? > >> > >> vfio will only exist in very new kernels, so we have to support the old > >> code for a while to give people chance to adjust (say 12-18 months). > >> > > I was thinking that vfio device assignment might be the qemu acceptable > > version of device assignment, and we can let kvm style device assignment > > live out it's remaining time in the qemu-kvm tree, before it gets > > deprecated. > > > > Definitely, the effort to make qemu-kvm device assignment mergable > probably isn't worth it. > > >> Ideally we'd have compatible command line syntax with qemu choosing vfio > >> if available and falling back to kvm device assignment if not. > >> > > Ideally, yes, but I'm not sure how how feasible that is. 
For the > > command line root user, the syntax is nearly the same (s/pci-assign/vfio/), > > but once we start trying to do libvirt based assignment, passing vfiofd > > & uiommufd, it will need to know the difference anyway. Thanks, > > > > I see. Well, we can probably live with two separate invocations, users > who assign devices should be savvy or use libvirt. One other thing to be aware of is that vfio requires devices to be PCI-2.3 compliant in order to support DisINTx. This allows vfio to support devices making use of shared INTx interrupts, but excludes older devices that users may have managed to get assigned to an exclusive interrupt for kvm style assignment. I suppose we might be able to make vfio work with either pci 2.3 devices or older devices with exclusive interrupts if that ends up affecting many users. Alex ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-11 20:24 ` Alex Williamson @ 2010-07-12 6:29 ` Avi Kivity 2010-07-12 11:03 ` Michael S. Tsirkin 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2010-07-12 6:29 UTC (permalink / raw) To: Alex Williamson; +Cc: chrisw, mst, qemu-devel, kvm, pugs On 07/11/2010 11:24 PM, Alex Williamson wrote: > > One other thing to be aware of is that vfio requires devices to be > PCI-2.3 compliant in order to support DisINTx. This allows vfio to > support devices making use of shared INTx interrupts, but excludes older > devices that users maybe managed to get assigned to an exclusive > interrupt for kvm style assignment. > > I suppose we might be able to make vfio work with either pci 2.3 devices > or older devices with exclusive interrupts if that ends up affecting > many users. > PCI 2.3 is already old enough (6-7 years?) that I believe we can require it. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
* [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO device assignment 2010-07-12 6:29 ` Avi Kivity @ 2010-07-12 11:03 ` Michael S. Tsirkin 0 siblings, 0 replies; 32+ messages in thread From: Michael S. Tsirkin @ 2010-07-12 11:03 UTC (permalink / raw) To: Avi Kivity; +Cc: chrisw, Alex Williamson, qemu-devel, kvm, pugs On Mon, Jul 12, 2010 at 09:29:38AM +0300, Avi Kivity wrote: > On 07/11/2010 11:24 PM, Alex Williamson wrote: > > > >One other thing to be aware of is that vfio requires devices to be > >PCI-2.3 compliant in order to support DisINTx. This allows vfio to > >support devices making use of shared INTx interrupts, but excludes older > >devices that users maybe managed to get assigned to an exclusive > >interrupt for kvm style assignment. > > > >I suppose we might be able to make vfio work with either pci 2.3 devices > >or older devices with exclusive interrupts if that ends up affecting > >many users. > > PCI 2.3 is already old enough (6-7 years?) I believe it was made a standard in 2002. > that I believe we can require it. > > -- > I have a truly marvellous patch that fixes the bug which this > signature is too narrow to contain. ^ permalink raw reply [flat|nested] 32+ messages in thread
end of thread, other threads:[~2010-07-12 11:08 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed -- links below jump to the message on this page):
2010-07-11 18:09 [Qemu-devel] [RFC PATCH 0/5] QEMU VFIO device assignment Alex Williamson
2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 1/5] qemu_ram_map/unmap: Allow pre-allocated space to be mapped Alex Williamson
2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 2/5] Minimal RAM API support Alex Williamson
2010-07-11 18:18   ` [Qemu-devel] " Alex Williamson
2010-07-11 18:20     ` Avi Kivity
2010-07-11 18:24       ` Alex Williamson
2010-07-11 18:29         ` Avi Kivity
2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 3/5] RAM API: Make use of it for x86 PC Alex Williamson
2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 4/5] APIC/IOAPIC EOI callback Alex Williamson
2010-07-11 18:14   ` [Qemu-devel] " Avi Kivity
2010-07-11 18:26     ` Alex Williamson
2010-07-11 18:30       ` Avi Kivity
2010-07-11 18:54   ` Michael S. Tsirkin
2010-07-11 19:21     ` Alex Williamson
2010-07-11 19:23       ` Michael S. Tsirkin
2010-07-11 20:03         ` Alex Williamson
2010-07-11 20:05           ` Michael S. Tsirkin
2010-07-11 20:12           ` Michael S. Tsirkin
2010-07-11 21:59             ` Alex Williamson
2010-07-12  6:33   ` Avi Kivity
2010-07-12  9:05     ` Gleb Natapov
2010-07-12  9:13       ` Avi Kivity
2010-07-11 18:09 ` [Qemu-devel] [RFC PATCH 5/5] VFIO based device assignment Alex Williamson
2010-07-11 18:27   ` [Qemu-devel] " Avi Kivity
2010-07-11 19:38     ` Alex Williamson
2010-07-12  6:37       ` Avi Kivity
2010-07-11 18:17 ` [Qemu-devel] Re: [RFC PATCH 0/5] QEMU VFIO " Avi Kivity
2010-07-11 18:37   ` Alex Williamson
2010-07-11 18:43     ` Avi Kivity
2010-07-11 20:24       ` Alex Williamson
2010-07-12  6:29         ` Avi Kivity
2010-07-12 11:03           ` Michael S. Tsirkin