public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 10:06       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
@ 2008-10-28 10:06         ` muli
  2008-10-28 14:10           ` Han, Weidong
                             ` (4 more replies)
  0 siblings, 5 replies; 31+ messages in thread
From: muli @ 2008-10-28 10:06 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Muli Ben-Yehuda <muli@il.ibm.com>

This patch has been contributed to by the following people:

Or Sagi <ors@tutis.com>
Nir Peleg <nir@tutis.com>
Amit Shah <amit.shah@redhat.com>
Ben-Ami Yassour <benami@il.ibm.com>
Weidong Han <weidong.han@intel.com>
Glauber de Oliveira Costa <gcosta@redhat.com>
Muli Ben-Yehuda <muli@il.ibm.com>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

        -pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

[muli: lots of small fixes from Muli and Weidong Han addressing all v7
review comments]

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/Makefile.target        |    3 +
 qemu/hw/device-assignment.c |  641 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |  117 ++++++++
 qemu/hw/pc.c                |   16 +
 qemu/hw/pci.c               |    7 +
 qemu/qemu-kvm.c             |   14 +
 qemu/qemu-kvm.h             |    8 +
 qemu/vl.c                   |   28 ++
 8 files changed, 834 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..5d44e08 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM), 1)
+OBJS+= device-assignment.o
+endif
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..89b05f9
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
+    } while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+{
+    return region->u.r_baseport + (addr - region->e_physbase);
+}
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->r_virtbase, value);
+
+    outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+          __func__, r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);
+
+    outw(value, r_pio);
+}
+
+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);
+
+    outl(value, r_pio);
+}
+
+static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inb(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inw(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->r_virtbase, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inl(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);
+
+    return value;
+}
+
+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_ephys = region->e_physbase;
+    uint32_t old_esize = region->e_size;
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
+          e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
+
+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    if (!first_map)
+	kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+    if (e_size > 0)
+	ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->u.r_virtbase, e_size, 0);
+    if (ret != 0) {
+	fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+	exit(1);
+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_port = region->u.r_baseport;
+    uint32_t old_num = region->e_size;
+    int first_map = (old_num == 0);
+    struct ioperm_data data;
+    int i;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+          addr, region->u.r_baseport, type, size, region_num);
+
+    memset(&data, 0, sizeof(data));
+
+    if (!first_map) {
+	data.start_port = old_port;
+	data.num = old_num; 
+	data.turn_on = 0;
+
+	for (i = 0; i < smp_cpus; ++i)
+	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+    }
+
+    data.start_port = region->u.r_baseport;
+    data.num = size;
+    data.turn_on = 1;
+ 
+    for (i = 0; i < smp_cpus; ++i)
+	kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+ 
+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd;
+    ssize_t ret;
+
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+
+    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd;
+    ssize_t ret;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+do_log:
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+    /* kill the special capabilities */
+    if (address == 4 && len == 4)
+        val &= ~0x100000;
+    else if (address == 6)
+        val &= ~0x10;
+
+    return val;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+	     r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    r = -1;
+    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
+        r++;
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;       /* probably ROM */
+            rp->resource_fd = fd;
+        } else
+            flags &= ~IORESOURCE_PREFETCH;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+    fclose(f);
+
+    dev->region_number = r;
+    return 0;
+}
+
+static int disable_iommu;
+int nr_assigned_devices;
+static LIST_HEAD(, AssignedDevInfo) adev_head;
+
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+    return (uint32_t)bus << 8 | (uint32_t)devfn;
+}
+
+static AssignedDevice *register_real_device(PCIBus *e_bus,
+                                            const char *e_dev_name,
+                                            int e_devfn, uint8_t r_bus,
+                                            uint8_t r_dev, uint8_t r_func)
+{
+    int r;
+    AssignedDevice *pci_dev;
+    uint8_t e_device, e_intx;
+
+    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
+          e_dev_name, e_devfn);
+
+    pci_dev = (AssignedDevice *)
+        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
+                            e_devfn, assigned_dev_pci_read_config,
+                            assigned_dev_pci_write_config);
+    if (NULL == pci_dev) {
+        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                __func__, e_dev_name);
+        return NULL;
+    }
+    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                __func__, e_dev_name);
+        goto out;
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(pci_dev->real_device.regions,
+                                      pci_dev->real_device.region_number,
+                                      pci_dev))
+        goto out;
+
+    /* handle interrupt routing */
+    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+    e_intx = pci_dev->dev.config[0x3d] - 1;
+    pci_dev->intpin = e_intx;
+    pci_dev->run = 0;
+    pci_dev->girq = 0;
+    pci_dev->h_busnr = r_bus;
+    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+    if (kvm_enabled()) {
+        struct kvm_assigned_pci_dev assigned_dev_data;
+
+        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+        assigned_dev_data.assigned_dev_id  =
+            calc_assigned_dev_id(pci_dev->h_busnr,
+                                 (uint32_t)pci_dev->h_devfn);
+        assigned_dev_data.busnr = pci_dev->h_busnr;
+        assigned_dev_data.devfn = pci_dev->h_devfn;
+
+#ifdef KVM_CAP_IOMMU
+        /* We always enable the IOMMU if present
+         * (or when not disabled on the command line)
+         */
+        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+        if (r && !disable_iommu)
+            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
+        if (r < 0) {
+            fprintf(stderr, "Could not notify kernel about "
+                "assigned device \"%s\"\n", e_dev_name);
+            perror("register_real_device");
+            goto out;
+        }
+    }
+#endif
+    term_printf("Registered host PCI device %02x:%02x.%1x "
+		"(\"%s\") as guest device %02x:%02x.%1x\n",
+		r_bus, r_dev, r_func, e_dev_name,
+		pci_bus_num(e_bus), e_device, r_func);
+
+    return pci_dev;
+out:
+/*    pci_unregister_device(&pci_dev->dev); */
+    return NULL;
+}
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+    int irq, r;
+    AssignedDevice *assigned_dev;
+    AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next) {
+        assigned_dev = adev->assigned_dev;
+        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+        irq = piix_get_irq(irq);
+
+        if (irq != assigned_dev->girq) {
+            struct kvm_assigned_irq assigned_irq_data;
+
+            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
+            assigned_irq_data.assigned_dev_id  =
+                calc_assigned_dev_id(assigned_dev->h_busnr,
+                                     (uint8_t) assigned_dev->h_devfn);
+            assigned_irq_data.guest_irq = irq;
+            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
+            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
+            if (r < 0) {
+                perror("assigned_dev_update_irq");
+                fprintf(stderr, "Are you assigning a device "
+                        "that shares IRQ with some other device?\n");
+                pci_unregister_device(&assigned_dev->dev);
+                /* FIXME: Delete node from list */
+                continue;
+            }
+            assigned_dev->girq = irq;
+        }
+    }
+}
+#endif
+
+struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
+{
+    adev->assigned_dev = register_real_device(bus,
+                                              adev->name, -1,
+                                              adev->bus,
+                                              adev->dev,
+                                              adev->func);
+    return &adev->assigned_dev->dev;
+}
+
+int init_all_assigned_devices(PCIBus *bus)
+{
+    struct AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next)
+        if (init_assigned_device(adev, bus) == NULL)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice dev=bus:dev.func,dma=dma
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
+ */
+AssignedDevInfo *add_assigned_device(const char *arg)
+{
+    char *cp, *cp1;
+    char device[8];
+    char dma[6];
+    int r;
+    AssignedDevInfo *adev;
+
+    adev = qemu_mallocz(sizeof(AssignedDevInfo));
+    if (adev == NULL) {
+        fprintf(stderr, "%s: Out of memory\n", __func__);
+        return NULL;
+    }
+    r = get_param_value(device, sizeof(device), "host", arg);
+    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
+    if (!r)
+	snprintf(adev->name, sizeof(adev->name), "%s", device);
+
+#ifdef KVM_CAP_IOMMU
+    r = get_param_value(dma, sizeof(dma), "dma", arg);
+    if (r && !strncmp(dma, "none", 4))
+        disable_iommu = 1;
+#endif
+    cp = device;
+    adev->bus = strtoul(cp, &cp1, 16);
+    if (*cp1 != ':')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->dev = strtoul(cp, &cp1, 16);
+    if (*cp1 != '.')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->func = strtoul(cp, &cp1, 16);
+
+    nr_assigned_devices++;
+    LIST_INSERT_HEAD(&adev_head, adev, next);
+    return adev;
+bad:
+    fprintf(stderr, "pcidevice argument parse error; "
+            "please check the help text for usage\n");
+    qemu_free(adev);
+    return NULL;
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..ebc0b50
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "sys-queue.h"
+#include "pci.h"
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+#define MAX_IO_REGIONS (6)
+
+typedef struct {
+    int type;           /* Memory or port I/O */
+    int valid;
+    uint32_t base_addr;
+    uint32_t size;    /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+typedef struct {
+    uint8_t bus, dev, func; /* Bus inside domain, device and function */
+    int irq;                /* IRQ number */
+    uint16_t region_number; /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[MAX_IO_REGIONS];
+    int config_fd;
+} PCIDevRegions;
+
+typedef struct {
+    target_phys_addr_t e_physbase;
+    uint32_t memory_index;
+    union {
+        void *r_virtbase;    /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    int num;            /* our index within v_addrs[] */
+    uint32_t e_size;    /* emulated size of region in bytes */
+    uint32_t r_size;    /* real size of region in bytes */
+} AssignedDevRegion;
+
+typedef struct {
+    PCIDevice dev;
+    int intpin;
+    uint8_t debug_flags;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
+    PCIDevRegions real_device;
+    int run;
+    int girq;
+    unsigned char h_busnr;
+    unsigned int h_devfn;
+    int bound;
+} AssignedDevice;
+
+typedef struct AssignedDevInfo AssignedDevInfo;
+
+struct AssignedDevInfo {
+    char name[15];
+    int bus;
+    int dev;
+    int func;
+    AssignedDevice *assigned_dev;
+    LIST_ENTRY(AssignedDevInfo) next;
+};
+
+PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
+AssignedDevInfo *add_assigned_device(const char *arg);
+void assigned_dev_set_vector(int irq, int vector);
+void assigned_dev_ack_mirq(int vector);
+
+
+#ifdef USE_KVM
+int init_all_assigned_devices(PCIBus *bus);
+#else /* not using kvm */
+static inline int init_all_assigned_devices(PCIBus *bus)
+{
+    return 0;
+}
+#endif /* !USE_KVM */
+
+
+#define MAX_DEV_ASSIGN_CMDLINE 8
+
+extern int device_assignment_enabled;
+extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+extern int assigned_devices_index;
+
+#endif              /* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index d559f0c..5fdb726 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -33,6 +33,7 @@
 #include "boards.h"
 #include "console.h"
 #include "fw_cfg.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
 
     if (pci_enabled)
         virtio_balloon_init(pci_bus);
+
+    if (kvm_enabled() && device_assignment_enabled) {
+	int i;
+        for (i = 0; i < assigned_devices_index; i++) {
+            if (add_assigned_device(assigned_devices[i]) < 0) {
+                fprintf(stderr, "Warning: could not add assigned device %s\n",
+                        assigned_devices[i]);
+            }
+        }
+
+	if (init_all_assigned_devices(pci_bus)) {
+	    fprintf(stderr, "Failed to initialize assigned devices\n");
+	    exit (1);
+	}
+    }
 }
 
 static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index c82cd20..f86a8a7 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+        address >= 0x60 && address <= 0x63)
+        assigned_dev_update_irq(d);
+#endif
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index c5f3f29..5e66832 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -20,6 +20,7 @@ int kvm_pit = 1;
 #include "console.h"
 #include "block.h"
 #include "compatfd.h"
+#include "hw/device-assignment.h"
 
 #include "qemu-kvm.h"
 #include <libkvm.h>
@@ -27,6 +28,7 @@ int kvm_pit = 1;
 #include <sys/utsname.h>
 #include <sys/syscall.h>
 #include <sys/mman.h>
+#include <sys/io.h>
 
 #define bool _Bool
 #define false 0
@@ -1047,3 +1049,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 {
     return kvm_unregister_coalesced_mmio(kvm_context, addr, size);
 }
+
+static void kvm_do_ioperm(void *_data)
+{
+	struct ioperm_data *data = _data;
+	ioperm(data->start_port, data->num, data->turn_on);
+}
+
+void kvm_ioperm(CPUState *env, void *data)
+{
+	if (kvm_enabled() && qemu_system_ready)
+		on_vcpu(env, kvm_do_ioperm, data);
+}
diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
index a1d6646..92d921d 100644
--- a/qemu/qemu-kvm.h
+++ b/qemu/qemu-kvm.h
@@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 
 void qemu_kvm_system_reset_request(void);
 
+void kvm_ioperm(CPUState *env, void *data);
+
 #ifdef TARGET_PPC
 int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data);
 int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
@@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
 extern int kvm_allowed;
 extern kvm_context_t kvm_context;
 
+struct ioperm_data {
+	unsigned long start_port;
+	unsigned long num;
+	int turn_on;
+};
+
 #define kvm_enabled() (kvm_allowed)
 #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
 #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
diff --git a/qemu/vl.c b/qemu/vl.c
index 388e79d..9dda2f9 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -38,6 +38,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -215,6 +216,9 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack = 0;
 #endif
 int usb_enabled = 0;
+int device_assignment_enabled = 0;
+const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+int assigned_devices_index;
 static VLANState *first_vlan;
 int smp_cpus = 1;
 const char *vnc_display;
@@ -8692,6 +8696,12 @@ static void help(int exitcode)
 #endif
 	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
 	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+           "-pcidevice host=bus:dev.func[,dma=none][,name=\"string\"]\n"
+           "                expose a PCI device to the guest OS.\n"
+           "                dma=none: don't perform any dma translations (default is to use an iommu)\n"
+           "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
@@ -8811,6 +8821,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8900,6 +8913,9 @@ static const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9411,6 +9427,7 @@ int main(int argc, char **argv)
     parallel_device_index = 0;
 
     usb_devices_index = 0;
+    assigned_devices_index = 0;
 
     nb_net_clients = 0;
     nb_drives = 0;
@@ -9844,6 +9861,17 @@ int main(int argc, char **argv)
 		kvm_pit = 0;
 		break;
 	    }
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+		device_assignment_enabled = 1;
+		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
+                    fprintf(stderr, "Too many assigned devices\n");
+                    exit(1);
+		}
+		assigned_devices[assigned_devices_index] = optarg;
+		assigned_devices_index++;
+                break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 10:06         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
@ 2008-10-28 14:10           ` Han, Weidong
  2008-10-28 15:32             ` Muli Ben-Yehuda
       [not found]           ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
                             ` (3 subsequent siblings)
  4 siblings, 1 reply; 31+ messages in thread
From: Han, Weidong @ 2008-10-28 14:10 UTC (permalink / raw)
  To: 'muli@il.ibm.com', 'avi@redhat.com'
  Cc: 'kvm@vger.kernel.org', 'anthony@codemonkey.ws',
	'benami@il.ibm.com', 'amit.shah@redhat.com',
	Kay, Allen M

muli@il.ibm.com wrote:
> From: Muli Ben-Yehuda <muli@il.ibm.com>
> 
> This patch has been contributed to by the following people:
> 
> Or Sagi <ors@tutis.com>
> Nir Peleg <nir@tutis.com>
> Amit Shah <amit.shah@redhat.com>
> Ben-Ami Yassour <benami@il.ibm.com>
> Weidong Han <weidong.han@intel.com>
> Glauber de Oliveira Costa <gcosta@redhat.com>
> Muli Ben-Yehuda <muli@il.ibm.com>
> 
> With this patch, we can assign a device on the host machine to a
> guest.
> 
> A new command-line option, -pcidevice is added.
> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:
> 
>         -pcidevice host=04:08.0
> 
> * The host driver for the device, if any, is to be removed before
> assigning the device (else device assignment will fail).
> 
> * A device that shares IRQ with another host device cannot currently
> be assigned.
> 
> * The RAW_IO capability is needed for this to work
> 
> This works only with the in-kernel irqchip method; to use the
> userspace irqchip, a kernel module (irqhook) and some extra changes
> are needed.
> 
> [muli: lots of small fixes from Muli and Weidong Han addressing all v7
> review comments]
> 
> Signed-off-by: Amit Shah <amit.shah@redhat.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
>  qemu/Makefile.target        |    3 +
>  qemu/hw/device-assignment.c |  641
>  +++++++++++++++++++++++++++++++++++++++++++
>  qemu/hw/device-assignment.h |  117 ++++++++ qemu/hw/pc.c            
>  |   16 + qemu/hw/pci.c               |    7 +
>  qemu/qemu-kvm.c             |   14 +
>  qemu/qemu-kvm.h             |    8 +
>  qemu/vl.c                   |   28 ++
>  8 files changed, 834 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
> 
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index d9bdeca..5d44e08 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
>  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
>  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +ifeq ($(USE_KVM), 1)
> +OBJS+= device-assignment.o
> +endif
>  ifeq ($(USE_KVM_PIT), 1)
>  OBJS+= i8254-kvm.o
>  endif
> diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
> new file mode 100644
> index 0000000..89b05f9
> --- /dev/null
> +++ b/qemu/hw/device-assignment.c
> @@ -0,0 +1,641 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> modify it + * under the terms and conditions of the GNU General
> Public License, + * version 2, as published by the Free Software
> Foundation. + *
> + * This program is distributed in the hope it will be useful, but
> WITHOUT + * ANY WARRANTY; without even the implied warranty of
> MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> General Public License for + * more details.
> + *
> + * You should have received a copy of the GNU General Public License
> along with + * this program; if not, write to the Free Software
> Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA
> 02111-1307 USA. + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +#include <stdio.h>
> +#include <sys/io.h>
> +#include "qemu-kvm.h"
> +#include "hw.h"
> +#include "pc.h"
> +#include "sysemu.h"
> +#include "console.h"
> +#include "device-assignment.h"
> +
> +/* From linux/ioport.h */
> +#define IORESOURCE_IO       0x00000100  /* Resource type */
> +#define IORESOURCE_MEM      0x00000200
> +#define IORESOURCE_IRQ      0x00000400
> +#define IORESOURCE_DMA      0x00000800
> +#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
> +
> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
> +
> +#ifdef DEVICE_ASSIGNMENT_DEBUG
> +#define DEBUG(fmt, ...)                                       \
> +    do {                                                      \
> +      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
> +    } while (0)
> +#else
> +#define DEBUG(fmt, ...) do { } while(0)
> +#endif
> +
> +static uint32_t guest_to_host_ioport(AssignedDevRegion *region,
> uint32_t addr) +{
> +    return region->u.r_baseport + (addr - region->e_physbase);
> +}
> +
> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
> +                                       uint32_t value)
> +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +	  r_pio, (int)r_access->e_physbase,
> +	  (unsigned long)r_access->r_virtbase, value);

should be (unsigned long)r_access->u.r_virtbase

> +
> +    outb(value, r_pio);
> +}
> +
> +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
> +                                       uint32_t value)
> +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +          __func__, r_pio, (int)r_access->e_physbase,
> +          (unsigned long)r_access->r_virtbase, value);

(unsigned long)r_access->u.r_virtbase

> +
> +    outw(value, r_pio);
> +}
> +
> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
> +                       uint32_t value)
> +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> +    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx
> value=%08x\n", +	  r_pio, (int)r_access->e_physbase,
> +          (unsigned long)r_access->r_virtbase, value);

(unsigned long)r_access->u.r_virtbase

> +
> +    outl(value, r_pio);
> +}
> +
> +static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t
> addr) +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +    uint32_t value;
> +
> +    value = inb(r_pio);
> +
> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +          r_pio, (int)r_access->e_physbase,
> +          (unsigned long)r_access->r_virtbase, value);

(unsigned long)r_access->u.r_virtbase

> +
> +    return value;
> +}
> +
> +static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t
> addr) +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +    uint32_t value;
> +
> +    value = inw(r_pio);
> +
> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +          r_pio, (int)r_access->e_physbase,
> +	  (unsigned long)r_access->r_virtbase, value);

(unsigned long)r_access->u.r_virtbase

> +
> +    return value;
> +}
> +
> +static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t
> addr) +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +    uint32_t value;
> +
> +    value = inl(r_pio);
> +
> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +          r_pio, (int)r_access->e_physbase,
> +          (unsigned long)r_access->r_virtbase, value);

(unsigned long)r_access->u.r_virtbase

> +
> +    return value;
> +}
> +
> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int
> region_num, +                                   uint32_t e_phys,
> uint32_t e_size, int type) +{
> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    uint32_t old_ephys = region->e_physbase;
> +    uint32_t old_esize = region->e_size;
> +    int first_map = (region->e_size == 0);
> +    int ret = 0;
> +
> +    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
> +          e_phys, (uint32_t)region->r_virtbase, type, e_size,

(uint32_t)r_access->u.r_virtbase

Regards,
Weidong


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
       [not found]           ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
@ 2008-10-28 15:31             ` Han, Weidong
  0 siblings, 0 replies; 31+ messages in thread
From: Han, Weidong @ 2008-10-28 15:31 UTC (permalink / raw)
  To: Han, Weidong, 'muli@il.ibm.com', 'avi@redhat.com'
  Cc: 'kvm@vger.kernel.org', 'anthony@codemonkey.ws',
	'benami@il.ibm.com', 'amit.shah@redhat.com',
	Kay, Allen M

Han, Weidong wrote:
> muli@il.ibm.com wrote:
>> From: Muli Ben-Yehuda <muli@il.ibm.com>
>> 
>> This patch has been contributed to by the following people:
>> 
>> Or Sagi <ors@tutis.com>
>> Nir Peleg <nir@tutis.com>
>> Amit Shah <amit.shah@redhat.com>
>> Ben-Ami Yassour <benami@il.ibm.com>
>> Weidong Han <weidong.han@intel.com>
>> Glauber de Oliveira Costa <gcosta@redhat.com>
>> Muli Ben-Yehuda <muli@il.ibm.com>
>> 
>> With this patch, we can assign a device on the host machine to a
>> guest. 
>> 
>> A new command-line option, -pcidevice is added.
>> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use
>> this: 
>> 
>>         -pcidevice host=04:08.0
>> 
>> * The host driver for the device, if any, is to be removed before
>> assigning the device (else device assignment will fail).
>> 
>> * A device that shares IRQ with another host device cannot currently
>> be assigned. 
>> 
>> * The RAW_IO capability is needed for this to work
>> 
>> This works only with the in-kernel irqchip method; to use the
>> userspace irqchip, a kernel module (irqhook) and some extra changes
>> are needed. 
>> 
>> [muli: lots of small fixes from Muli and Weidong Han addressing all
>> v7 review comments] 
>> 
>> Signed-off-by: Amit Shah <amit.shah@redhat.com>
>> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
>> ---
>>  qemu/Makefile.target        |    3 +
>>  qemu/hw/device-assignment.c |  641
>>  +++++++++++++++++++++++++++++++++++++++++++
>>  qemu/hw/device-assignment.h |  117 ++++++++ qemu/hw/pc.c
>>  |   16 + qemu/hw/pci.c               |    7 +
>>  qemu/qemu-kvm.c             |   14 +
>>  qemu/qemu-kvm.h             |    8 +
>>  qemu/vl.c                   |   28 ++
>>  8 files changed, 834 insertions(+), 0 deletions(-)
>>  create mode 100644 qemu/hw/device-assignment.c
>>  create mode 100644 qemu/hw/device-assignment.h
>> 
>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>> index d9bdeca..5d44e08 100644
>> --- a/qemu/Makefile.target
>> +++ b/qemu/Makefile.target
>> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW)
>>  dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o
>>  pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +ifeq
>> ($(USE_KVM), 1) +OBJS+= device-assignment.o
>> +endif
>>  ifeq ($(USE_KVM_PIT), 1)
>>  OBJS+= i8254-kvm.o
>>  endif
>> diff --git a/qemu/hw/device-assignment.c
>> b/qemu/hw/device-assignment.c new file mode 100644 index
>> 0000000..89b05f9 --- /dev/null
>> +++ b/qemu/hw/device-assignment.c
>> @@ -0,0 +1,641 @@
>> +/*
>> + * Copyright (c) 2007, Neocleus Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> modify it + * under the terms and conditions of the GNU General
>> Public License, + * version 2, as published by the Free Software
>> Foundation. + * + * This program is distributed in the hope it will
>> be useful, but WITHOUT + * ANY WARRANTY; without even the implied
>> warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.
>> See the GNU General Public License for + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> along with + * this program; if not, write to the Free Software
>> Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA
>> 02111-1307 USA. + * + *
>> + *  Assign a PCI device from the host to a guest VM. + *
>> + *  Adapted for KVM by Qumranet.
>> + *
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */
>> +#include <stdio.h>
>> +#include <sys/io.h>
>> +#include "qemu-kvm.h"
>> +#include "hw.h"
>> +#include "pc.h"
>> +#include "sysemu.h"
>> +#include "console.h"
>> +#include "device-assignment.h"
>> +
>> +/* From linux/ioport.h */
>> +#define IORESOURCE_IO       0x00000100  /* Resource type */
>> +#define IORESOURCE_MEM      0x00000200
>> +#define IORESOURCE_IRQ      0x00000400
>> +#define IORESOURCE_DMA      0x00000800
>> +#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */ +
>> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
>> +
>> +#ifdef DEVICE_ASSIGNMENT_DEBUG
>> +#define DEBUG(fmt, ...)                                       \
>> +    do {                                                      \
>> +      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \ +  
>> } while (0) +#else
>> +#define DEBUG(fmt, ...) do { } while(0)
>> +#endif
>> +
>> +static uint32_t guest_to_host_ioport(AssignedDevRegion *region,
>> uint32_t addr) +{ +    return region->u.r_baseport + (addr -
>> region->e_physbase); +} +
>> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
>> +                                       uint32_t value) +{
>> +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +
>> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +	  r_pio, (int)r_access->e_physbase,
>> +	  (unsigned long)r_access->r_virtbase, value);
> 
> should be (unsigned long)r_access->u.r_virtbase

should be r_access->u.r_portbase

> 
>> +
>> +    outb(value, r_pio);
>> +}
>> +
>> +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
>> +                                       uint32_t value) +{
>> +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +
>> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +          __func__, r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
> 
> (unsigned long)r_access->u.r_virtbase

r_access->u.r_portbase, and remove "__func__".

> 
>> +
>> +    outw(value, r_pio);
>> +}
>> +
>> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
>> +                       uint32_t value)
>> +{
>> +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +
>> +    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +	  r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
> 
> (unsigned long)r_access->u.r_virtbase

r_access->u.r_portbase, and remove "%s"

> 
>> +
>> +    outl(value, r_pio);
>> +}
>> +
>> +static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t
>> addr) +{ +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +   
>> uint32_t value; +
>> +    value = inb(r_pio);
>> +
>> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +          r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
> 
> (unsigned long)r_access->u.r_virtbase

r_access->u.r_portbase

> 
>> +
>> +    return value;
>> +}
>> +
>> +static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t
>> addr) +{ +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +   
>> uint32_t value; +
>> +    value = inw(r_pio);
>> +
>> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +          r_pio, (int)r_access->e_physbase,
>> +	  (unsigned long)r_access->r_virtbase, value);
> 
> (unsigned long)r_access->u.r_virtbase

r_access->u.r_portbase

> 
>> +
>> +    return value;
>> +}
>> +
>> +static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t
>> addr) +{ +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +   
>> uint32_t value; +
>> +    value = inl(r_pio);
>> +
>> +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>> value=%08x\n", +          r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
> 
> (unsigned long)r_access->u.r_virtbase

r_access->u.r_portbase

Regards,
Weidong

> 
>> +
>> +    return value;
>> +}
>> +
>> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int
>> region_num, +                                   uint32_t e_phys,
>> uint32_t e_size, int type) +{ +    AssignedDevice *r_dev =
>> (AssignedDevice *) pci_dev; +    AssignedDevRegion *region =
>> &r_dev->v_addrs[region_num]; +    uint32_t old_ephys =
>> region->e_physbase; +    uint32_t old_esize = region->e_size;
>> +    int first_map = (region->e_size == 0);
>> +    int ret = 0;
>> +
>> +    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
>> +          e_phys, (uint32_t)region->r_virtbase, type, e_size,
> 
> (uint32_t)r_access->u.r_virtbase
> 
> Regards,
> Weidong


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 14:10           ` Han, Weidong
@ 2008-10-28 15:32             ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-28 15:32 UTC (permalink / raw)
  To: Han, Weidong
  Cc: 'avi@redhat.com', 'kvm@vger.kernel.org',
	'anthony@codemonkey.ws', Ben-Ami Yassour1,
	'amit.shah@redhat.com', Kay, Allen M

On Tue, Oct 28, 2008 at 10:10:07PM +0800, Han, Weidong wrote:

> > +    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> > +	  r_pio, (int)r_access->e_physbase,
> > +	  (unsigned long)r_access->r_virtbase, value);
> 
> should be (unsigned long)r_access->u.r_virtbase

Thanks, actually it should be u.r_baseport for IO ports and there were
a number of other bogosities there too. Here's a quick incremental
patch compiled with DEBUG() enabled.

>From 9b917528647b55a1046a5a19d9e2427bb2d86db7 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Tue, 28 Oct 2008 17:30:30 +0200
Subject: [PATCH 1/1] fix DEBUG statements

(thanks to Weidong Han for spotting)

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/hw/device-assignment.c |   32 ++++++++++++++++----------------
 1 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
index 89b05f9..8b56599 100644
--- a/qemu/hw/device-assignment.c
+++ b/qemu/hw/device-assignment.c
@@ -63,9 +63,9 @@ static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
     AssignedDevRegion *r_access = opaque;
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
 	  r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->r_virtbase, value);
+	  (unsigned long)r_access->u.r_baseport, value);
 
     outb(value, r_pio);
 }
@@ -76,9 +76,9 @@ static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
     AssignedDevRegion *r_access = opaque;
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
-          __func__, r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->r_virtbase, value);
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
 
     outw(value, r_pio);
 }
@@ -89,9 +89,9 @@ static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
     AssignedDevRegion *r_access = opaque;
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
 
-    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
 	  r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->r_virtbase, value);
+          (unsigned long)r_access->u.r_baseport, value);
 
     outl(value, r_pio);
 }
@@ -104,9 +104,9 @@ static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
 
     value = inb(r_pio);
 
-    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->r_virtbase, value);
+          (unsigned long)r_access->u.r_baseport, value);
 
     return value;
 }
@@ -119,9 +119,9 @@ static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
 
     value = inw(r_pio);
 
-    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->r_virtbase, value);
+	  (unsigned long)r_access->u.r_baseport, value);
 
     return value;
 }
@@ -134,9 +134,9 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
 
     value = inl(r_pio);
 
-    DEBUG("r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->r_virtbase, value);
+          (unsigned long)r_access->u.r_baseport, value);
 
     return value;
 }
@@ -151,8 +151,8 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
     int first_map = (region->e_size == 0);
     int ret = 0;
 
-    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
-          e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
+    DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+          e_phys, region->u.r_virtbase, type, e_size, region_num);
 
     region->e_physbase = e_phys;
     region->e_size = e_size;
@@ -425,7 +425,7 @@ again:
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
-        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
+        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
     fclose(f);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 10:06         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
  2008-10-28 14:10           ` Han, Weidong
       [not found]           ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
@ 2008-10-28 15:36           ` Han, Weidong
  2008-10-28 15:47             ` Muli Ben-Yehuda
  2008-10-28 15:45           ` Anthony Liguori
  2008-10-28 16:55           ` Mark McLoughlin
  4 siblings, 1 reply; 31+ messages in thread
From: Han, Weidong @ 2008-10-28 15:36 UTC (permalink / raw)
  To: 'muli@il.ibm.com', 'avi@redhat.com'
  Cc: 'kvm@vger.kernel.org', 'anthony@codemonkey.ws',
	'benami@il.ibm.com', 'amit.shah@redhat.com',
	Kay, Allen M

muli@il.ibm.com wrote:
> From: Muli Ben-Yehuda <muli@il.ibm.com>
> 
> This patch has been contributed to by the following people:
> 
> Or Sagi <ors@tutis.com>
> Nir Peleg <nir@tutis.com>
> Amit Shah <amit.shah@redhat.com>
> Ben-Ami Yassour <benami@il.ibm.com>
> Weidong Han <weidong.han@intel.com>
> Glauber de Oliveira Costa <gcosta@redhat.com>
> Muli Ben-Yehuda <muli@il.ibm.com>
> 
> With this patch, we can assign a device on the host machine to a
> guest.
> 
> A new command-line option, -pcidevice is added.
> To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:
> 
>         -pcidevice host=04:08.0
> 
> * The host driver for the device, if any, is to be removed before
> assigning the device (else device assignment will fail).
> 
> * A device that shares IRQ with another host device cannot currently
> be assigned.
> 
> * The RAW_IO capability is needed for this to work
> 
> This works only with the in-kernel irqchip method; to use the
> userspace irqchip, a kernel module (irqhook) and some extra changes
> are needed.
> 
> [muli: lots of small fixes from Muli and Weidong Han addressing all v7
> review comments]
> 
> Signed-off-by: Amit Shah <amit.shah@redhat.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
>  qemu/Makefile.target        |    3 +
>  qemu/hw/device-assignment.c |  641
>  +++++++++++++++++++++++++++++++++++++++++++
>  qemu/hw/device-assignment.h |  117 ++++++++ qemu/hw/pc.c            
>  |   16 + qemu/hw/pci.c               |    7 +
>  qemu/qemu-kvm.c             |   14 +
>  qemu/qemu-kvm.h             |    8 +
>  qemu/vl.c                   |   28 ++
>  8 files changed, 834 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
> 
> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> index c5f3f29..5e66832 100644
> --- a/qemu/qemu-kvm.c
> +++ b/qemu/qemu-kvm.c
> @@ -20,6 +20,7 @@ int kvm_pit = 1;
>  #include "console.h"
>  #include "block.h"
>  #include "compatfd.h"
> +#include "hw/device-assignment.h"

It's not necessary.

> 
>  #include "qemu-kvm.h"
>  #include <libkvm.h>
> @@ -27,6 +28,7 @@ int kvm_pit = 1;
>  #include <sys/utsname.h>
>  #include <sys/syscall.h>
>  #include <sys/mman.h>
> +#include <sys/io.h>

It's not necessary.

Regards,
Weidong


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 10:06         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
                             ` (2 preceding siblings ...)
  2008-10-28 15:36           ` Han, Weidong
@ 2008-10-28 15:45           ` Anthony Liguori
  2008-10-28 15:53             ` Muli Ben-Yehuda
  2008-10-28 16:55           ` Mark McLoughlin
  4 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-10-28 15:45 UTC (permalink / raw)
  To: muli; +Cc: avi, kvm, weidong.han, benami, amit.shah, allen.m.kay

muli@il.ibm.com wrote:
> Signed-off-by: Amit Shah <amit.shah@redhat.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
>  qemu/Makefile.target        |    3 +
>  qemu/hw/device-assignment.c |  641 +++++++++++++++++++++++++++++++++++++++++++
>  qemu/hw/device-assignment.h |  117 ++++++++
>  qemu/hw/pc.c                |   16 +
>  qemu/hw/pci.c               |    7 +
>  qemu/qemu-kvm.c             |   14 +
>  qemu/qemu-kvm.h             |    8 +
>  qemu/vl.c                   |   28 ++
>  8 files changed, 834 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
>
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index d9bdeca..5d44e08 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
>  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
>  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +ifeq ($(USE_KVM), 1)
> +OBJS+= device-assignment.o
> +endif
>   

I don't think you want to build this on PPC so I think you need a 
stronger check.

> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
> +                       uint32_t value)
> +{
> +    AssignedDevRegion *r_access = opaque;
> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> +    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> +	  r_pio, (int)r_access->e_physbase,
> +          (unsigned long)r_access->r_virtbase, value);
>   

The format doesn't match the parameter count.

> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> +                                   uint32_t e_phys, uint32_t e_size, int type)
> +{
> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    uint32_t old_ephys = region->e_physbase;
> +    uint32_t old_esize = region->e_size;
> +    int first_map = (region->e_size == 0);
> +    int ret = 0;
> +
> +    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
> +          e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
> +
> +    region->e_physbase = e_phys;
> +    region->e_size = e_size;
> +
> +    if (!first_map)
> +	kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
> +
> +    if (e_size > 0)
> +	ret = kvm_register_phys_mem(kvm_context, e_phys,
> +                                        region->u.r_virtbase, e_size, 0);
> +    if (ret != 0) {
> +	fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
> +	exit(1);
> +    }
> +}
> +
> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
> +                                    uint32_t addr, uint32_t size, int type)
> +{
> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    uint32_t old_port = region->u.r_baseport;
> +    uint32_t old_num = region->e_size;
> +    int first_map = (old_num == 0);
> +    struct ioperm_data data;
> +    int i;
> +
> +    region->e_physbase = addr;
> +    region->e_size = size;
> +
> +    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
> +          addr, region->u.r_baseport, type, size, region_num);
> +
> +    memset(&data, 0, sizeof(data));
> +
> +    if (!first_map) {
> +	data.start_port = old_port;
> +	data.num = old_num; 
> +	data.turn_on = 0;
> +
> +	for (i = 0; i < smp_cpus; ++i)
> +	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>   

How does this interact with VCPU hot-plug?

> +    }
> +
> +    data.start_port = region->u.r_baseport;
> +    data.num = size;
> +    data.turn_on = 1;
> + 
> +    for (i = 0; i < smp_cpus; ++i)
> +	kvm_ioperm(qemu_kvm_cpu_env(i), &data);
> + 
> +    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
> +                         (r_dev->v_addrs + region_num));
> +    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
> +                         (r_dev->v_addrs + region_num));
> +    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
> +                         (r_dev->v_addrs + region_num));
> +    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
> +                          (r_dev->v_addrs + region_num));
> +    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
> +                          (r_dev->v_addrs + region_num));
> +    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
> +                          (r_dev->v_addrs + region_num));
> +}
> +
> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
> +                                          uint32_t val, int len)
> +{
> +    int fd;
> +    ssize_t ret;
> +
> +    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> +          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> +          (uint16_t) address, val, len);
> +
> +    if (address == 0x4) {
> +        pci_default_write_config(d, address, val, len);
> +        /* Continue to program the card */
> +    }
> +
> +    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> +        address == 0x3c || address == 0x3d) {
> +        /* used for update-mappings (BAR emulation) */
> +        pci_default_write_config(d, address, val, len);
> +        return;
> +    }
> +
> +    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
> +          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> +          (uint16_t) address, val, len);
> +
> +    fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> +    ret = pwrite(fd, &val, len, address);
> +    if (ret != len) {
> +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> +	    goto again;
> +
> +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> +		__func__, ret, errno);
> +
> +	exit(1);
> +    }
> +}
> +
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
> +                                             int len)
> +{
> +    uint32_t val = 0;
> +    int fd;
> +    ssize_t ret;
> +
> +    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> +        address == 0x3c || address == 0x3d) {
> +        val = pci_default_read_config(d, address, len);
> +        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> +              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> +        return val;
> +    }
> +
> +    /* vga specific, remove later */
> +    if (address == 0xFC)
> +        goto do_log;
> +
> +    fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> +    ret = pread(fd, &val, len, address);
> +    if (ret != len) {
> +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> +	    goto again;
> +
> +	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
> +		__func__, ret, errno);
> +
> +	exit(1);
> +    }
> +
> +do_log:
> +    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> +          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> +
> +    /* kill the special capabilities */
> +    if (address == 4 && len == 4)
> +        val &= ~0x100000;
> +    else if (address == 6)
> +        val &= ~0x10;
> +
> +    return val;
> +}
> +
> +static int assigned_dev_register_regions(PCIRegion *io_regions,
> +                                         unsigned long regions_num,
> +                                         AssignedDevice *pci_dev)
> +{
> +    uint32_t i;
> +    PCIRegion *cur_region = io_regions;
> +
> +    for (i = 0; i < regions_num; i++, cur_region++) {
> +        if (!cur_region->valid)
> +            continue;
> +        pci_dev->v_addrs[i].num = i;
> +
> +        /* handle memory io regions */
> +        if (cur_region->type & IORESOURCE_MEM) {
> +            int t = cur_region->type & IORESOURCE_PREFETCH
> +                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
> +                : PCI_ADDRESS_SPACE_MEM;
> +
> +            /* map physical memory */
> +            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> +            pci_dev->v_addrs[i].u.r_virtbase =
> +                mmap(NULL,
> +                     (cur_region->size + 0xFFF) & 0xFFFFF000,
> +                     PROT_WRITE | PROT_READ, MAP_SHARED,
> +                     cur_region->resource_fd, (off_t) 0);
> +
> +            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
> +                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
> +                        "\n", __func__,
> +                        (uint32_t) (cur_region->base_addr));
> +                return -1;
> +            }
> +            pci_dev->v_addrs[i].r_size = cur_region->size;
> +            pci_dev->v_addrs[i].e_size = 0;
> +
> +            /* add offset */
> +            pci_dev->v_addrs[i].u.r_virtbase +=
> +                (cur_region->base_addr & 0xFFF);
> +
> +            pci_register_io_region((PCIDevice *) pci_dev, i,
> +                                   cur_region->size, t,
> +                                   assigned_dev_iomem_map);
> +            continue;
> +        }
> +        /* handle port io regions */
> +        pci_register_io_region((PCIDevice *) pci_dev, i,
> +                               cur_region->size, PCI_ADDRESS_SPACE_IO,
> +                               assigned_dev_ioport_map);
> +
> +        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> +        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
> +        /* not relevant for port io */
> +        pci_dev->v_addrs[i].memory_index = 0;
> +    }
> +
> +    /* success */
> +    return 0;
> +}
> +
> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> +                           uint8_t r_dev, uint8_t r_func)
> +{
> +    char dir[128], name[128];
> +    int fd, r = 0;
> +    FILE *f;
> +    unsigned long long start, end, size, flags;
> +    PCIRegion *rp;
> +    PCIDevRegions *dev = &pci_dev->real_device;
> +
> +    dev->region_number = 0;
> +
> +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> +	     r_bus, r_dev, r_func);
> +
> +    snprintf(name, sizeof(name), "%sconfig", dir);
> +
> +    fd = open(name, O_RDWR);
> +    if (fd == -1) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +    dev->config_fd = fd;
> +again:
> +    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> +    if (r < 0) {
> +        if (errno == EINTR || errno == EAGAIN)
> +            goto again;
> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> +    }
> +
> +    snprintf(name, sizeof(name), "%sresource", dir);
> +
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +    r = -1;
> +    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> +        r++;
> +        rp = dev->regions + r;
> +        rp->valid = 0;
> +        size = end - start + 1;
> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> +            continue;
> +        if (flags & IORESOURCE_MEM) {
> +            flags &= ~IORESOURCE_IO;
> +	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
> +            fd = open(name, O_RDWR);
> +            if (fd == -1)
> +                continue;       /* probably ROM */
> +            rp->resource_fd = fd;
> +        } else
> +            flags &= ~IORESOURCE_PREFETCH;
> +
> +        rp->type = flags;
> +        rp->valid = 1;
> +        rp->base_addr = start;
> +        rp->size = size;
> +        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> +              r, rp->size, start, rp->type, rp->resource_fd);
> +    }
> +    fclose(f);
> +
> +    dev->region_number = r;
> +    return 0;
> +}
> +
> +static int disable_iommu;
> +int nr_assigned_devices;
> +static LIST_HEAD(, AssignedDevInfo) adev_head;
> +
> +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> +{
> +    return (uint32_t)bus << 8 | (uint32_t)devfn;
> +}
> +
> +static AssignedDevice *register_real_device(PCIBus *e_bus,
> +                                            const char *e_dev_name,
> +                                            int e_devfn, uint8_t r_bus,
> +                                            uint8_t r_dev, uint8_t r_func)
> +{
> +    int r;
> +    AssignedDevice *pci_dev;
> +    uint8_t e_device, e_intx;
> +
> +    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> +          e_dev_name, e_devfn);
> +
> +    pci_dev = (AssignedDevice *)
> +        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> +                            e_devfn, assigned_dev_pci_read_config,
> +                            assigned_dev_pci_write_config);
> +    if (NULL == pci_dev) {
> +        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> +                __func__, e_dev_name);
> +        return NULL;
> +    }
> +    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> +        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> +                __func__, e_dev_name);
> +        goto out;
> +    }
> +
> +    /* handle real device's MMIO/PIO BARs */
> +    if (assigned_dev_register_regions(pci_dev->real_device.regions,
> +                                      pci_dev->real_device.region_number,
> +                                      pci_dev))
> +        goto out;
> +
> +    /* handle interrupt routing */
> +    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> +    e_intx = pci_dev->dev.config[0x3d] - 1;
> +    pci_dev->intpin = e_intx;
> +    pci_dev->run = 0;
> +    pci_dev->girq = 0;
> +    pci_dev->h_busnr = r_bus;
> +    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> +    if (kvm_enabled()) {
> +        struct kvm_assigned_pci_dev assigned_dev_data;
> +
> +        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> +        assigned_dev_data.assigned_dev_id  =
> +            calc_assigned_dev_id(pci_dev->h_busnr,
> +                                 (uint32_t)pci_dev->h_devfn);
> +        assigned_dev_data.busnr = pci_dev->h_busnr;
> +        assigned_dev_data.devfn = pci_dev->h_devfn;
> +
> +#ifdef KVM_CAP_IOMMU
> +        /* We always enable the IOMMU if present
> +         * (or when not disabled on the command line)
> +         */
> +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> +        if (r && !disable_iommu)
> +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +#endif
> +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> +        if (r < 0) {
> +            fprintf(stderr, "Could not notify kernel about "
> +                "assigned device \"%s\"\n", e_dev_name);
> +            perror("register_real_device");
> +            goto out;
> +        }
> +    }
>   

You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined?  That 
means a newer userspace compiled on an older kernel will silently fail 
if they try to do device assignment.  There's probably no reason to 
build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the 
in-kernel PIT gets conditionally build depending on whether that cap is 
available).

> +#endif
> +    term_printf("Registered host PCI device %02x:%02x.%1x "
> +		"(\"%s\") as guest device %02x:%02x.%1x\n",
> +		r_bus, r_dev, r_func, e_dev_name,
> +		pci_bus_num(e_bus), e_device, r_func);
>
>   

If I read the code correctly, this term_printf() happens regardless of 
whether this is being done for PCI hotplug or for command-line 
assignment?  That's a problem as it'll print garbage on the monitor when 
you start QEMU which could break management applications.

> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> index d559f0c..5fdb726 100644
> --- a/qemu/hw/pc.c
> +++ b/qemu/hw/pc.c
> @@ -33,6 +33,7 @@
>  #include "boards.h"
>  #include "console.h"
>  #include "fw_cfg.h"
> +#include "device-assignment.h"
>  
>  #include "qemu-kvm.h"
>  
> @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
>  
>      if (pci_enabled)
>          virtio_balloon_init(pci_bus);
> +
> +    if (kvm_enabled() && device_assignment_enabled) {
> +	int i;
>   

Stray tab.

> +        for (i = 0; i < assigned_devices_index; i++) {
> +            if (add_assigned_device(assigned_devices[i]) < 0) {
> +                fprintf(stderr, "Warning: could not add assigned device %s\n",
> +                        assigned_devices[i]);
> +            }
> +        }
> +
> +	if (init_all_assigned_devices(pci_bus)) {
> +	    fprintf(stderr, "Failed to initialize assigned devices\n");
> +	    exit (1);
> +	}
> +    }
>  }
>  +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
> +            case QEMU_OPTION_pcidevice:
> +		device_assignment_enabled = 1;
> +		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
> +                    fprintf(stderr, "Too many assigned devices\n");
> +                    exit(1);
> +		}
> +		assigned_devices[assigned_devices_index] = optarg;
> +		assigned_devices_index++;
> +                break;
>   

Tab damage.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 15:36           ` Han, Weidong
@ 2008-10-28 15:47             ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-28 15:47 UTC (permalink / raw)
  To: Han, Weidong
  Cc: 'avi@redhat.com', 'kvm@vger.kernel.org',
	'anthony@codemonkey.ws', Ben-Ami Yassour1,
	'amit.shah@redhat.com', Kay, Allen M

On Tue, Oct 28, 2008 at 11:36:10PM +0800, Han, Weidong wrote:
> > diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> > index c5f3f29..5e66832 100644
> > --- a/qemu/qemu-kvm.c
> > +++ b/qemu/qemu-kvm.c
> > @@ -20,6 +20,7 @@ int kvm_pit = 1;
> >  #include "console.h"
> >  #include "block.h"
> >  #include "compatfd.h"
> > +#include "hw/device-assignment.h"
> 
> It's not necessary.

Indeed, left overs from my ioperm bits. Removed.

> >  #include "qemu-kvm.h"
> >  #include <libkvm.h>
> > @@ -27,6 +28,7 @@ int kvm_pit = 1;
> >  #include <sys/utsname.h>
> >  #include <sys/syscall.h>
> >  #include <sys/mman.h>
> > +#include <sys/io.h>
> 
> It's not necessary.

This one is needed on my compile system for the ioperm() declaration.

Cheers,
Muli

-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 15:45           ` Anthony Liguori
@ 2008-10-28 15:53             ` Muli Ben-Yehuda
  2008-10-29  7:56               ` Zhang, Xiantao
                                 ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-28 15:53 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: avi, kvm, weidong.han, Ben-Ami Yassour1, amit.shah, allen.m.kay

On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:

>> +ifeq ($(USE_KVM), 1)
>> +OBJS+= device-assignment.o
>> +endif
>
> I don't think you want to build this on PPC so I think you need a
> stronger check.

Good point. How about checking TARGET_BASE_ARCH = i386?

>> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
>> +                       uint32_t value)
>> +{
>> +    AssignedDevRegion *r_access = opaque;
>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
>> +
>> +    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
>> +	  r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
>>   
>
> The format doesn't match the parameter count.

Yep, already fixed.

>> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
>> +                                    uint32_t addr, uint32_t size, int 
>> type)
>> +{
>> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    uint32_t old_port = region->u.r_baseport;
>> +    uint32_t old_num = region->e_size;
>> +    int first_map = (old_num == 0);
>> +    struct ioperm_data data;
>> +    int i;
>> +
>> +    region->e_physbase = addr;
>> +    region->e_size = size;
>> +
>> +    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
>> +          addr, region->u.r_baseport, type, size, region_num);
>> +
>> +    memset(&data, 0, sizeof(data));
>> +
>> +    if (!first_map) {
>> +	data.start_port = old_port;
>> +	data.num = old_num; +	data.turn_on = 0;
>> +
>> +	for (i = 0; i < smp_cpus; ++i)
>> +	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>>   
>
> How does this interact with VCPU hot-plug?

I have no idea. Weidong?

>> +#ifdef KVM_CAP_IOMMU
>> +        /* We always enable the IOMMU if present
>> +         * (or when not disabled on the command line)
>> +         */
>> +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
>> +        if (r && !disable_iommu)
>> +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
>> +#endif
>> +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
>> +        if (r < 0) {
>> +            fprintf(stderr, "Could not notify kernel about "
>> +                "assigned device \"%s\"\n", e_dev_name);
>> +            perror("register_real_device");
>> +            goto out;
>> +        }
>> +    }
>>   
>
> You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined?  That
> means a newer userspace compiled on an older kernel will silently
> fail if they try to do device assignment.  There's probably no
> reason to build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined
> (see how the in-kernel PIT gets conditionally build depending on
> whether that cap is available).

Ok, I'll take a look at this.

>> +#endif
>> +    term_printf("Registered host PCI device %02x:%02x.%1x "
>> +		"(\"%s\") as guest device %02x:%02x.%1x\n",
>> +		r_bus, r_dev, r_func, e_dev_name,
>> +		pci_bus_num(e_bus), e_device, r_func);
>>
>>   
>
> If I read the code correctly, this term_printf() happens regardless
> of whether this is being done for PCI hotplug or for command-line
> assignment?  That's a problem as it'll print garbage on the monitor
> when you start QEMU which could break management applications.

Is there a more suitable alternative or shall I just nuke it?

>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
>> index d559f0c..5fdb726 100644
>> --- a/qemu/hw/pc.c
>> +++ b/qemu/hw/pc.c
>> @@ -33,6 +33,7 @@
>>  #include "boards.h"
>>  #include "console.h"
>>  #include "fw_cfg.h"
>> +#include "device-assignment.h"
>>   #include "qemu-kvm.h"
>>  @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int 
>> vga_ram_size,
>>       if (pci_enabled)
>>          virtio_balloon_init(pci_bus);
>> +
>> +    if (kvm_enabled() && device_assignment_enabled) {
>> +	int i;
>>   
>
> Stray tab.

Grrr. Silly emacs.

>
>> +        for (i = 0; i < assigned_devices_index; i++) {
>> +            if (add_assigned_device(assigned_devices[i]) < 0) {
>> +                fprintf(stderr, "Warning: could not add assigned device 
>> %s\n",
>> +                        assigned_devices[i]);
>> +            }
>> +        }
>> +
>> +	if (init_all_assigned_devices(pci_bus)) {
>> +	    fprintf(stderr, "Failed to initialize assigned devices\n");
>> +	    exit (1);
>> +	}
>> +    }
>>  }
>>  +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
>> +            case QEMU_OPTION_pcidevice:
>> +		device_assignment_enabled = 1;
>> +		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
>> +                    fprintf(stderr, "Too many assigned devices\n");
>> +                    exit(1);
>> +		}
>> +		assigned_devices[assigned_devices_index] = optarg;
>> +		assigned_devices_index++;
>> +                break;
>>   
>
> Tab damage.

Thanks, will fix in the next revision.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 10:06         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
                             ` (3 preceding siblings ...)
  2008-10-28 15:45           ` Anthony Liguori
@ 2008-10-28 16:55           ` Mark McLoughlin
  2008-10-29 10:31             ` Muli Ben-Yehuda
  4 siblings, 1 reply; 31+ messages in thread
From: Mark McLoughlin @ 2008-10-28 16:55 UTC (permalink / raw)
  To: muli; +Cc: avi, kvm, anthony, weidong.han, benami, amit.shah, allen.m.kay

On Tue, 2008-10-28 at 12:06 +0200, muli@il.ibm.com wrote:
...
> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> +                           uint8_t r_dev, uint8_t r_func)
> +{
> +    char dir[128], name[128];
> +    int fd, r = 0;
> +    FILE *f;
> +    unsigned long long start, end, size, flags;
> +    PCIRegion *rp;
> +    PCIDevRegions *dev = &pci_dev->real_device;
> +
> +    dev->region_number = 0;
> +
> +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> +	     r_bus, r_dev, r_func);
> +
> +    snprintf(name, sizeof(name), "%sconfig", dir);
> +
> +    fd = open(name, O_RDWR);
> +    if (fd == -1) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +    dev->config_fd = fd;
> +again:
> +    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> +    if (r < 0) {
> +        if (errno == EINTR || errno == EAGAIN)
> +            goto again;
> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> +    }
> +
> +    snprintf(name, sizeof(name), "%sresource", dir);
> +
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return 1;
> +    }
> +    r = -1;
> +    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> +        r++;
> +        rp = dev->regions + r;

Could, in theory, overflow dev->regions here. Suggest:

+    for (r = 0; r < MAX_IO_REGIONS; r++) {
+        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+            break;

> +        rp->valid = 0;
> +        size = end - start + 1;
> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> +            continue;
> +        if (flags & IORESOURCE_MEM) {
> +            flags &= ~IORESOURCE_IO;
> +	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
> +            fd = open(name, O_RDWR);
> +            if (fd == -1)
> +                continue;       /* probably ROM */
> +            rp->resource_fd = fd;
> +        } else
> +            flags &= ~IORESOURCE_PREFETCH;
> +
> +        rp->type = flags;
> +        rp->valid = 1;
> +        rp->base_addr = start;
> +        rp->size = size;
> +        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> +              r, rp->size, start, rp->type, rp->resource_fd);
> +    }
> +    fclose(f);
> +
> +    dev->region_number = r;
> +    return 0;
> +}
> +
> +static int disable_iommu;

Why is this global?

The flag is set per-device on the command-line and only affects whether
we pass KVM_DEV_ASSIGN_ENABLE_IOMMU to kvm_assign_pci_device()

> +int nr_assigned_devices;
> +static LIST_HEAD(, AssignedDevInfo) adev_head;
> +
> +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> +{
> +    return (uint32_t)bus << 8 | (uint32_t)devfn;
> +}
> +
> +static AssignedDevice *register_real_device(PCIBus *e_bus,
> +                                            const char *e_dev_name,
> +                                            int e_devfn, uint8_t r_bus,
> +                                            uint8_t r_dev, uint8_t r_func)
> +{
> +    int r;
> +    AssignedDevice *pci_dev;
> +    uint8_t e_device, e_intx;
> +
> +    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> +          e_dev_name, e_devfn);
> +
> +    pci_dev = (AssignedDevice *)
> +        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> +                            e_devfn, assigned_dev_pci_read_config,
> +                            assigned_dev_pci_write_config);
> +    if (NULL == pci_dev) {
> +        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> +                __func__, e_dev_name);
> +        return NULL;
> +    }
> +    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> +        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> +                __func__, e_dev_name);
> +        goto out;
> +    }
> +
> +    /* handle real device's MMIO/PIO BARs */
> +    if (assigned_dev_register_regions(pci_dev->real_device.regions,
> +                                      pci_dev->real_device.region_number,
> +                                      pci_dev))
> +        goto out;
> +
> +    /* handle interrupt routing */
> +    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> +    e_intx = pci_dev->dev.config[0x3d] - 1;
> +    pci_dev->intpin = e_intx;
> +    pci_dev->run = 0;
> +    pci_dev->girq = 0;
> +    pci_dev->h_busnr = r_bus;
> +    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> +    if (kvm_enabled()) {
> +        struct kvm_assigned_pci_dev assigned_dev_data;
> +
> +        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> +        assigned_dev_data.assigned_dev_id  =
> +            calc_assigned_dev_id(pci_dev->h_busnr,
> +                                 (uint32_t)pci_dev->h_devfn);
> +        assigned_dev_data.busnr = pci_dev->h_busnr;
> +        assigned_dev_data.devfn = pci_dev->h_devfn;
> +
> +#ifdef KVM_CAP_IOMMU
> +        /* We always enable the IOMMU if present
> +         * (or when not disabled on the command line)
> +         */
> +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> +        if (r && !disable_iommu)
> +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +#endif
> +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> +        if (r < 0) {
> +            fprintf(stderr, "Could not notify kernel about "
> +                "assigned device \"%s\"\n", e_dev_name);
> +            perror("register_real_device");
> +            goto out;
> +        }
> +    }
> +#endif
> +    term_printf("Registered host PCI device %02x:%02x.%1x "
> +		"(\"%s\") as guest device %02x:%02x.%1x\n",
> +		r_bus, r_dev, r_func, e_dev_name,
> +		pci_bus_num(e_bus), e_device, r_func);
> +
> +    return pci_dev;
> +out:
> +/*    pci_unregister_device(&pci_dev->dev); */
> +    return NULL;
> +}
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> +/* The pci config space got updated. Check if irq numbers have changed
> + * for our devices
> + */
> +void assigned_dev_update_irq(PCIDevice *d)
> +{
> +    int irq, r;
> +    AssignedDevice *assigned_dev;
> +    AssignedDevInfo *adev;
> +
> +    LIST_FOREACH(adev, &adev_head, next) {
> +        assigned_dev = adev->assigned_dev;
> +        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
> +        irq = piix_get_irq(irq);
> +
> +        if (irq != assigned_dev->girq) {
> +            struct kvm_assigned_irq assigned_irq_data;
> +
> +            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
> +            assigned_irq_data.assigned_dev_id  =
> +                calc_assigned_dev_id(assigned_dev->h_busnr,
> +                                     (uint8_t) assigned_dev->h_devfn);
> +            assigned_irq_data.guest_irq = irq;
> +            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
> +            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
> +            if (r < 0) {
> +                perror("assigned_dev_update_irq");
> +                fprintf(stderr, "Are you assigning a device "
> +                        "that shares IRQ with some other device?\n");
> +                pci_unregister_device(&assigned_dev->dev);
> +                /* FIXME: Delete node from list */
> +                continue;
> +            }
> +            assigned_dev->girq = irq;
> +        }
> +    }
> +}
> +#endif
> +
> +struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
> +{
> +    adev->assigned_dev = register_real_device(bus,
> +                                              adev->name, -1,
> +                                              adev->bus,
> +                                              adev->dev,
> +                                              adev->func);
> +    return &adev->assigned_dev->dev;
> +}

This looks unnecessary - register_real_device() isn't used anywhere
else.

Why not just move register_real_device() into init_assigned_device() ?

> +int init_all_assigned_devices(PCIBus *bus)
> +{
> +    struct AssignedDevInfo *adev;
> +
> +    LIST_FOREACH(adev, &adev_head, next)
> +        if (init_assigned_device(adev, bus) == NULL)
> +            return -1;
> +
> +    return 0;
> +}
> +
> +/*
> + * Syntax to assign device:
> + *
> + * -pcidevice dev=bus:dev.func,dma=dma
                 ^^^                  ^^^

Should be:

  -pcidevice host=bus:dev.func[,dma=none][,name=string]

> + *
> + * Example:
> + * -pcidevice host=00:13.0,dma=pvdma
                                  ^^^^^

Should be:

  -pcidevice host=00:13.0,dma=none,name=Foo
  
> + *
> + * dma can currently only be 'none' to disable iommu support.
> + */
> +AssignedDevInfo *add_assigned_device(const char *arg)
> +{
> +    char *cp, *cp1;
> +    char device[8];
> +    char dma[6];
> +    int r;
> +    AssignedDevInfo *adev;
> +
> +    adev = qemu_mallocz(sizeof(AssignedDevInfo));
> +    if (adev == NULL) {
> +        fprintf(stderr, "%s: Out of memory\n", __func__);
> +        return NULL;
> +    }
> +    r = get_param_value(device, sizeof(device), "host", arg);
> +    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
> +    if (!r)
> +	snprintf(adev->name, sizeof(adev->name), "%s", device);
> +
> +#ifdef KVM_CAP_IOMMU
> +    r = get_param_value(dma, sizeof(dma), "dma", arg);
> +    if (r && !strncmp(dma, "none", 4))
> +        disable_iommu = 1;
> +#endif
> +    cp = device;
> +    adev->bus = strtoul(cp, &cp1, 16);
> +    if (*cp1 != ':')
> +        goto bad;
> +    cp = cp1 + 1;
> +
> +    adev->dev = strtoul(cp, &cp1, 16);
> +    if (*cp1 != '.')
> +        goto bad;
> +    cp = cp1 + 1;
> +
> +    adev->func = strtoul(cp, &cp1, 16);
> +
> +    nr_assigned_devices++;

nr_assigned_devices isn't actually used anywhere.

> +    LIST_INSERT_HEAD(&adev_head, adev, next);
> +    return adev;
> +bad:
> +    fprintf(stderr, "pcidevice argument parse error; "
> +            "please check the help text for usage\n");
> +    qemu_free(adev);
> +    return NULL;
> +}
> diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
> new file mode 100644
> index 0000000..ebc0b50
> --- /dev/null
> +++ b/qemu/hw/device-assignment.h
> @@ -0,0 +1,117 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + *  Data structures for storing PCI state
> + *
> + *  Adapted to kvm by Qumranet
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + */
> +
> +#ifndef __DEVICE_ASSIGNMENT_H__
> +#define __DEVICE_ASSIGNMENT_H__
> +
> +#include <sys/mman.h>
> +#include "qemu-common.h"
> +#include "sys-queue.h"
> +#include "pci.h"
> +
> +/* From include/linux/pci.h in the kernel sources */
> +#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> +
> +#define MAX_IO_REGIONS (6)

Perhaps a comment to say this is the number of BARs in the config space
header?

> +typedef struct {
> +    int type;           /* Memory or port I/O */
> +    int valid;
> +    uint32_t base_addr;
> +    uint32_t size;    /* size of the region */
> +    int resource_fd;
> +} PCIRegion;
> +
> +typedef struct {
> +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
> +    int irq;                /* IRQ number */
> +    uint16_t region_number; /* number of active regions */
> +
> +    /* Port I/O or MMIO Regions */
> +    PCIRegion regions[MAX_IO_REGIONS];
> +    int config_fd;
> +} PCIDevRegions;
> +
> +typedef struct {
> +    target_phys_addr_t e_physbase;
> +    uint32_t memory_index;
> +    union {
> +        void *r_virtbase;    /* mmapped access address for memory regions */
> +        uint32_t r_baseport; /* the base guest port for I/O regions */
> +    } u;
> +    int num;            /* our index within v_addrs[] */
> +    uint32_t e_size;    /* emulated size of region in bytes */
> +    uint32_t r_size;    /* real size of region in bytes */
> +} AssignedDevRegion;
> +
> +typedef struct {
> +    PCIDevice dev;
> +    int intpin;
> +    uint8_t debug_flags;
> +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
> +    PCIDevRegions real_device;
> +    int run;
> +    int girq;
> +    unsigned char h_busnr;
> +    unsigned int h_devfn;
> +    int bound;
> +} AssignedDevice;
> +
> +typedef struct AssignedDevInfo AssignedDevInfo;
> +
> +struct AssignedDevInfo {
> +    char name[15];
> +    int bus;
> +    int dev;
> +    int func;
> +    AssignedDevice *assigned_dev;
> +    LIST_ENTRY(AssignedDevInfo) next;
> +};
> +
> +PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
> +AssignedDevInfo *add_assigned_device(const char *arg);
> +void assigned_dev_set_vector(int irq, int vector);
> +void assigned_dev_ack_mirq(int vector);
> +
> +
> +#ifdef USE_KVM
> +int init_all_assigned_devices(PCIBus *bus);
> +#else /* not using kvm */
> +static inline int init_all_assigned_devices(PCIBus *bus)
> +{
> +    return 0;
> +}
> +#endif /* !USE_KVM */
> +
> +
> +#define MAX_DEV_ASSIGN_CMDLINE 8
> +
> +extern int device_assignment_enabled;

> +extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
> +extern int assigned_devices_index;

Neither of these two are implemented anywhere.

> +#endif              /* __DEVICE_ASSIGNMENT_H__ */
> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> index d559f0c..5fdb726 100644
> --- a/qemu/hw/pc.c
> +++ b/qemu/hw/pc.c
> @@ -33,6 +33,7 @@
>  #include "boards.h"
>  #include "console.h"
>  #include "fw_cfg.h"
> +#include "device-assignment.h"
>  
>  #include "qemu-kvm.h"
>  
> @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
>  
>      if (pci_enabled)
>          virtio_balloon_init(pci_bus);
> +
> +    if (kvm_enabled() && device_assignment_enabled) {

The device_assignment_enabled flag looks like it shouldn't be needed.

If assigned_devices_index remains zero, nothing should happen anyway.

> +	int i;
> +        for (i = 0; i < assigned_devices_index; i++) {
> +            if (add_assigned_device(assigned_devices[i]) < 0) {
> +                fprintf(stderr, "Warning: could not add assigned device %s\n",
> +                        assigned_devices[i]);
> +            }
> +        }
> +
> +	if (init_all_assigned_devices(pci_bus)) {
> +	    fprintf(stderr, "Failed to initialize assigned devices\n");
> +	    exit (1);
> +	}
> +    }
>  }
>  
>  static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
> diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
> index c82cd20..f86a8a7 100644
> --- a/qemu/hw/pci.c
> +++ b/qemu/hw/pci.c
> @@ -50,6 +50,7 @@ struct PCIBus {
>  
>  static void pci_update_mappings(PCIDevice *d);
>  static void pci_set_irq(void *opaque, int irq_num, int level);
> +void assigned_dev_update_irq(PCIDevice *d);
>  
>  target_phys_addr_t pci_mem_base;
>  static int pci_irq_index;
> @@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
>          val >>= 8;
>      }
>  
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> +    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
> +        address >= 0x60 && address <= 0x63)
> +        assigned_dev_update_irq(d);
> +#endif

Outside of the context of piix_pci.c, it's difficult to figure out what
the 0x60-0x63 register range relates to - i.e. you need to know to go
digging in the PIIX spec.

How about something like in qemu/hw/pc.h:

  +/* config space register for IRQ routing */
  +#define PIIX_CONFIG_IRQ_ROUTE 0x60

then:

     if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
         address >= PIIX_CONFIG_IRQ_ROUTE &&
         address < PIIX_CONFIG_IRQ_ROUTE + 4)

> +
>      end = address + len;
>      if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
>          /* if the command register is modified, we must modify the mappings */
> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
> index c5f3f29..5e66832 100644
> --- a/qemu/qemu-kvm.c
> +++ b/qemu/qemu-kvm.c
> @@ -20,6 +20,7 @@ int kvm_pit = 1;
>  #include "console.h"
>  #include "block.h"
>  #include "compatfd.h"
> +#include "hw/device-assignment.h"
>  
>  #include "qemu-kvm.h"
>  #include <libkvm.h>
> @@ -27,6 +28,7 @@ int kvm_pit = 1;
>  #include <sys/utsname.h>
>  #include <sys/syscall.h>
>  #include <sys/mman.h>
> +#include <sys/io.h>
>  
>  #define bool _Bool
>  #define false 0
> @@ -1047,3 +1049,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
>  {
>      return kvm_unregister_coalesced_mmio(kvm_context, addr, size);
>  }
> +
> +static void kvm_do_ioperm(void *_data)
> +{
> +	struct ioperm_data *data = _data;
> +	ioperm(data->start_port, data->num, data->turn_on);
> +}
> +
> +void kvm_ioperm(CPUState *env, void *data)
> +{
> +	if (kvm_enabled() && qemu_system_ready)
> +		on_vcpu(env, kvm_do_ioperm, data);
> +}
> diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
> index a1d6646..92d921d 100644
> --- a/qemu/qemu-kvm.h
> +++ b/qemu/qemu-kvm.h
> @@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
>  
>  void qemu_kvm_system_reset_request(void);
>  
> +void kvm_ioperm(CPUState *env, void *data);
> +
>  #ifdef TARGET_PPC
>  int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data);
>  int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
> @@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
>  extern int kvm_allowed;
>  extern kvm_context_t kvm_context;
>  
> +struct ioperm_data {
> +	unsigned long start_port;
> +	unsigned long num;
> +	int turn_on;
> +};
> +
>  #define kvm_enabled() (kvm_allowed)
>  #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
>  #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
> diff --git a/qemu/vl.c b/qemu/vl.c
> index 388e79d..9dda2f9 100644
> --- a/qemu/vl.c
> +++ b/qemu/vl.c
> @@ -38,6 +38,7 @@
>  #include "qemu-char.h"
>  #include "block.h"
>  #include "audio/audio.h"
> +#include "hw/device-assignment.h"
>  #include "migration.h"
>  #include "balloon.h"
>  #include "qemu-kvm.h"
> @@ -215,6 +216,9 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
>  int win2k_install_hack = 0;
>  #endif
>  int usb_enabled = 0;
> +int device_assignment_enabled = 0;
> +const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
> +int assigned_devices_index;
>  static VLANState *first_vlan;
>  int smp_cpus = 1;
>  const char *vnc_display;
> @@ -8692,6 +8696,12 @@ static void help(int exitcode)
>  #endif
>  	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
>  	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
> +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
> +           "-pcidevice host=bus:dev.func[,dma=none][,name=\"string\"]\n"

name="string" isn't correct, is it? e.g. this won't work

  -pcidevice host=04:08.0,name="Foo Bar"

but this will:

  -pcidevice "host=04:08.0,name=Foo Bar"

...

Cheers,
Mark.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 15:53             ` Muli Ben-Yehuda
@ 2008-10-29  7:56               ` Zhang, Xiantao
  2008-10-29 10:27                 ` Muli Ben-Yehuda
  2008-10-29  8:22               ` Han, Weidong
  2008-10-29 10:25               ` Muli Ben-Yehuda
  2 siblings, 1 reply; 31+ messages in thread
From: Zhang, Xiantao @ 2008-10-29  7:56 UTC (permalink / raw)
  To: Muli Ben-Yehuda, Anthony Liguori
  Cc: avi@redhat.com, kvm@vger.kernel.org, Han, Weidong,
	Ben-Ami Yassour1, amit.shah@redhat.com, Kay, Allen M

Muli Ben-Yehuda wrote:
> On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:
> 
>>> +ifeq ($(USE_KVM), 1)
>>> +OBJS+= device-assignment.o
>>> +endif
>> 
>> I don't think you want to build this on PPC so I think you need a
>> stronger check.
> 
> Good point. How about checking TARGET_BASE_ARCH = i386?

It should work for ia64, please include ia64 when you do it in this way. 
Xiantao

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 15:53             ` Muli Ben-Yehuda
  2008-10-29  7:56               ` Zhang, Xiantao
@ 2008-10-29  8:22               ` Han, Weidong
  2008-10-29 10:25               ` Muli Ben-Yehuda
  2 siblings, 0 replies; 31+ messages in thread
From: Han, Weidong @ 2008-10-29  8:22 UTC (permalink / raw)
  To: 'Muli Ben-Yehuda', 'Anthony Liguori'
  Cc: 'avi@redhat.com', 'kvm@vger.kernel.org',
	'Ben-Ami Yassour1', 'amit.shah@redhat.com',
	Kay, Allen M

Muli Ben-Yehuda wrote:
> On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:
> 
>>> +ifeq ($(USE_KVM), 1)
>>> +OBJS+= device-assignment.o
>>> +endif
>> 
>> I don't think you want to build this on PPC so I think you need a
>> stronger check.
> 
> Good point. How about checking TARGET_BASE_ARCH = i386?
> 
>>> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
>>> +                       uint32_t value)
>>> +{
>>> +    AssignedDevRegion *r_access = opaque;
>>> +    uint32_t r_pio = guest_to_host_ioport(r_access, addr); +
>>> +    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx
>>> value=%08x\n", +	  r_pio, (int)r_access->e_physbase,
>>> +          (unsigned long)r_access->r_virtbase, value);
>>> 
>> 
>> The format doesn't match the parameter count.
> 
> Yep, already fixed.
> 
>>> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int
>>> region_num, +                                    uint32_t addr,
>>> uint32_t size, int type) +{
>>> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>>> +    uint32_t old_port = region->u.r_baseport;
>>> +    uint32_t old_num = region->e_size;
>>> +    int first_map = (old_num == 0);
>>> +    struct ioperm_data data;
>>> +    int i;
>>> +
>>> +    region->e_physbase = addr;
>>> +    region->e_size = size;
>>> +
>>> +    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d
>>> region_num=%d \n", +          addr, region->u.r_baseport, type,
>>> size, region_num); + +    memset(&data, 0, sizeof(data));
>>> +
>>> +    if (!first_map) {
>>> +	data.start_port = old_port;
>>> +	data.num = old_num; +	data.turn_on = 0;
>>> +
>>> +	for (i = 0; i < smp_cpus; ++i)
>>> +	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>>> 
>> 
>> How does this interact with VCPU hot-plug?
> 
> I have no idea. Weidong?

maybe we need to keep an assigned io port range list, when VCPU hot-plug in, do ioperm for those io ports on new VCPU. Has VCPU hot-plug been already supported? 

Regards,
Weidong


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 10:22       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
@ 2008-10-29 10:22         ` muli
  0 siblings, 0 replies; 31+ messages in thread
From: muli @ 2008-10-29 10:22 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Muli Ben-Yehuda <muli@il.ibm.com>

This patch has been contributed to by the following people:

Or Sagi <ors@tutis.com>
Nir Peleg <nir@tutis.com>
Amit Shah <amit.shah@redhat.com>
Ben-Ami Yassour <benami@il.ibm.com>
Weidong Han <weidong.han@intel.com>
Glauber de Oliveira Costa <gcosta@redhat.com>
Muli Ben-Yehuda <muli@il.ibm.com>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

        -pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

[muli: lots of fixes addressing review comments]

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/Makefile.target        |    3 +
 qemu/configure              |   21 ++
 qemu/hw/device-assignment.c |  618 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |  108 ++++++++
 qemu/hw/pc.c                |   18 ++
 qemu/hw/pci.c               |    8 +
 qemu/qemu-kvm.c             |   13 +
 qemu/qemu-kvm.h             |    8 +
 qemu/vl.c                   |   28 ++
 9 files changed, 825 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..64d4e44 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM_DEVICE_ASSIGNMENT), 1)
+OBJS+= device-assignment.o
+endif
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/configure b/qemu/configure
index 922a156..618dbce 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -101,6 +101,7 @@ linux="no"
 kqemu="no"
 kvm="no"
 kvm_cap_pit="no"
+kvm_cap_device_assignment="no"
 profiler="no"
 kernel_path=""
 cocoa="no"
@@ -749,6 +750,9 @@ fi
 # KVM probe
 
 if test "$kvm" = "yes" ; then
+
+# test for KVM_CAP_PIT
+
 cat > $TMPC <<EOF
 #include <libkvm.h>
 #ifndef KVM_CAP_PIT
@@ -759,6 +763,19 @@ EOF
     if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
 	kvm_cap_pit="yes"
     fi
+
+# test for KVM_CAP_DEVICE_ASSIGNMENT
+
+cat > $TMPC <<EOF
+#include <libkvm.h>
+#ifndef KVM_CAP_DEVICE_ASSIGNMENT
+#error "kvm no device assignment capability"
+#endif
+int main(void) { return 0; }
+EOF
+    if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+	kvm_cap_device_assignment="yes"
+    fi
 fi
 
 ##########################################
@@ -1515,6 +1532,10 @@ configure_kvm() {
 	echo "USE_KVM_PIT=1" >> $config_mak
 	echo "#define USE_KVM_PIT 1" >> $config_h
     fi
+    if test $kvm_cap_device_assignment = "yes" ; then
+	echo "USE_KVM_DEVICE_ASSIGNMENT=1" >> $config_mak
+	echo "#define USE_KVM_DEVICE_ASSIGNMENT 1" >> $config_h
+    fi
     disable_cpu_emulation
   fi
 }
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..1322e72
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
+    } while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+{
+    return region->u.r_baseport + (addr - region->e_physbase);
+}
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outw(value, r_pio);
+}
+
+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    outl(value, r_pio);
+}
+
+static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inb(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inw(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inl(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_ephys = region->e_physbase;
+    uint32_t old_esize = region->e_size;
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+          e_phys, region->u.r_virtbase, type, e_size, region_num);
+
+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    if (!first_map)
+	kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+    if (e_size > 0)
+	ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->u.r_virtbase, e_size, 0);
+    if (ret != 0) {
+	fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+	exit(1);
+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_port = region->u.r_baseport;
+    uint32_t old_num = region->e_size;
+    int first_map = (old_num == 0);
+    struct ioperm_data data;
+    int i;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+          addr, region->u.r_baseport, type, size, region_num);
+
+    memset(&data, 0, sizeof(data));
+
+    if (!first_map) {
+	data.start_port = old_port;
+	data.num = old_num; 
+	data.turn_on = 0;
+
+	for (i = 0; i < smp_cpus; ++i)
+	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+    }
+
+    data.start_port = region->u.r_baseport;
+    data.num = size;
+    data.turn_on = 1;
+ 
+    for (i = 0; i < smp_cpus; ++i)
+	kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+ 
+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd;
+    ssize_t ret;
+
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+
+    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd;
+    ssize_t ret;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+do_log:
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+    /* kill the special capabilities */
+    if (address == 4 && len == 4)
+        val &= ~0x100000;
+    else if (address == 6)
+        val &= ~0x10;
+
+    return val;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+	     r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+
+    for (r = 0; r < MAX_IO_REGIONS; r++) {
+	if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+	    break; 
+
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;       /* probably ROM */
+            rp->resource_fd = fd;
+        } else
+            flags &= ~IORESOURCE_PREFETCH;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+    fclose(f);
+
+    dev->region_number = r;
+    return 0;
+}
+
+int nr_assigned_devices;
+static LIST_HEAD(, AssignedDevInfo) adev_head;
+
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+    return (uint32_t)bus << 8 | (uint32_t)devfn;
+}
+
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+    int irq, r;
+    AssignedDevice *assigned_dev;
+    AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next) {
+        assigned_dev = adev->assigned_dev;
+        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+        irq = piix_get_irq(irq);
+
+        if (irq != assigned_dev->girq) {
+            struct kvm_assigned_irq assigned_irq_data;
+
+            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
+            assigned_irq_data.assigned_dev_id  =
+                calc_assigned_dev_id(assigned_dev->h_busnr,
+                                     (uint8_t) assigned_dev->h_devfn);
+            assigned_irq_data.guest_irq = irq;
+            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
+            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
+            if (r < 0) {
+                perror("assigned_dev_update_irq");
+                fprintf(stderr, "Are you assigning a device "
+                        "that shares IRQ with some other device?\n");
+                pci_unregister_device(&assigned_dev->dev);
+                /* FIXME: Delete node from list */
+                continue;
+            }
+            assigned_dev->girq = irq;
+        }
+    }
+}
+
+struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
+{
+    int r;
+    AssignedDevice *dev;
+    uint8_t e_device, e_intx;
+    struct kvm_assigned_pci_dev assigned_dev_data;
+
+    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
+          adev->name, e_devfn);
+
+    dev = (AssignedDevice *)
+        pci_register_device(bus, adev->name, sizeof(AssignedDevice),
+                            -1, assigned_dev_pci_read_config,
+                            assigned_dev_pci_write_config);
+    if (NULL == dev) {
+        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                __func__, adev->name);
+        return NULL;
+    }
+
+    if (get_real_device(dev, adev->bus, adev->dev, adev->func)) {
+        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                __func__, adev->name);
+        goto out;
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(dev->real_device.regions,
+                                      dev->real_device.region_number,
+                                      dev))
+        goto out;
+
+    /* handle interrupt routing */
+    e_device = (dev->dev.devfn >> 3) & 0x1f;
+    e_intx = dev->dev.config[0x3d] - 1;
+    dev->intpin = e_intx;
+    dev->run = 0;
+    dev->girq = 0;
+    dev->h_busnr = adev->bus;
+    dev->h_devfn = PCI_DEVFN(adev->dev, adev->func);
+
+    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+    assigned_dev_data.assigned_dev_id  =
+	calc_assigned_dev_id(dev->h_busnr, (uint32_t)dev->h_devfn);
+    assigned_dev_data.busnr = dev->h_busnr;
+    assigned_dev_data.devfn = dev->h_devfn;
+
+#ifdef KVM_CAP_IOMMU
+    /* We always enable the IOMMU if present
+     * (or when not disabled on the command line)
+     */
+    r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+    if (r && !adev->disable_iommu)
+	assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+      
+    r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
+    if (r < 0) {
+	fprintf(stderr, "Could not notify kernel about "
+                "assigned device \"%s\"\n", adev->name);
+	perror("register_real_device");
+	goto out;
+    }
+
+    adev->assigned_dev = dev;
+  out:
+    return &dev->dev;
+}
+
+int init_all_assigned_devices(PCIBus *bus)
+{
+    struct AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next)
+        if (init_assigned_device(adev, bus) == NULL)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice dev=bus:dev.func,dma=dma
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
+ */
+AssignedDevInfo *add_assigned_device(const char *arg)
+{
+    char *cp, *cp1;
+    char device[8];
+    char dma[6];
+    int r;
+    AssignedDevInfo *adev;
+
+    adev = qemu_mallocz(sizeof(AssignedDevInfo));
+    if (adev == NULL) {
+        fprintf(stderr, "%s: Out of memory\n", __func__);
+        return NULL;
+    }
+    r = get_param_value(device, sizeof(device), "host", arg);
+    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
+    if (!r)
+	snprintf(adev->name, sizeof(adev->name), "%s", device);
+
+#ifdef KVM_CAP_IOMMU
+    r = get_param_value(dma, sizeof(dma), "dma", arg);
+    if (r && !strncmp(dma, "none", 4))
+        adev->disable_iommu = 1;
+#endif
+    cp = device;
+    adev->bus = strtoul(cp, &cp1, 16);
+    if (*cp1 != ':')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->dev = strtoul(cp, &cp1, 16);
+    if (*cp1 != '.')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->func = strtoul(cp, &cp1, 16);
+
+    nr_assigned_devices++;
+    LIST_INSERT_HEAD(&adev_head, adev, next);
+    return adev;
+bad:
+    fprintf(stderr, "pcidevice argument parse error; "
+            "please check the help text for usage\n");
+    qemu_free(adev);
+    return NULL;
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..a770bf3
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "sys-queue.h"
+#include "pci.h"
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+#define MAX_IO_REGIONS (6)
+
+typedef struct {
+    int type;           /* Memory or port I/O */
+    int valid;
+    uint32_t base_addr;
+    uint32_t size;    /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+typedef struct {
+    uint8_t bus, dev, func; /* Bus inside domain, device and function */
+    int irq;                /* IRQ number */
+    uint16_t region_number; /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[MAX_IO_REGIONS];
+    int config_fd;
+} PCIDevRegions;
+
+typedef struct {
+    target_phys_addr_t e_physbase;
+    uint32_t memory_index;
+    union {
+        void *r_virtbase;    /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    int num;            /* our index within v_addrs[] */
+    uint32_t e_size;    /* emulated size of region in bytes */
+    uint32_t r_size;    /* real size of region in bytes */
+} AssignedDevRegion;
+
+typedef struct {
+    PCIDevice dev;
+    int intpin;
+    uint8_t debug_flags;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
+    PCIDevRegions real_device;
+    int run;
+    int girq;
+    unsigned char h_busnr;
+    unsigned int h_devfn;
+    int bound;
+} AssignedDevice;
+
+typedef struct AssignedDevInfo AssignedDevInfo;
+
+struct AssignedDevInfo {
+    char name[15];
+    int bus;
+    int dev;
+    int func;
+    AssignedDevice *assigned_dev;
+    LIST_ENTRY(AssignedDevInfo) next;
+    int disable_iommu;
+};
+
+PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
+AssignedDevInfo *add_assigned_device(const char *arg);
+void assigned_dev_set_vector(int irq, int vector);
+void assigned_dev_ack_mirq(int vector);
+int init_all_assigned_devices(PCIBus *bus);
+
+#define MAX_DEV_ASSIGN_CMDLINE 8
+
+extern int device_assignment_enabled;
+extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+extern int assigned_devices_index;
+
+#endif              /* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index d559f0c..e5a4722 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -33,6 +33,7 @@
 #include "boards.h"
 #include "console.h"
 #include "fw_cfg.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -1157,6 +1158,23 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
 
     if (pci_enabled)
         virtio_balloon_init(pci_bus);
+
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && device_assignment_enabled) {
+	int i;
+        for (i = 0; i < assigned_devices_index; i++) {
+            if (add_assigned_device(assigned_devices[i]) < 0) {
+                fprintf(stderr, "Warning: could not add assigned device %s\n",
+                        assigned_devices[i]);
+            }
+        }
+
+	if (init_all_assigned_devices(pci_bus)) {
+	    fprintf(stderr, "Failed to initialize assigned devices\n");
+	    exit (1);
+	}
+    }
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
 }
 
 static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index c82cd20..75bc9a9 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,13 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+        address >= PIIX_CONFIG_IRQ_ROUTE &&
+	address < PIIX_CONFIG_IRQ_ROUTE + 4)
+        assigned_dev_update_irq(d);
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index c5f3f29..3b4f279 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -27,6 +27,7 @@ int kvm_pit = 1;
 #include <sys/utsname.h>
 #include <sys/syscall.h>
 #include <sys/mman.h>
+#include <sys/io.h>
 
 #define bool _Bool
 #define false 0
@@ -1047,3 +1048,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 {
     return kvm_unregister_coalesced_mmio(kvm_context, addr, size);
 }
+
+static void kvm_do_ioperm(void *_data)
+{
+    struct ioperm_data *data = _data;
+    ioperm(data->start_port, data->num, data->turn_on);
+}
+
+void kvm_ioperm(CPUState *env, void *data)
+{
+    if (kvm_enabled() && qemu_system_ready)
+	on_vcpu(env, kvm_do_ioperm, data);
+}
diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
index a1d6646..1084cd6 100644
--- a/qemu/qemu-kvm.h
+++ b/qemu/qemu-kvm.h
@@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 
 void qemu_kvm_system_reset_request(void);
 
+void kvm_ioperm(CPUState *env, void *data);
+
 #ifdef TARGET_PPC
 int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data);
 int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
@@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
 extern int kvm_allowed;
 extern kvm_context_t kvm_context;
 
+struct ioperm_data {
+    unsigned long start_port;
+    unsigned long num;
+    int turn_on;
+};
+
 #define kvm_enabled() (kvm_allowed)
 #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
 #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
diff --git a/qemu/vl.c b/qemu/vl.c
index 388e79d..560177b 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -38,6 +38,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -215,6 +216,9 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack = 0;
 #endif
 int usb_enabled = 0;
+int device_assignment_enabled = 0;
+const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+int assigned_devices_index;
 static VLANState *first_vlan;
 int smp_cpus = 1;
 const char *vnc_display;
@@ -8692,6 +8696,12 @@ static void help(int exitcode)
 #endif
 	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
 	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+           "-pcidevice host=bus:dev.func[,dma=none][,name=string]\n"
+           "                expose a PCI device to the guest OS.\n"
+           "                dma=none: don't perform any dma translations (default is to use an iommu)\n"
+           "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
@@ -8811,6 +8821,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8900,6 +8913,9 @@ static const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9411,6 +9427,7 @@ int main(int argc, char **argv)
     parallel_device_index = 0;
 
     usb_devices_index = 0;
+    assigned_devices_index = 0;
 
     nb_net_clients = 0;
     nb_drives = 0;
@@ -9844,6 +9861,17 @@ int main(int argc, char **argv)
 		kvm_pit = 0;
 		break;
 	    }
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+		device_assignment_enabled = 1;
+		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
+                    fprintf(stderr, "Too many assigned devices\n");
+                    exit(1);
+		}
+		assigned_devices[assigned_devices_index] = optarg;
+		assigned_devices_index++;
+                break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 15:53             ` Muli Ben-Yehuda
  2008-10-29  7:56               ` Zhang, Xiantao
  2008-10-29  8:22               ` Han, Weidong
@ 2008-10-29 10:25               ` Muli Ben-Yehuda
  2008-10-29 10:39                 ` Muli Ben-Yehuda
  2 siblings, 1 reply; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 10:25 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: avi, kvm, weidong.han, Ben-Ami Yassour1, amit.shah, allen.m.kay

On Tue, Oct 28, 2008 at 05:53:05PM +0200, Muli Ben-Yehuda wrote:
> On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:
> 
> >> +ifeq ($(USE_KVM), 1)
> >> +OBJS+= device-assignment.o
> >> +endif
> >
> > I don't think you want to build this on PPC so I think you need a
> > stronger check.
> 
> Good point. How about checking TARGET_BASE_ARCH = i386?

Turns out this stanza is already enclosed in exactly that.

> >> +#ifdef KVM_CAP_IOMMU
> >> +        /* We always enable the IOMMU if present
> >> +         * (or when not disabled on the command line)
> >> +         */
> >> +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> >> +        if (r && !disable_iommu)
> >> +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> >> +#endif
> >> +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> >> +        if (r < 0) {
> >> +            fprintf(stderr, "Could not notify kernel about "
> >> +                "assigned device \"%s\"\n", e_dev_name);
> >> +            perror("register_real_device");
> >> +            goto out;
> >> +        }
> >> +    }
> >>   
> >
> > You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined?
> > That means a newer userspace compiled on an older kernel will
> > silently fail if they try to do device assignment.  There's
> > probably no reason to build this file if KVM_CAP_DEVICE_ASSIGNMENT
> > isn't defined (see how the in-kernel PIT gets conditionally build
> > depending on whether that cap is available).
> 
> Ok, I'll take a look at this.

I reworked it per your suggestion so that device assignment is a kvm
only feature for now. I am pretty sure Amit intended for the patches
to support device assignment without kvm too, but getting rid of it
did make things simpler.

cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29  7:56               ` Zhang, Xiantao
@ 2008-10-29 10:27                 ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 10:27 UTC (permalink / raw)
  To: Zhang, Xiantao
  Cc: Anthony Liguori, avi@redhat.com, kvm@vger.kernel.org,
	Han, Weidong, Ben-Ami Yassour1, amit.shah@redhat.com,
	Kay, Allen M

On Wed, Oct 29, 2008 at 03:56:54PM +0800, Zhang, Xiantao wrote:
> Muli Ben-Yehuda wrote:
> > On Tue, Oct 28, 2008 at 10:45:57AM -0500, Anthony Liguori wrote:
> > 
> >>> +ifeq ($(USE_KVM), 1)
> >>> +OBJS+= device-assignment.o
> >>> +endif
> >> 
> >> I don't think you want to build this on PPC so I think you need a
> >> stronger check.
> > 
> > Good point. How about checking TARGET_BASE_ARCH = i386?
> 
> It should work for ia64, please include ia64 when you do it in this
> way. 

Fixing it to work for ia64 will take a few more changes than just
this, and I'd rather not do it now. This patchset has been through a
tremendous amount of churn, and I hope it can finally find its way
into the tree now. Once it's in, adding ia64 support will be a lot
simpler.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-28 16:55           ` Mark McLoughlin
@ 2008-10-29 10:31             ` Muli Ben-Yehuda
  2008-10-29 11:07               ` Mark McLoughlin
  2008-10-29 11:15               ` Mark McLoughlin
  0 siblings, 2 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 10:31 UTC (permalink / raw)
  To: Mark McLoughlin
  Cc: avi, kvm, anthony, weidong.han, Ben-Ami Yassour1, amit.shah,
	allen.m.kay

On Tue, Oct 28, 2008 at 04:55:22PM +0000, Mark McLoughlin wrote:
> On Tue, 2008-10-28 at 12:06 +0200, muli@il.ibm.com wrote:
> ...
> > +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> > +                           uint8_t r_dev, uint8_t r_func)
> > +{
> > +    char dir[128], name[128];
> > +    int fd, r = 0;
> > +    FILE *f;
> > +    unsigned long long start, end, size, flags;
> > +    PCIRegion *rp;
> > +    PCIDevRegions *dev = &pci_dev->real_device;
> > +
> > +    dev->region_number = 0;
> > +
> > +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> > +	     r_bus, r_dev, r_func);
> > +
> > +    snprintf(name, sizeof(name), "%sconfig", dir);
> > +
> > +    fd = open(name, O_RDWR);
> > +    if (fd == -1) {
> > +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> > +        return 1;
> > +    }
> > +    dev->config_fd = fd;
> > +again:
> > +    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> > +    if (r < 0) {
> > +        if (errno == EINTR || errno == EAGAIN)
> > +            goto again;
> > +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> > +    }
> > +
> > +    snprintf(name, sizeof(name), "%sresource", dir);
> > +
> > +    f = fopen(name, "r");
> > +    if (f == NULL) {
> > +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> > +        return 1;
> > +    }
> > +    r = -1;
> > +    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> > +        r++;
> > +        rp = dev->regions + r;
> 
> Could, in theory, overflow dev->regions here. Suggest:
> 
> +    for (r = 0; r < MAX_IO_REGIONS; r++) {
> +        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
> +            break;

Fixed, thanks Mark. I think it also uncovered a buglet where we would
skip the last region with the original code, which should be ok now.

> > +        rp->valid = 0;
> > +        size = end - start + 1;
> > +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> > +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> > +            continue;
> > +        if (flags & IORESOURCE_MEM) {
> > +            flags &= ~IORESOURCE_IO;
> > +	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
> > +            fd = open(name, O_RDWR);
> > +            if (fd == -1)
> > +                continue;       /* probably ROM */
> > +            rp->resource_fd = fd;
> > +        } else
> > +            flags &= ~IORESOURCE_PREFETCH;
> > +
> > +        rp->type = flags;
> > +        rp->valid = 1;
> > +        rp->base_addr = start;
> > +        rp->size = size;
> > +        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> > +              r, rp->size, start, rp->type, rp->resource_fd);
> > +    }
> > +    fclose(f);
> > +
> > +    dev->region_number = r;
> > +    return 0;
> > +}
> > +
> > +static int disable_iommu;
> 
> Why is this global?
> 
> The flag is set per-device on the command-line and only affects whether
> we pass KVM_DEV_ASSIGN_ENABLE_IOMMU to kvm_assign_pci_device()

Made per-device by moving it to struct AssignedDevInfo.

> > +int nr_assigned_devices;
> > +static LIST_HEAD(, AssignedDevInfo) adev_head;
> > +
> > +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> > +{
> > +    return (uint32_t)bus << 8 | (uint32_t)devfn;
> > +}
> > +
> > +static AssignedDevice *register_real_device(PCIBus *e_bus,
> > +                                            const char *e_dev_name,
> > +                                            int e_devfn, uint8_t r_bus,
> > +                                            uint8_t r_dev, uint8_t r_func)
> > +{
> > +    int r;
> > +    AssignedDevice *pci_dev;
> > +    uint8_t e_device, e_intx;
> > +
> > +    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> > +          e_dev_name, e_devfn);
> > +
> > +    pci_dev = (AssignedDevice *)
> > +        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> > +                            e_devfn, assigned_dev_pci_read_config,
> > +                            assigned_dev_pci_write_config);
> > +    if (NULL == pci_dev) {
> > +        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> > +                __func__, e_dev_name);
> > +        return NULL;
> > +    }
> > +    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> > +        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> > +                __func__, e_dev_name);
> > +        goto out;
> > +    }
> > +
> > +    /* handle real device's MMIO/PIO BARs */
> > +    if (assigned_dev_register_regions(pci_dev->real_device.regions,
> > +                                      pci_dev->real_device.region_number,
> > +                                      pci_dev))
> > +        goto out;
> > +
> > +    /* handle interrupt routing */
> > +    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> > +    e_intx = pci_dev->dev.config[0x3d] - 1;
> > +    pci_dev->intpin = e_intx;
> > +    pci_dev->run = 0;
> > +    pci_dev->girq = 0;
> > +    pci_dev->h_busnr = r_bus;
> > +    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> > +
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +    if (kvm_enabled()) {
> > +        struct kvm_assigned_pci_dev assigned_dev_data;
> > +
> > +        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> > +        assigned_dev_data.assigned_dev_id  =
> > +            calc_assigned_dev_id(pci_dev->h_busnr,
> > +                                 (uint32_t)pci_dev->h_devfn);
> > +        assigned_dev_data.busnr = pci_dev->h_busnr;
> > +        assigned_dev_data.devfn = pci_dev->h_devfn;
> > +
> > +#ifdef KVM_CAP_IOMMU
> > +        /* We always enable the IOMMU if present
> > +         * (or when not disabled on the command line)
> > +         */
> > +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> > +        if (r && !disable_iommu)
> > +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> > +#endif
> > +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> > +        if (r < 0) {
> > +            fprintf(stderr, "Could not notify kernel about "
> > +                "assigned device \"%s\"\n", e_dev_name);
> > +            perror("register_real_device");
> > +            goto out;
> > +        }
> > +    }
> > +#endif
> > +    term_printf("Registered host PCI device %02x:%02x.%1x "
> > +		"(\"%s\") as guest device %02x:%02x.%1x\n",
> > +		r_bus, r_dev, r_func, e_dev_name,
> > +		pci_bus_num(e_bus), e_device, r_func);
> > +
> > +    return pci_dev;
> > +out:
> > +/*    pci_unregister_device(&pci_dev->dev); */
> > +    return NULL;
> > +}
> > +
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +/* The pci config space got updated. Check if irq numbers have changed
> > + * for our devices
> > + */
> > +void assigned_dev_update_irq(PCIDevice *d)
> > +{
> > +    int irq, r;
> > +    AssignedDevice *assigned_dev;
> > +    AssignedDevInfo *adev;
> > +
> > +    LIST_FOREACH(adev, &adev_head, next) {
> > +        assigned_dev = adev->assigned_dev;
> > +        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
> > +        irq = piix_get_irq(irq);
> > +
> > +        if (irq != assigned_dev->girq) {
> > +            struct kvm_assigned_irq assigned_irq_data;
> > +
> > +            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
> > +            assigned_irq_data.assigned_dev_id  =
> > +                calc_assigned_dev_id(assigned_dev->h_busnr,
> > +                                     (uint8_t) assigned_dev->h_devfn);
> > +            assigned_irq_data.guest_irq = irq;
> > +            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
> > +            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
> > +            if (r < 0) {
> > +                perror("assigned_dev_update_irq");
> > +                fprintf(stderr, "Are you assigning a device "
> > +                        "that shares IRQ with some other device?\n");
> > +                pci_unregister_device(&assigned_dev->dev);
> > +                /* FIXME: Delete node from list */
> > +                continue;
> > +            }
> > +            assigned_dev->girq = irq;
> > +        }
> > +    }
> > +}
> > +#endif
> > +
> > +struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
> > +{
> > +    adev->assigned_dev = register_real_device(bus,
> > +                                              adev->name, -1,
> > +                                              adev->bus,
> > +                                              adev->dev,
> > +                                              adev->func);
> > +    return &adev->assigned_dev->dev;
> > +}
> 
> This looks unnecessary - register_real_device() isn't used anywhere
> else.
> 
> Why not just move register_real_device() into init_assigned_device()
> ?

Done.

> 
> > +int init_all_assigned_devices(PCIBus *bus)
> > +{
> > +    struct AssignedDevInfo *adev;
> > +
> > +    LIST_FOREACH(adev, &adev_head, next)
> > +        if (init_assigned_device(adev, bus) == NULL)
> > +            return -1;
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * Syntax to assign device:
> > + *
> > + * -pcidevice dev=bus:dev.func,dma=dma
>                  ^^^                  ^^^
> 
> Should be:
> 
>   -pcidevice host=bus:dev.func[,dma=none][,name=string]

Fixed (as well as the other -pcidevice bogosities you pointed).

> 
> > + *
> > + * Example:
> > + * -pcidevice host=00:13.0,dma=pvdma
>                                   ^^^^^
> 
> Should be:
> 
>   -pcidevice host=00:13.0,dma=none,name=Foo
>   
> > + *
> > + * dma can currently only be 'none' to disable iommu support.
> > + */
> > +AssignedDevInfo *add_assigned_device(const char *arg)
> > +{
> > +    char *cp, *cp1;
> > +    char device[8];
> > +    char dma[6];
> > +    int r;
> > +    AssignedDevInfo *adev;
> > +
> > +    adev = qemu_mallocz(sizeof(AssignedDevInfo));
> > +    if (adev == NULL) {
> > +        fprintf(stderr, "%s: Out of memory\n", __func__);
> > +        return NULL;
> > +    }
> > +    r = get_param_value(device, sizeof(device), "host", arg);
> > +    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
> > +    if (!r)
> > +	snprintf(adev->name, sizeof(adev->name), "%s", device);
> > +
> > +#ifdef KVM_CAP_IOMMU
> > +    r = get_param_value(dma, sizeof(dma), "dma", arg);
> > +    if (r && !strncmp(dma, "none", 4))
> > +        disable_iommu = 1;
> > +#endif
> > +    cp = device;
> > +    adev->bus = strtoul(cp, &cp1, 16);
> > +    if (*cp1 != ':')
> > +        goto bad;
> > +    cp = cp1 + 1;
> > +
> > +    adev->dev = strtoul(cp, &cp1, 16);
> > +    if (*cp1 != '.')
> > +        goto bad;
> > +    cp = cp1 + 1;
> > +
> > +    adev->func = strtoul(cp, &cp1, 16);
> > +
> > +    nr_assigned_devices++;
> 
> nr_assigned_devices isn't actually used anywhere.

Nuked.

> 
> > +    LIST_INSERT_HEAD(&adev_head, adev, next);
> > +    return adev;
> > +bad:
> > +    fprintf(stderr, "pcidevice argument parse error; "
> > +            "please check the help text for usage\n");
> > +    qemu_free(adev);
> > +    return NULL;
> > +}
> > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
> > new file mode 100644
> > index 0000000..ebc0b50
> > --- /dev/null
> > +++ b/qemu/hw/device-assignment.h
> > @@ -0,0 +1,117 @@
> > +/*
> > + * Copyright (c) 2007, Neocleus Corporation.
> > + * Copyright (c) 2007, Intel Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along with
> > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> > + * Place - Suite 330, Boston, MA 02111-1307 USA.
> > + *
> > + *  Data structures for storing PCI state
> > + *
> > + *  Adapted to kvm by Qumranet
> > + *
> > + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> > + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> > + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> > + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> > + */
> > +
> > +#ifndef __DEVICE_ASSIGNMENT_H__
> > +#define __DEVICE_ASSIGNMENT_H__
> > +
> > +#include <sys/mman.h>
> > +#include "qemu-common.h"
> > +#include "sys-queue.h"
> > +#include "pci.h"
> > +
> > +/* From include/linux/pci.h in the kernel sources */
> > +#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> > +
> > +#define MAX_IO_REGIONS (6)
> 
> Perhaps a comment to say this is the number of BARs in the config space
> header?

Sure, comments are cheap.

> 
> > +typedef struct {
> > +    int type;           /* Memory or port I/O */
> > +    int valid;
> > +    uint32_t base_addr;
> > +    uint32_t size;    /* size of the region */
> > +    int resource_fd;
> > +} PCIRegion;
> > +
> > +typedef struct {
> > +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
> > +    int irq;                /* IRQ number */
> > +    uint16_t region_number; /* number of active regions */
> > +
> > +    /* Port I/O or MMIO Regions */
> > +    PCIRegion regions[MAX_IO_REGIONS];
> > +    int config_fd;
> > +} PCIDevRegions;
> > +
> > +typedef struct {
> > +    target_phys_addr_t e_physbase;
> > +    uint32_t memory_index;
> > +    union {
> > +        void *r_virtbase;    /* mmapped access address for memory regions */
> > +        uint32_t r_baseport; /* the base guest port for I/O regions */
> > +    } u;
> > +    int num;            /* our index within v_addrs[] */
> > +    uint32_t e_size;    /* emulated size of region in bytes */
> > +    uint32_t r_size;    /* real size of region in bytes */
> > +} AssignedDevRegion;
> > +
> > +typedef struct {
> > +    PCIDevice dev;
> > +    int intpin;
> > +    uint8_t debug_flags;
> > +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
> > +    PCIDevRegions real_device;
> > +    int run;
> > +    int girq;
> > +    unsigned char h_busnr;
> > +    unsigned int h_devfn;
> > +    int bound;
> > +} AssignedDevice;
> > +
> > +typedef struct AssignedDevInfo AssignedDevInfo;
> > +
> > +struct AssignedDevInfo {
> > +    char name[15];
> > +    int bus;
> > +    int dev;
> > +    int func;
> > +    AssignedDevice *assigned_dev;
> > +    LIST_ENTRY(AssignedDevInfo) next;
> > +};
> > +
> > +PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
> > +AssignedDevInfo *add_assigned_device(const char *arg);
> > +void assigned_dev_set_vector(int irq, int vector);
> > +void assigned_dev_ack_mirq(int vector);
> > +
> > +
> > +#ifdef USE_KVM
> > +int init_all_assigned_devices(PCIBus *bus);
> > +#else /* not using kvm */
> > +static inline int init_all_assigned_devices(PCIBus *bus)
> > +{
> > +    return 0;
> > +}
> > +#endif /* !USE_KVM */
> > +
> > +
> > +#define MAX_DEV_ASSIGN_CMDLINE 8
> > +
> > +extern int device_assignment_enabled;
> 
> > +extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
> > +extern int assigned_devices_index;
> 
> Neither of these two are implemented anywhere.

Actually they are, we use them to pass the arguments from main to
pc.c.

> 
> > +#endif              /* __DEVICE_ASSIGNMENT_H__ */
> > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> > index d559f0c..5fdb726 100644
> > --- a/qemu/hw/pc.c
> > +++ b/qemu/hw/pc.c
> > @@ -33,6 +33,7 @@
> >  #include "boards.h"
> >  #include "console.h"
> >  #include "fw_cfg.h"
> > +#include "device-assignment.h"
> >  
> >  #include "qemu-kvm.h"
> >  
> > @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
> >  
> >      if (pci_enabled)
> >          virtio_balloon_init(pci_bus);
> > +
> > +    if (kvm_enabled() && device_assignment_enabled) {
> 
> The device_assignment_enabled flag looks like it shouldn't be needed.
> 
> If assigned_devices_index remains zero, nothing should happen
> anyway.

Nuked.

> 
> > +	int i;
> > +        for (i = 0; i < assigned_devices_index; i++) {
> > +            if (add_assigned_device(assigned_devices[i]) < 0) {
> > +                fprintf(stderr, "Warning: could not add assigned device %s\n",
> > +                        assigned_devices[i]);
> > +            }
> > +        }
> > +
> > +	if (init_all_assigned_devices(pci_bus)) {
> > +	    fprintf(stderr, "Failed to initialize assigned devices\n");
> > +	    exit (1);
> > +	}
> > +    }
> >  }
> >  
> >  static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
> > diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
> > index c82cd20..f86a8a7 100644
> > --- a/qemu/hw/pci.c
> > +++ b/qemu/hw/pci.c
> > @@ -50,6 +50,7 @@ struct PCIBus {
> >  
> >  static void pci_update_mappings(PCIDevice *d);
> >  static void pci_set_irq(void *opaque, int irq_num, int level);
> > +void assigned_dev_update_irq(PCIDevice *d);
> >  
> >  target_phys_addr_t pci_mem_base;
> >  static int pci_irq_index;
> > @@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
> >          val >>= 8;
> >      }
> >  
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
> > +        address >= 0x60 && address <= 0x63)
> > +        assigned_dev_update_irq(d);
> > +#endif
> 
> Outside of the context of piix_pci.c, it's difficult to figure out what
> the 0x60-0x63 register range relates to - i.e. you need to know to go
> digging in the PIIX spec.
> 
> How about something like in qemu/hw/pc.h:
> 
>   +/* config space register for IRQ routing */
>   +#define PIIX_CONFIG_IRQ_ROUTE 0x60
> 
> then:
> 
>      if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
>          address >= PIIX_CONFIG_IRQ_ROUTE &&
>          address < PIIX_CONFIG_IRQ_ROUTE + 4)

Ok. #define's are cheap too.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 10:25               ` Muli Ben-Yehuda
@ 2008-10-29 10:39                 ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 10:39 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: avi, kvm, weidong.han, Ben-Ami Yassour1, amit.shah, allen.m.kay

On Wed, Oct 29, 2008 at 12:25:50PM +0200, Muli Ben-Yehuda wrote:

> > > You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined?
> > > That means a newer userspace compiled on an older kernel will
> > > silently fail if they try to do device assignment.  There's
> > > probably no reason to build this file if
> > > KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the in-kernel
> > > PIT gets conditionally build depending on whether that cap is
> > > available).
> > 
> > Ok, I'll take a look at this.
> 
> I reworked it per your suggestion so that device assignment is a kvm
> only feature for now. I am pretty sure Amit intended for the patches
> to support device assignment without kvm too, but getting rid of it
> did make things simpler.

By the way, one thing I ran into here is that we check the PIT and
DEVICE_ASSIGNMENT capabilities based on the kernel headers under
kernel/ first, and sync the kernel headers from the patched kernel
tree into kernel/ second.  In other words, if you configure userspace
with a patched kernel tree and just edit include/linux/kvm.h to have
or not have the capability, the build will use the old copy of the
header first, and only pick up the new copy later after it resynch's
the header. Seems rather counter-intuitive.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 10:31             ` Muli Ben-Yehuda
@ 2008-10-29 11:07               ` Mark McLoughlin
  2008-10-29 11:15               ` Mark McLoughlin
  1 sibling, 0 replies; 31+ messages in thread
From: Mark McLoughlin @ 2008-10-29 11:07 UTC (permalink / raw)
  To: Muli Ben-Yehuda
  Cc: avi, kvm, anthony, weidong.han, Ben-Ami Yassour1, amit.shah,
	allen.m.kay

On Wed, 2008-10-29 at 12:31 +0200, Muli Ben-Yehuda wrote:
> On Tue, Oct 28, 2008 at 04:55:22PM +0000, Mark McLoughlin wrote:
> > On Tue, 2008-10-28 at 12:06 +0200, muli@il.ibm.com wrote:

> > > +void assigned_dev_set_vector(int irq, int vector);
> > > +void assigned_dev_ack_mirq(int vector);
...
> > > +extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
> > > +extern int assigned_devices_index;
> > 
> > Neither of these two are implemented anywhere.
> 
> Actually they are, we use them to pass the arguments from main to
> pc.c.

Heh, sorry ... I added the comment in the wrong place :-)

assigned_dev_{set_vector,ack_mirq} aren't implemented anywhere

Cheers,
Mark.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 10:31             ` Muli Ben-Yehuda
  2008-10-29 11:07               ` Mark McLoughlin
@ 2008-10-29 11:15               ` Mark McLoughlin
  2008-10-29 11:47                 ` Muli Ben-Yehuda
  1 sibling, 1 reply; 31+ messages in thread
From: Mark McLoughlin @ 2008-10-29 11:15 UTC (permalink / raw)
  To: Muli Ben-Yehuda
  Cc: avi, kvm, anthony, weidong.han, Ben-Ami Yassour1, amit.shah,
	allen.m.kay

On Wed, 2008-10-29 at 12:31 +0200, Muli Ben-Yehuda wrote:
> On Tue, Oct 28, 2008 at 04:55:22PM +0000, Mark McLoughlin wrote:

> > nr_assigned_devices isn't actually used anywhere.
> 
> Nuked.

Still there.

> > > +#define MAX_IO_REGIONS (6)
> > 
> > Perhaps a comment to say this is the number of BARs in the config space
> > header?
> 
> Sure, comments are cheap.

You didn't add one though :-)

> > > +
> > > +    if (kvm_enabled() && device_assignment_enabled) {
> > 
> > The device_assignment_enabled flag looks like it shouldn't be needed.
> > 
> > If assigned_devices_index remains zero, nothing should happen
> > anyway.
> 
> Nuked.

Still there.

Cheers,
Mark.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 11:15               ` Mark McLoughlin
@ 2008-10-29 11:47                 ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 11:47 UTC (permalink / raw)
  To: Mark McLoughlin
  Cc: avi, kvm, anthony, weidong.han, Ben-Ami Yassour1, amit.shah,
	allen.m.kay

On Wed, Oct 29, 2008 at 11:15:10AM +0000, Mark McLoughlin wrote:
> On Wed, 2008-10-29 at 12:31 +0200, Muli Ben-Yehuda wrote:
> > On Tue, Oct 28, 2008 at 04:55:22PM +0000, Mark McLoughlin wrote:
> 
> > > nr_assigned_devices isn't actually used anywhere.
> > 
> > Nuked.
> 
> Still there.
> 
> > > > +#define MAX_IO_REGIONS (6)
> > > 
> > > Perhaps a comment to say this is the number of BARs in the config space
> > > header?
> > 
> > Sure, comments are cheap.
> 
> You didn't add one though :-)
> 
> > > > +
> > > > +    if (kvm_enabled() && device_assignment_enabled) {
> > > 
> > > The device_assignment_enabled flag looks like it shouldn't be needed.
> > > 
> > > If assigned_devices_index remains zero, nothing should happen
> > > anyway.
> > 
> > Nuked.
> 
> Still there.

Sorry, I messed up the rebase! v10 coming up in a few minutes.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [v10] Userspace patches for PCI device assignment
@ 2008-10-29 12:19 muli
  2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: muli @ 2008-10-29 12:19 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

This patchset enables device assignment for KVM hosts for PCI
devices. It uses the Intel IOMMU by default if available.

Changes from v9->v10:

- really emove nr_assigned_devices and device_assignment_enabled and
  other cleanups

Changes from v8->v9 in no particular order:

- fixed DEBUG, removed uneeded headers
- introduce USE_KVM_DEVICE_ASSIGNMENT modeled after KVM_USE_PIT, per
  aliguori's suggestion
- only call term_printf from the monitor
- implement markmc's suggestions: don't overflow dev->regions, fix
  disable_iommu bogosity, fix usage and comment with regards to
  -pcidevice format, and some other bits

Changes from v7->v8 in in particular order:

- various formatting fixes, DEBUG cleanups, cast removals, etc.
- s/strncpy/snprintf/
- split initialization in two phases per aliguori's suggestion
- bail out on errors when we can't limp on
- do ioperm on every cpu and vcpu (Weidong Han)
- use pwrite/pread where applicable
- split r_virtbase into different fields for memory and IO
- fix destruction of MMIO regions (Disheng Su and Weidong Han)

Changes from v6->v7 in no particular order:

- formatting changes: adhere to qemu style
- use strncmp, strncpy etc. instead of the insecure ones
- move from array to linked list
- change iopl() to ioperm() (Weidong Han)
- other small changes as suggested during the review of v6.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 1/6] device assignment: add ioctl wrappers
  2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
@ 2008-10-29 12:19 ` muli
  2008-10-29 12:19   ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
  2008-10-30 11:29 ` [v10] Userspace patches for PCI device assignment Avi Kivity
  2008-10-30 12:44 ` Anthony Liguori
  2 siblings, 1 reply; 31+ messages in thread
From: muli @ 2008-10-29 12:19 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Amit Shah <amit.shah@redhat.com>

[muli: return -errno instead of ioctl retval]

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 libkvm/libkvm.c |   25 +++++++++++++++++++++++++
 libkvm/libkvm.h |   27 +++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index 444b97f..e7dba8a 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -1112,3 +1112,28 @@ int kvm_unregister_coalesced_mmio(kvm_context_t kvm, uint64_t addr, uint32_t siz
 	return -ENOSYS;
 }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+int kvm_assign_pci_device(kvm_context_t kvm,
+			  struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int ret;
+
+	ret = ioctl(kvm->vm_fd, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
+	if (ret < 0)
+		return -errno;
+
+	return ret;
+}
+
+int kvm_assign_irq(kvm_context_t kvm,
+		   struct kvm_assigned_irq *assigned_irq)
+{
+	int ret;
+
+	ret = ioctl(kvm->vm_fd, KVM_ASSIGN_IRQ, assigned_irq);
+	if (ret < 0)
+		return -errno;
+
+	return ret;
+}
+#endif
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index 423ce31..53d67f2 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -686,4 +686,31 @@ int kvm_s390_interrupt(kvm_context_t kvm, int slot,
 int kvm_s390_set_initial_psw(kvm_context_t kvm, int slot, psw_t psw);
 int kvm_s390_store_status(kvm_context_t kvm, int slot, unsigned long addr);
 #endif
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/*!
+ * \brief Notifies host kernel about a PCI device to be assigned to a guest
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the physical PCI device to a guest.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_dev Parameters, like bus, devfn number, etc
+ */
+int kvm_assign_pci_device(kvm_context_t kvm,
+			  struct kvm_assigned_pci_dev *assigned_dev);
+
+/*!
+ * \brief Notifies host kernel about changes to IRQ for an assigned device
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the changes in IRQ number for an assigned physical
+ * PCI device.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_irq Parameters, like dev id, host irq, guest irq, etc
+ */
+int kvm_assign_irq(kvm_context_t kvm,
+		   struct kvm_assigned_irq *assigned_irq);
+#endif
 #endif
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number
  2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
@ 2008-10-29 12:19   ` muli
  2008-10-29 12:19     ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
  0 siblings, 1 reply; 31+ messages in thread
From: muli @ 2008-10-29 12:19 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Amit Shah <amit.shah@redhat.com>

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/hw/pci.c |    5 +++++
 qemu/hw/pci.h |    1 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 512dbea..c82cd20 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -560,6 +560,11 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
 
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+    return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index 60e4094..e11fbbf 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -81,6 +81,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
                             uint32_t size, int type,
                             PCIMapIORegionFunc *map_func);
 
+int pci_map_irq(PCIDevice *pci_dev, int pin);
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len);
 void pci_default_write_config(PCIDevice *d,
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq
  2008-10-29 12:19   ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
@ 2008-10-29 12:19     ` muli
  2008-10-29 12:19       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
  0 siblings, 1 reply; 31+ messages in thread
From: muli @ 2008-10-29 12:19 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Amit Shah <amit.shah@redhat.com>

[muli: use define for PIIX IRQ routing register (thanks markmc)]

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/hw/pc.h       |    5 +++++
 qemu/hw/piix_pci.c |   10 ++++++++++
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
index 1f63678..f3d206a 100644
--- a/qemu/hw/pc.h
+++ b/qemu/hw/pc.h
@@ -105,6 +105,9 @@ void pcspk_init(PITState *);
 int pcspk_audio_init(AudioState *, qemu_irq *pic);
 
 /* piix_pci.c */
+/* config space register for IRQ routing */
+#define PIIX_CONFIG_IRQ_ROUTE 0x60
+
 PCIBus *i440fx_init(PCIDevice **pi440fx_state, qemu_irq *pic);
 void i440fx_set_smm(PCIDevice *d, int val);
 int piix3_init(PCIBus *bus, int devfn);
@@ -112,6 +115,8 @@ void i440fx_init_memory_mappings(PCIDevice *d);
 
 int piix4_init(PCIBus *bus, int devfn);
 
+int piix_get_irq(int pin);
+
 /* vga.c */
 enum vga_retrace_method {
     VGA_RETRACE_DUMB,
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 6fbf47b..b9067b8 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -243,6 +243,16 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
     }
 }
 
+int piix_get_irq(int pin)
+{
+    if (piix3_dev)
+        return piix3_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
+    if (piix4_dev)
+        return piix4_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
+
+    return 0;
+}
+
 static void piix3_reset(PCIDevice *d)
 {
     uint8_t *pci_conf = d->config;
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support
  2008-10-29 12:19     ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
@ 2008-10-29 12:19       ` muli
  2008-10-29 12:20         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
  0 siblings, 1 reply; 31+ messages in thread
From: muli @ 2008-10-29 12:19 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Amit Shah <amit.shah@redhat.com>

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 kernel/x86/Kbuild |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/kernel/x86/Kbuild b/kernel/x86/Kbuild
index 2369d00..c4723b1 100644
--- a/kernel/x86/Kbuild
+++ b/kernel/x86/Kbuild
@@ -9,6 +9,9 @@ kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o ../anon_inodes.o irq.o i8259.o
 ifeq ($(EXT_CONFIG_KVM_TRACE),y)
 kvm-objs += kvm_trace.o
 endif
+ifeq ($(CONFIG_DMAR),y)
+kvm-objs += vtd.o
+endif
 kvm-intel-objs := vmx.o vmx-debug.o ../external-module-compat.o
 kvm-amd-objs := svm.o ../external-module-compat.o
 
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 12:19       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
@ 2008-10-29 12:20         ` muli
  2008-10-29 12:20           ` [PATCH 6/6] device assignment: support for hot-plugging PCI devices muli
  2008-10-29 12:27           ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Mark McLoughlin
  0 siblings, 2 replies; 31+ messages in thread
From: muli @ 2008-10-29 12:20 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Muli Ben-Yehuda <muli@il.ibm.com>

This patch has been contributed to by the following people:

Or Sagi <ors@tutis.com>
Nir Peleg <nir@tutis.com>
Amit Shah <amit.shah@redhat.com>
Ben-Ami Yassour <benami@il.ibm.com>
Weidong Han <weidong.han@intel.com>
Glauber de Oliveira Costa <gcosta@redhat.com>
Muli Ben-Yehuda <muli@il.ibm.com>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

        -pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/Makefile.target        |    3 +
 qemu/configure              |   21 ++
 qemu/hw/device-assignment.c |  616 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |  106 ++++++++
 qemu/hw/pc.c                |   18 ++
 qemu/hw/pci.c               |    8 +
 qemu/hw/piix_pci.c          |    4 +-
 qemu/qemu-kvm.c             |   13 +
 qemu/qemu-kvm.h             |    8 +
 qemu/vl.c                   |   26 ++
 10 files changed, 821 insertions(+), 2 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..64d4e44 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM_DEVICE_ASSIGNMENT), 1)
+OBJS+= device-assignment.o
+endif
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/configure b/qemu/configure
index 922a156..618dbce 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -101,6 +101,7 @@ linux="no"
 kqemu="no"
 kvm="no"
 kvm_cap_pit="no"
+kvm_cap_device_assignment="no"
 profiler="no"
 kernel_path=""
 cocoa="no"
@@ -749,6 +750,9 @@ fi
 # KVM probe
 
 if test "$kvm" = "yes" ; then
+
+# test for KVM_CAP_PIT
+
 cat > $TMPC <<EOF
 #include <libkvm.h>
 #ifndef KVM_CAP_PIT
@@ -759,6 +763,19 @@ EOF
     if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
 	kvm_cap_pit="yes"
     fi
+
+# test for KVM_CAP_DEVICE_ASSIGNMENT
+
+cat > $TMPC <<EOF
+#include <libkvm.h>
+#ifndef KVM_CAP_DEVICE_ASSIGNMENT
+#error "kvm no device assignment capability"
+#endif
+int main(void) { return 0; }
+EOF
+    if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+	kvm_cap_device_assignment="yes"
+    fi
 fi
 
 ##########################################
@@ -1515,6 +1532,10 @@ configure_kvm() {
 	echo "USE_KVM_PIT=1" >> $config_mak
 	echo "#define USE_KVM_PIT 1" >> $config_h
     fi
+    if test $kvm_cap_device_assignment = "yes" ; then
+	echo "USE_KVM_DEVICE_ASSIGNMENT=1" >> $config_mak
+	echo "#define USE_KVM_DEVICE_ASSIGNMENT 1" >> $config_h
+    fi
     disable_cpu_emulation
   fi
 }
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..78b7e14
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
+    } while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+{
+    return region->u.r_baseport + (addr - region->e_physbase);
+}
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outw(value, r_pio);
+}
+
+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    outl(value, r_pio);
+}
+
+static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inb(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inw(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inl(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_ephys = region->e_physbase;
+    uint32_t old_esize = region->e_size;
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+          e_phys, region->u.r_virtbase, type, e_size, region_num);
+
+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    if (!first_map)
+	kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+    if (e_size > 0)
+	ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->u.r_virtbase, e_size, 0);
+    if (ret != 0) {
+	fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+	exit(1);
+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_port = region->u.r_baseport;
+    uint32_t old_num = region->e_size;
+    int first_map = (old_num == 0);
+    struct ioperm_data data;
+    int i;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+          addr, region->u.r_baseport, type, size, region_num);
+
+    memset(&data, 0, sizeof(data));
+
+    if (!first_map) {
+	data.start_port = old_port;
+	data.num = old_num; 
+	data.turn_on = 0;
+
+	for (i = 0; i < smp_cpus; ++i)
+	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+    }
+
+    data.start_port = region->u.r_baseport;
+    data.num = size;
+    data.turn_on = 1;
+ 
+    for (i = 0; i < smp_cpus; ++i)
+	kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+ 
+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd;
+    ssize_t ret;
+
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+
+    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd;
+    ssize_t ret;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+do_log:
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+    /* kill the special capabilities */
+    if (address == 4 && len == 4)
+        val &= ~0x100000;
+    else if (address == 6)
+        val &= ~0x10;
+
+    return val;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+	     r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+
+    for (r = 0; r < MAX_IO_REGIONS; r++) {
+	if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+	    break; 
+
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;       /* probably ROM */
+            rp->resource_fd = fd;
+        } else
+            flags &= ~IORESOURCE_PREFETCH;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+    fclose(f);
+
+    dev->region_number = r;
+    return 0;
+}
+
+static LIST_HEAD(, AssignedDevInfo) adev_head;
+
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+    return (uint32_t)bus << 8 | (uint32_t)devfn;
+}
+
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+    int irq, r;
+    AssignedDevice *assigned_dev;
+    AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next) {
+        assigned_dev = adev->assigned_dev;
+        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+        irq = piix_get_irq(irq);
+
+        if (irq != assigned_dev->girq) {
+            struct kvm_assigned_irq assigned_irq_data;
+
+            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
+            assigned_irq_data.assigned_dev_id  =
+                calc_assigned_dev_id(assigned_dev->h_busnr,
+                                     (uint8_t) assigned_dev->h_devfn);
+            assigned_irq_data.guest_irq = irq;
+            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
+            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
+            if (r < 0) {
+                perror("assigned_dev_update_irq");
+                fprintf(stderr, "Are you assigning a device "
+                        "that shares IRQ with some other device?\n");
+                pci_unregister_device(&assigned_dev->dev);
+                /* FIXME: Delete node from list */
+                continue;
+            }
+            assigned_dev->girq = irq;
+        }
+    }
+}
+
+struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
+{
+    int r;
+    AssignedDevice *dev;
+    uint8_t e_device, e_intx;
+    struct kvm_assigned_pci_dev assigned_dev_data;
+
+    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
+          adev->name, e_devfn);
+
+    dev = (AssignedDevice *)
+        pci_register_device(bus, adev->name, sizeof(AssignedDevice),
+                            -1, assigned_dev_pci_read_config,
+                            assigned_dev_pci_write_config);
+    if (NULL == dev) {
+        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                __func__, adev->name);
+        return NULL;
+    }
+
+    if (get_real_device(dev, adev->bus, adev->dev, adev->func)) {
+        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                __func__, adev->name);
+        goto out;
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(dev->real_device.regions,
+                                      dev->real_device.region_number,
+                                      dev))
+        goto out;
+
+    /* handle interrupt routing */
+    e_device = (dev->dev.devfn >> 3) & 0x1f;
+    e_intx = dev->dev.config[0x3d] - 1;
+    dev->intpin = e_intx;
+    dev->run = 0;
+    dev->girq = 0;
+    dev->h_busnr = adev->bus;
+    dev->h_devfn = PCI_DEVFN(adev->dev, adev->func);
+
+    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+    assigned_dev_data.assigned_dev_id  =
+	calc_assigned_dev_id(dev->h_busnr, (uint32_t)dev->h_devfn);
+    assigned_dev_data.busnr = dev->h_busnr;
+    assigned_dev_data.devfn = dev->h_devfn;
+
+#ifdef KVM_CAP_IOMMU
+    /* We always enable the IOMMU if present
+     * (or when not disabled on the command line)
+     */
+    r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+    if (r && !adev->disable_iommu)
+	assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+      
+    r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
+    if (r < 0) {
+	fprintf(stderr, "Could not notify kernel about "
+                "assigned device \"%s\"\n", adev->name);
+	perror("register_real_device");
+	goto out;
+    }
+
+    adev->assigned_dev = dev;
+  out:
+    return &dev->dev;
+}
+
+int init_all_assigned_devices(PCIBus *bus)
+{
+    struct AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next)
+        if (init_assigned_device(adev, bus) == NULL)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice host=bus:dev.func[,dma=none][,name=Foo]
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
+ */
+AssignedDevInfo *add_assigned_device(const char *arg)
+{
+    char *cp, *cp1;
+    char device[8];
+    char dma[6];
+    int r;
+    AssignedDevInfo *adev;
+
+    adev = qemu_mallocz(sizeof(AssignedDevInfo));
+    if (adev == NULL) {
+        fprintf(stderr, "%s: Out of memory\n", __func__);
+        return NULL;
+    }
+    r = get_param_value(device, sizeof(device), "host", arg);
+    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
+    if (!r)
+	snprintf(adev->name, sizeof(adev->name), "%s", device);
+
+#ifdef KVM_CAP_IOMMU
+    r = get_param_value(dma, sizeof(dma), "dma", arg);
+    if (r && !strncmp(dma, "none", 4))
+        adev->disable_iommu = 1;
+#endif
+    cp = device;
+    adev->bus = strtoul(cp, &cp1, 16);
+    if (*cp1 != ':')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->dev = strtoul(cp, &cp1, 16);
+    if (*cp1 != '.')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->func = strtoul(cp, &cp1, 16);
+
+    LIST_INSERT_HEAD(&adev_head, adev, next);
+    return adev;
+bad:
+    fprintf(stderr, "pcidevice argument parse error; "
+            "please check the help text for usage\n");
+    qemu_free(adev);
+    return NULL;
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..d6caa67
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "sys-queue.h"
+#include "pci.h"
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+/* The number of BARs in the config space header */
+#define MAX_IO_REGIONS (6)
+
+typedef struct {
+    int type;           /* Memory or port I/O */
+    int valid;
+    uint32_t base_addr;
+    uint32_t size;    /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+typedef struct {
+    uint8_t bus, dev, func; /* Bus inside domain, device and function */
+    int irq;                /* IRQ number */
+    uint16_t region_number; /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[MAX_IO_REGIONS];
+    int config_fd;
+} PCIDevRegions;
+
+typedef struct {
+    target_phys_addr_t e_physbase;
+    uint32_t memory_index;
+    union {
+        void *r_virtbase;    /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    int num;            /* our index within v_addrs[] */
+    uint32_t e_size;    /* emulated size of region in bytes */
+    uint32_t r_size;    /* real size of region in bytes */
+} AssignedDevRegion;
+
+typedef struct {
+    PCIDevice dev;
+    int intpin;
+    uint8_t debug_flags;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
+    PCIDevRegions real_device;
+    int run;
+    int girq;
+    unsigned char h_busnr;
+    unsigned int h_devfn;
+    int bound;
+} AssignedDevice;
+
+typedef struct AssignedDevInfo AssignedDevInfo;
+
+struct AssignedDevInfo {
+    char name[15];
+    int bus;
+    int dev;
+    int func;
+    AssignedDevice *assigned_dev;
+    LIST_ENTRY(AssignedDevInfo) next;
+    int disable_iommu;
+};
+
+PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
+AssignedDevInfo *add_assigned_device(const char *arg);
+int init_all_assigned_devices(PCIBus *bus);
+
+#define MAX_DEV_ASSIGN_CMDLINE 8
+
+extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+extern int assigned_devices_index;
+
+#endif              /* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index d559f0c..30bb5ea 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -33,6 +33,7 @@
 #include "boards.h"
 #include "console.h"
 #include "fw_cfg.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -1157,6 +1158,23 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
 
     if (pci_enabled)
         virtio_balloon_init(pci_bus);
+
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled()) {
+	int i;
+        for (i = 0; i < assigned_devices_index; i++) {
+            if (add_assigned_device(assigned_devices[i]) < 0) {
+                fprintf(stderr, "Warning: could not add assigned device %s\n",
+                        assigned_devices[i]);
+            }
+        }
+
+	if (init_all_assigned_devices(pci_bus)) {
+	    fprintf(stderr, "Failed to initialize assigned devices\n");
+	    exit (1);
+	}
+    }
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
 }
 
 static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index c82cd20..75bc9a9 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,13 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+        address >= PIIX_CONFIG_IRQ_ROUTE &&
+	address < PIIX_CONFIG_IRQ_ROUTE + 4)
+        assigned_dev_update_irq(d);
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index b9067b8..27d5f02 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -246,9 +246,9 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
 int piix_get_irq(int pin)
 {
     if (piix3_dev)
-        return piix3_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
+        return piix3_dev->config[0x60+pin];
     if (piix4_dev)
-        return piix4_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
+        return piix4_dev->config[0x60+pin];
 
     return 0;
 }
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index c5f3f29..3b4f279 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -27,6 +27,7 @@ int kvm_pit = 1;
 #include <sys/utsname.h>
 #include <sys/syscall.h>
 #include <sys/mman.h>
+#include <sys/io.h>
 
 #define bool _Bool
 #define false 0
@@ -1047,3 +1048,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 {
     return kvm_unregister_coalesced_mmio(kvm_context, addr, size);
 }
+
+static void kvm_do_ioperm(void *_data)
+{
+    struct ioperm_data *data = _data;
+    ioperm(data->start_port, data->num, data->turn_on);
+}
+
+void kvm_ioperm(CPUState *env, void *data)
+{
+    if (kvm_enabled() && qemu_system_ready)
+	on_vcpu(env, kvm_do_ioperm, data);
+}
diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
index a1d6646..1084cd6 100644
--- a/qemu/qemu-kvm.h
+++ b/qemu/qemu-kvm.h
@@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 
 void qemu_kvm_system_reset_request(void);
 
+void kvm_ioperm(CPUState *env, void *data);
+
 #ifdef TARGET_PPC
 int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data);
 int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
@@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
 extern int kvm_allowed;
 extern kvm_context_t kvm_context;
 
+struct ioperm_data {
+    unsigned long start_port;
+    unsigned long num;
+    int turn_on;
+};
+
 #define kvm_enabled() (kvm_allowed)
 #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
 #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
diff --git a/qemu/vl.c b/qemu/vl.c
index 388e79d..967cb98 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -38,6 +38,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -215,6 +216,8 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack = 0;
 #endif
 int usb_enabled = 0;
+const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+int assigned_devices_index;
 static VLANState *first_vlan;
 int smp_cpus = 1;
 const char *vnc_display;
@@ -8692,6 +8695,12 @@ static void help(int exitcode)
 #endif
 	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
 	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+           "-pcidevice host=bus:dev.func[,dma=none][,name=string]\n"
+           "                expose a PCI device to the guest OS.\n"
+           "                dma=none: don't perform any dma translations (default is to use an iommu)\n"
+           "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
@@ -8811,6 +8820,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8900,6 +8912,9 @@ static const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9411,6 +9426,7 @@ int main(int argc, char **argv)
     parallel_device_index = 0;
 
     usb_devices_index = 0;
+    assigned_devices_index = 0;
 
     nb_net_clients = 0;
     nb_drives = 0;
@@ -9844,6 +9860,16 @@ int main(int argc, char **argv)
 		kvm_pit = 0;
 		break;
 	    }
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
+                    fprintf(stderr, "Too many assigned devices\n");
+                    exit(1);
+		}
+		assigned_devices[assigned_devices_index] = optarg;
+		assigned_devices_index++;
+                break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 6/6] device assignment: support for hot-plugging PCI devices
  2008-10-29 12:20         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
@ 2008-10-29 12:20           ` muli
  2008-10-29 12:27           ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Mark McLoughlin
  1 sibling, 0 replies; 31+ messages in thread
From: muli @ 2008-10-29 12:20 UTC (permalink / raw)
  To: avi; +Cc: kvm, anthony, weidong.han, benami, muli, amit.shah, allen.m.kay

From: Amit Shah <amit.shah@redhat.com>

This patch adds support for hot-plugging host PCI devices into
guests

[muli: various small fixes for review comments]

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/hw/device-hotplug.c |   37 +++++++++++++++++++++++++++++++++++++
 qemu/monitor.c           |    2 +-
 2 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/qemu/hw/device-hotplug.c b/qemu/hw/device-hotplug.c
index 8e2bc35..ba1b161 100644
--- a/qemu/hw/device-hotplug.c
+++ b/qemu/hw/device-hotplug.c
@@ -6,6 +6,8 @@
 #include "pc.h"
 #include "console.h"
 #include "block_int.h"
+#include "device-assignment.h"
+#include "config.h"
 
 #define PCI_BASE_CLASS_STORAGE          0x01
 #define PCI_BASE_CLASS_NETWORK          0x02
@@ -27,6 +29,37 @@ static PCIDevice *qemu_system_hot_add_nic(const char *opts, int bus_nr)
     return pci_nic_init (pci_bus, &nd_table[ret], -1);
 }
 
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+static PCIDevice *qemu_system_hot_assign_device(const char *opts, int bus_nr)
+{
+    PCIBus *pci_bus;
+    AssignedDevInfo *adev;
+    PCIDevice *ret;
+
+    pci_bus = pci_find_bus(bus_nr);
+    if (!pci_bus) {
+        term_printf ("Can't find pci_bus %d\n", bus_nr);
+        return NULL;
+    }
+    adev = add_assigned_device(opts);
+    if (adev == NULL) {
+        term_printf ("Error adding device; check syntax\n");
+        return NULL;
+    }
+ 
+    ret = init_assigned_device(adev, pci_bus);
+
+    term_printf("Registered host PCI device %02x:%02x.%1x "
+		"(\"%s\") as guest device %02x:%02x.%1x\n",
+		adev->bus, adev->dev, adev->func, adev->name,
+		pci_bus_num(pci_bus), (ret->devfn >> 3) & 0x1f,
+		adev->func);
+
+    return ret;
+}
+
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
+
 static int add_init_drive(const char *opts)
 {
     int drive_opt_idx, drive_idx;
@@ -143,6 +176,10 @@ void device_hot_add(int pcibus, const char *type, const char *opts)
         dev = qemu_system_hot_add_nic(opts, pcibus);
     else if (strcmp(type, "storage") == 0)
         dev = qemu_system_hot_add_storage(opts, pcibus);
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    else if (strcmp(type, "host") == 0)
+        dev = qemu_system_hot_assign_device(opts, pcibus);
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
     else
         term_printf("invalid type: %s\n", type);
 
diff --git a/qemu/monitor.c b/qemu/monitor.c
index 79b6b4c..d1043b1 100644
--- a/qemu/monitor.c
+++ b/qemu/monitor.c
@@ -1529,7 +1529,7 @@ static const term_cmd_t term_cmds[] = {
                                         "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
                                         "[snapshot=on|off][,cache=on|off]",
                                         "add drive to PCI storage controller" },
-    { "pci_add", "iss", device_hot_add, "bus nic|storage [[vlan=n][,macaddr=addr][,model=type]] [file=file][,if=type][,bus=nr]...", "hot-add PCI device" },
+    { "pci_add", "iss", device_hot_add, "bus nic|storage|host [[vlan=n][,macaddr=addr][,model=type]] [file=file][,if=type][,bus=nr]... [host=02:00.0[,name=string][,dma=none]" "hot-add PCI device" },
     { "pci_del", "ii", device_hot_remove, "bus slot-number", "hot remove PCI device" },
 #endif
     { "balloon", "i", do_balloon,
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 12:20         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
  2008-10-29 12:20           ` [PATCH 6/6] device assignment: support for hot-plugging PCI devices muli
@ 2008-10-29 12:27           ` Mark McLoughlin
  2008-10-29 14:40             ` Muli Ben-Yehuda
  1 sibling, 1 reply; 31+ messages in thread
From: Mark McLoughlin @ 2008-10-29 12:27 UTC (permalink / raw)
  To: muli; +Cc: avi, kvm, anthony, weidong.han, benami, amit.shah, allen.m.kay

On Wed, 2008-10-29 at 14:20 +0200, muli@il.ibm.com wrote:
> diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
> index b9067b8..27d5f02 100644
> --- a/qemu/hw/piix_pci.c
> +++ b/qemu/hw/piix_pci.c
> @@ -246,9 +246,9 @@ static void piix3_set_irq(qemu_irq *pic, int
> irq_num, int level)
>  int piix_get_irq(int pin)
>  {
>      if (piix3_dev)
> -        return piix3_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
> +        return piix3_dev->config[0x60+pin];
>      if (piix4_dev)
> -        return piix4_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
> +        return piix4_dev->config[0x60+pin];
>  
>      return 0;
>  }

Another rebase mixup?

Cheers,
Mark.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
  2008-10-29 12:27           ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Mark McLoughlin
@ 2008-10-29 14:40             ` Muli Ben-Yehuda
  0 siblings, 0 replies; 31+ messages in thread
From: Muli Ben-Yehuda @ 2008-10-29 14:40 UTC (permalink / raw)
  To: Mark McLoughlin
  Cc: avi, kvm, anthony, weidong.han, Ben-Ami Yassour1, amit.shah,
	allen.m.kay

On Wed, Oct 29, 2008 at 12:27:19PM +0000, Mark McLoughlin wrote:
> On Wed, 2008-10-29 at 14:20 +0200, muli@il.ibm.com wrote:
> > diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
> > index b9067b8..27d5f02 100644
> > --- a/qemu/hw/piix_pci.c
> > +++ b/qemu/hw/piix_pci.c
> > @@ -246,9 +246,9 @@ static void piix3_set_irq(qemu_irq *pic, int
> > irq_num, int level)
> >  int piix_get_irq(int pin)
> >  {
> >      if (piix3_dev)
> > -        return piix3_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
> > +        return piix3_dev->config[0x60+pin];
> >      if (piix4_dev)
> > -        return piix4_dev->config[PIIX_CONFIG_IRQ_ROUTE + pin];
> > +        return piix4_dev->config[0x60+pin];
> >  
> >      return 0;
> >  }
> 
> Another rebase mixup?

Argh. Indeed. Here's 5/6 again without the offending hunk. Hopefully
that's the last one for today or I might have to return my "git
competency" boy scout badge.

>From 1dd6f84986a4224635ad5a6f9edfa57b1c5a1e7b Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Wed, 29 Oct 2008 14:12:08 +0200
Subject: [PATCH 5/6] device assignment: support for assigning PCI devices to guests

This patch has been contributed to by the following people:

Or Sagi <ors@tutis.com>
Nir Peleg <nir@tutis.com>
Amit Shah <amit.shah@redhat.com>
Ben-Ami Yassour <benami@il.ibm.com>
Weidong Han <weidong.han@intel.com>
Glauber de Oliveira Costa <gcosta@redhat.com>
Muli Ben-Yehuda <muli@il.ibm.com>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

        -pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 qemu/Makefile.target        |    3 +
 qemu/configure              |   21 ++
 qemu/hw/device-assignment.c |  616 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |  106 ++++++++
 qemu/hw/pc.c                |   18 ++
 qemu/hw/pci.c               |    8 +
 qemu/hw/piix_pci.c          |    4 +-
 qemu/qemu-kvm.c             |   13 +
 qemu/qemu-kvm.h             |    8 +
 qemu/vl.c                   |   26 ++
 10 files changed, 821 insertions(+), 2 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..64d4e44 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM_DEVICE_ASSIGNMENT), 1)
+OBJS+= device-assignment.o
+endif
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/configure b/qemu/configure
index 922a156..618dbce 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -101,6 +101,7 @@ linux="no"
 kqemu="no"
 kvm="no"
 kvm_cap_pit="no"
+kvm_cap_device_assignment="no"
 profiler="no"
 kernel_path=""
 cocoa="no"
@@ -749,6 +750,9 @@ fi
 # KVM probe
 
 if test "$kvm" = "yes" ; then
+
+# test for KVM_CAP_PIT
+
 cat > $TMPC <<EOF
 #include <libkvm.h>
 #ifndef KVM_CAP_PIT
@@ -759,6 +763,19 @@ EOF
     if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
 	kvm_cap_pit="yes"
     fi
+
+# test for KVM_CAP_DEVICE_ASSIGNMENT
+
+cat > $TMPC <<EOF
+#include <libkvm.h>
+#ifndef KVM_CAP_DEVICE_ASSIGNMENT
+#error "kvm no device assignment capability"
+#endif
+int main(void) { return 0; }
+EOF
+    if $cc $ARCH_CFLAGS $CFLAGS -I"$kernel_path"/include -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+	kvm_cap_device_assignment="yes"
+    fi
 fi
 
 ##########################################
@@ -1515,6 +1532,10 @@ configure_kvm() {
 	echo "USE_KVM_PIT=1" >> $config_mak
 	echo "#define USE_KVM_PIT 1" >> $config_h
     fi
+    if test $kvm_cap_device_assignment = "yes" ; then
+	echo "USE_KVM_DEVICE_ASSIGNMENT=1" >> $config_mak
+	echo "#define USE_KVM_DEVICE_ASSIGNMENT 1" >> $config_h
+    fi
     disable_cpu_emulation
   fi
 }
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..78b7e14
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+      fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
+    } while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+{
+    return region->u.r_baseport + (addr - region->e_physbase);
+}
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    outw(value, r_pio);
+}
+
+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+	  r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    outl(value, r_pio);
+}
+
+static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inb(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inw(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+	  (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+    uint32_t value;
+
+    value = inl(r_pio);
+
+    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
+          r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->u.r_baseport, value);
+
+    return value;
+}
+
+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_ephys = region->e_physbase;
+    uint32_t old_esize = region->e_size;
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+          e_phys, region->u.r_virtbase, type, e_size, region_num);
+
+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    if (!first_map)
+	kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+    if (e_size > 0)
+	ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->u.r_virtbase, e_size, 0);
+    if (ret != 0) {
+	fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+	exit(1);
+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_port = region->u.r_baseport;
+    uint32_t old_num = region->e_size;
+    int first_map = (old_num == 0);
+    struct ioperm_data data;
+    int i;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+          addr, region->u.r_baseport, type, size, region_num);
+
+    memset(&data, 0, sizeof(data));
+
+    if (!first_map) {
+	data.start_port = old_port;
+	data.num = old_num; 
+	data.turn_on = 0;
+
+	for (i = 0; i < smp_cpus; ++i)
+	    kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+    }
+
+    data.start_port = region->u.r_baseport;
+    data.num = size;
+    data.turn_on = 1;
+ 
+    for (i = 0; i < smp_cpus; ++i)
+	kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+ 
+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd;
+    ssize_t ret;
+
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+
+    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd;
+    ssize_t ret;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, address);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+do_log:
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+    /* kill the special capabilities */
+    if (address == 4 && len == 4)
+        val &= ~0x100000;
+    else if (address == 6)
+        val &= ~0x10;
+
+    return val;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+	     r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+
+    for (r = 0; r < MAX_IO_REGIONS; r++) {
+	if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+	    break; 
+
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+	    snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;       /* probably ROM */
+            rp->resource_fd = fd;
+        } else
+            flags &= ~IORESOURCE_PREFETCH;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+    fclose(f);
+
+    dev->region_number = r;
+    return 0;
+}
+
+static LIST_HEAD(, AssignedDevInfo) adev_head;
+
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+    return (uint32_t)bus << 8 | (uint32_t)devfn;
+}
+
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+    int irq, r;
+    AssignedDevice *assigned_dev;
+    AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next) {
+        assigned_dev = adev->assigned_dev;
+        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+        irq = piix_get_irq(irq);
+
+        if (irq != assigned_dev->girq) {
+            struct kvm_assigned_irq assigned_irq_data;
+
+            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
+            assigned_irq_data.assigned_dev_id  =
+                calc_assigned_dev_id(assigned_dev->h_busnr,
+                                     (uint8_t) assigned_dev->h_devfn);
+            assigned_irq_data.guest_irq = irq;
+            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
+            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
+            if (r < 0) {
+                perror("assigned_dev_update_irq");
+                fprintf(stderr, "Are you assigning a device "
+                        "that shares IRQ with some other device?\n");
+                pci_unregister_device(&assigned_dev->dev);
+                /* FIXME: Delete node from list */
+                continue;
+            }
+            assigned_dev->girq = irq;
+        }
+    }
+}
+
+struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
+{
+    int r;
+    AssignedDevice *dev;
+    uint8_t e_device, e_intx;
+    struct kvm_assigned_pci_dev assigned_dev_data;
+
+    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
+          adev->name, e_devfn);
+
+    dev = (AssignedDevice *)
+        pci_register_device(bus, adev->name, sizeof(AssignedDevice),
+                            -1, assigned_dev_pci_read_config,
+                            assigned_dev_pci_write_config);
+    if (NULL == dev) {
+        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                __func__, adev->name);
+        return NULL;
+    }
+
+    if (get_real_device(dev, adev->bus, adev->dev, adev->func)) {
+        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                __func__, adev->name);
+        goto out;
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(dev->real_device.regions,
+                                      dev->real_device.region_number,
+                                      dev))
+        goto out;
+
+    /* handle interrupt routing */
+    e_device = (dev->dev.devfn >> 3) & 0x1f;
+    e_intx = dev->dev.config[0x3d] - 1;
+    dev->intpin = e_intx;
+    dev->run = 0;
+    dev->girq = 0;
+    dev->h_busnr = adev->bus;
+    dev->h_devfn = PCI_DEVFN(adev->dev, adev->func);
+
+    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+    assigned_dev_data.assigned_dev_id  =
+	calc_assigned_dev_id(dev->h_busnr, (uint32_t)dev->h_devfn);
+    assigned_dev_data.busnr = dev->h_busnr;
+    assigned_dev_data.devfn = dev->h_devfn;
+
+#ifdef KVM_CAP_IOMMU
+    /* We always enable the IOMMU if present
+     * (or when not disabled on the command line)
+     */
+    r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+    if (r && !adev->disable_iommu)
+	assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+      
+    r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
+    if (r < 0) {
+	fprintf(stderr, "Could not notify kernel about "
+                "assigned device \"%s\"\n", adev->name);
+	perror("register_real_device");
+	goto out;
+    }
+
+    adev->assigned_dev = dev;
+  out:
+    return &dev->dev;
+}
+
+int init_all_assigned_devices(PCIBus *bus)
+{
+    struct AssignedDevInfo *adev;
+
+    LIST_FOREACH(adev, &adev_head, next)
+        if (init_assigned_device(adev, bus) == NULL)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice host=bus:dev.func[,dma=none][,name=Foo]
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
+ */
+AssignedDevInfo *add_assigned_device(const char *arg)
+{
+    char *cp, *cp1;
+    char device[8];
+    char dma[6];
+    int r;
+    AssignedDevInfo *adev;
+
+    adev = qemu_mallocz(sizeof(AssignedDevInfo));
+    if (adev == NULL) {
+        fprintf(stderr, "%s: Out of memory\n", __func__);
+        return NULL;
+    }
+    r = get_param_value(device, sizeof(device), "host", arg);
+    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
+    if (!r)
+	snprintf(adev->name, sizeof(adev->name), "%s", device);
+
+#ifdef KVM_CAP_IOMMU
+    r = get_param_value(dma, sizeof(dma), "dma", arg);
+    if (r && !strncmp(dma, "none", 4))
+        adev->disable_iommu = 1;
+#endif
+    cp = device;
+    adev->bus = strtoul(cp, &cp1, 16);
+    if (*cp1 != ':')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->dev = strtoul(cp, &cp1, 16);
+    if (*cp1 != '.')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->func = strtoul(cp, &cp1, 16);
+
+    LIST_INSERT_HEAD(&adev_head, adev, next);
+    return adev;
+bad:
+    fprintf(stderr, "pcidevice argument parse error; "
+            "please check the help text for usage\n");
+    qemu_free(adev);
+    return NULL;
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..d6caa67
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "sys-queue.h"
+#include "pci.h"
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+/* The number of BARs in the config space header */
+#define MAX_IO_REGIONS (6)
+
+typedef struct {
+    int type;           /* Memory or port I/O */
+    int valid;
+    uint32_t base_addr;
+    uint32_t size;    /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+typedef struct {
+    uint8_t bus, dev, func; /* Bus inside domain, device and function */
+    int irq;                /* IRQ number */
+    uint16_t region_number; /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[MAX_IO_REGIONS];
+    int config_fd;
+} PCIDevRegions;
+
+typedef struct {
+    target_phys_addr_t e_physbase;
+    uint32_t memory_index;
+    union {
+        void *r_virtbase;    /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    int num;            /* our index within v_addrs[] */
+    uint32_t e_size;    /* emulated size of region in bytes */
+    uint32_t r_size;    /* real size of region in bytes */
+} AssignedDevRegion;
+
+typedef struct {
+    PCIDevice dev;
+    int intpin;
+    uint8_t debug_flags;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
+    PCIDevRegions real_device;
+    int run;
+    int girq;
+    unsigned char h_busnr;
+    unsigned int h_devfn;
+    int bound;
+} AssignedDevice;
+
+typedef struct AssignedDevInfo AssignedDevInfo;
+
+struct AssignedDevInfo {
+    char name[15];
+    int bus;
+    int dev;
+    int func;
+    AssignedDevice *assigned_dev;
+    LIST_ENTRY(AssignedDevInfo) next;
+    int disable_iommu;
+};
+
+PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
+AssignedDevInfo *add_assigned_device(const char *arg);
+int init_all_assigned_devices(PCIBus *bus);
+
+#define MAX_DEV_ASSIGN_CMDLINE 8
+
+extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+extern int assigned_devices_index;
+
+#endif              /* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index d559f0c..30bb5ea 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -33,6 +33,7 @@
 #include "boards.h"
 #include "console.h"
 #include "fw_cfg.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -1157,6 +1158,23 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
 
     if (pci_enabled)
         virtio_balloon_init(pci_bus);
+
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled()) {
+	int i;
+        for (i = 0; i < assigned_devices_index; i++) {
+            if (add_assigned_device(assigned_devices[i]) < 0) {
+                fprintf(stderr, "Warning: could not add assigned device %s\n",
+                        assigned_devices[i]);
+            }
+        }
+
+	if (init_all_assigned_devices(pci_bus)) {
+	    fprintf(stderr, "Failed to initialize assigned devices\n");
+	    exit (1);
+	}
+    }
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
 }
 
 static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index c82cd20..75bc9a9 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,13 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef USE_KVM_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+        address >= PIIX_CONFIG_IRQ_ROUTE &&
+	address < PIIX_CONFIG_IRQ_ROUTE + 4)
+        assigned_dev_update_irq(d);
+#endif /* USE_KVM_DEVICE_ASSIGNMENT */
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index c5f3f29..3b4f279 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -27,6 +27,7 @@ int kvm_pit = 1;
 #include <sys/utsname.h>
 #include <sys/syscall.h>
 #include <sys/mman.h>
+#include <sys/io.h>
 
 #define bool _Bool
 #define false 0
@@ -1047,3 +1048,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 {
     return kvm_unregister_coalesced_mmio(kvm_context, addr, size);
 }
+
+static void kvm_do_ioperm(void *_data)
+{
+    struct ioperm_data *data = _data;
+    ioperm(data->start_port, data->num, data->turn_on);
+}
+
+void kvm_ioperm(CPUState *env, void *data)
+{
+    if (kvm_enabled() && qemu_system_ready)
+	on_vcpu(env, kvm_do_ioperm, data);
+}
diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
index a1d6646..1084cd6 100644
--- a/qemu/qemu-kvm.h
+++ b/qemu/qemu-kvm.h
@@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr,
 
 void qemu_kvm_system_reset_request(void);
 
+void kvm_ioperm(CPUState *env, void *data);
+
 #ifdef TARGET_PPC
 int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data);
 int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
@@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data);
 extern int kvm_allowed;
 extern kvm_context_t kvm_context;
 
+struct ioperm_data {
+    unsigned long start_port;
+    unsigned long num;
+    int turn_on;
+};
+
 #define kvm_enabled() (kvm_allowed)
 #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
 #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
diff --git a/qemu/vl.c b/qemu/vl.c
index 388e79d..967cb98 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -38,6 +38,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -215,6 +216,8 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack = 0;
 #endif
 int usb_enabled = 0;
+const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
+int assigned_devices_index;
 static VLANState *first_vlan;
 int smp_cpus = 1;
 const char *vnc_display;
@@ -8692,6 +8695,12 @@ static void help(int exitcode)
 #endif
 	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
 	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+           "-pcidevice host=bus:dev.func[,dma=none][,name=string]\n"
+           "                expose a PCI device to the guest OS.\n"
+           "                dma=none: don't perform any dma translations (default is to use an iommu)\n"
+           "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
@@ -8811,6 +8820,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8900,6 +8912,9 @@ static const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9411,6 +9426,7 @@ int main(int argc, char **argv)
     parallel_device_index = 0;
 
     usb_devices_index = 0;
+    assigned_devices_index = 0;
 
     nb_net_clients = 0;
     nb_drives = 0;
@@ -9844,6 +9860,16 @@ int main(int argc, char **argv)
 		kvm_pit = 0;
 		break;
 	    }
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+		if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
+                    fprintf(stderr, "Too many assigned devices\n");
+                    exit(1);
+		}
+		assigned_devices[assigned_devices_index] = optarg;
+		assigned_devices_index++;
+                break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [v10] Userspace patches for PCI device assignment
  2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
  2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
@ 2008-10-30 11:29 ` Avi Kivity
  2008-10-30 12:44 ` Anthony Liguori
  2 siblings, 0 replies; 31+ messages in thread
From: Avi Kivity @ 2008-10-30 11:29 UTC (permalink / raw)
  To: muli; +Cc: kvm, anthony, weidong.han, benami, amit.shah, allen.m.kay

muli@il.ibm.com wrote:
> This patchset enables device assignment for KVM hosts for PCI
> devices. It uses the Intel IOMMU by default if available.
>   

Applied all.  Thanks to everyone involved in this effort!


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [v10] Userspace patches for PCI device assignment
  2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
  2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
  2008-10-30 11:29 ` [v10] Userspace patches for PCI device assignment Avi Kivity
@ 2008-10-30 12:44 ` Anthony Liguori
  2008-10-30 13:23   ` Han, Weidong
  2 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-10-30 12:44 UTC (permalink / raw)
  To: muli; +Cc: avi, kvm, weidong.han, benami, amit.shah, allen.m.kay

muli@il.ibm.com wrote:
> This patchset enables device assignment for KVM hosts for PCI
> devices. It uses the Intel IOMMU by default if available.
>   

This is still broken wrt CPU hotplug.  If a new CPU comes online, it 
won't have it's ioperm mask set correct and it will crash QEMU.  You 
need to keep track of the current ioperm settings and whenever a new 
VCPU comes online, issue an ioperm.

Regards,

Anthony Liguori

> Changes from v9->v10:
>
> - really emove nr_assigned_devices and device_assignment_enabled and
>   other cleanups
>
> Changes from v8->v9 in no particular order:
>
> - fixed DEBUG, removed uneeded headers
> - introduce USE_KVM_DEVICE_ASSIGNMENT modeled after KVM_USE_PIT, per
>   aliguori's suggestion
> - only call term_printf from the monitor
> - implement markmc's suggestions: don't overflow dev->regions, fix
>   disable_iommu bogosity, fix usage and comment with regards to
>   -pcidevice format, and some other bits
>
> Changes from v7->v8 in in particular order:
>
> - various formatting fixes, DEBUG cleanups, cast removals, etc.
> - s/strncpy/snprintf/
> - split initialization in two phases per aliguori's suggestion
> - bail out on errors when we can't limp on
> - do ioperm on every cpu and vcpu (Weidong Han)
> - use pwrite/pread where applicable
> - split r_virtbase into different fields for memory and IO
> - fix destruction of MMIO regions (Disheng Su and Weidong Han)
>
> Changes from v6->v7 in no particular order:
>
> - formatting changes: adhere to qemu style
> - use strncmp, strncpy etc. instead of the insecure ones
> - move from array to linked list
> - change iopl() to ioperm() (Weidong Han)
> - other small changes as suggested during the review of v6.
>   


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [v10] Userspace patches for PCI device assignment
  2008-10-30 12:44 ` Anthony Liguori
@ 2008-10-30 13:23   ` Han, Weidong
  0 siblings, 0 replies; 31+ messages in thread
From: Han, Weidong @ 2008-10-30 13:23 UTC (permalink / raw)
  To: 'Anthony Liguori', 'muli@il.ibm.com'
  Cc: 'avi@redhat.com', 'kvm@vger.kernel.org',
	'benami@il.ibm.com', 'amit.shah@redhat.com',
	Kay, Allen M

Anthony Liguori wrote:
> muli@il.ibm.com wrote:
>> This patchset enables device assignment for KVM hosts for PCI
>> devices. It uses the Intel IOMMU by default if available.
>> 
> 
> This is still broken wrt CPU hotplug.  If a new CPU comes online, it
> won't have it's ioperm mask set correct and it will crash QEMU.  You
> need to keep track of the current ioperm settings and whenever a new
> VCPU comes online, issue an ioperm.
> 

Yes, it's not fixed yet. After userspace patches are checked in, I will send out a patch to fix it. Thanks.

Regards,
Weidong

> Regards,
> 
> Anthony Liguori
> 
>> Changes from v9->v10:
>> 
>> - really emove nr_assigned_devices and device_assignment_enabled and
>> other cleanups 
>> 
>> Changes from v8->v9 in no particular order:
>> 
>> - fixed DEBUG, removed uneeded headers
>> - introduce USE_KVM_DEVICE_ASSIGNMENT modeled after KVM_USE_PIT, per
>> aliguori's suggestion 
>> - only call term_printf from the monitor
>> - implement markmc's suggestions: don't overflow dev->regions, fix
>>   disable_iommu bogosity, fix usage and comment with regards to
>>   -pcidevice format, and some other bits
>> 
>> Changes from v7->v8 in in particular order:
>> 
>> - various formatting fixes, DEBUG cleanups, cast removals, etc.
>> - s/strncpy/snprintf/
>> - split initialization in two phases per aliguori's suggestion
>> - bail out on errors when we can't limp on
>> - do ioperm on every cpu and vcpu (Weidong Han)
>> - use pwrite/pread where applicable
>> - split r_virtbase into different fields for memory and IO
>> - fix destruction of MMIO regions (Disheng Su and Weidong Han)
>> 
>> Changes from v6->v7 in no particular order:
>> 
>> - formatting changes: adhere to qemu style
>> - use strncmp, strncpy etc. instead of the insecure ones
>> - move from array to linked list
>> - change iopl() to ioperm() (Weidong Han)
>> - other small changes as suggested during the review of v6.


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2008-10-30 13:23 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 12:19   ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 12:19     ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 12:19       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 12:20         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-29 12:20           ` [PATCH 6/6] device assignment: support for hot-plugging PCI devices muli
2008-10-29 12:27           ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Mark McLoughlin
2008-10-29 14:40             ` Muli Ben-Yehuda
2008-10-30 11:29 ` [v10] Userspace patches for PCI device assignment Avi Kivity
2008-10-30 12:44 ` Anthony Liguori
2008-10-30 13:23   ` Han, Weidong
  -- strict thread matches above, loose matches on Subject: below --
2008-10-29 10:22 [v9] " muli
2008-10-29 10:22 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 10:22   ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 10:22     ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 10:22       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 10:22         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-28 10:06 [v8] Userspace patches for PCI device assignment muli
2008-10-28 10:06 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-28 10:06   ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-28 10:06     ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-28 10:06       ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-28 10:06         ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-28 14:10           ` Han, Weidong
2008-10-28 15:32             ` Muli Ben-Yehuda
     [not found]           ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
2008-10-28 15:31             ` Han, Weidong
2008-10-28 15:36           ` Han, Weidong
2008-10-28 15:47             ` Muli Ben-Yehuda
2008-10-28 15:45           ` Anthony Liguori
2008-10-28 15:53             ` Muli Ben-Yehuda
2008-10-29  7:56               ` Zhang, Xiantao
2008-10-29 10:27                 ` Muli Ben-Yehuda
2008-10-29  8:22               ` Han, Weidong
2008-10-29 10:25               ` Muli Ben-Yehuda
2008-10-29 10:39                 ` Muli Ben-Yehuda
2008-10-28 16:55           ` Mark McLoughlin
2008-10-29 10:31             ` Muli Ben-Yehuda
2008-10-29 11:07               ` Mark McLoughlin
2008-10-29 11:15               ` Mark McLoughlin
2008-10-29 11:47                 ` Muli Ben-Yehuda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox