From mboxrd@z Thu Jan 1 00:00:00 1970 From: Anthony Liguori Subject: Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests Date: Tue, 23 Sep 2008 11:30:32 -0500 Message-ID: <48D919A8.8040409@codemonkey.ws> References: <1222181695-23418-1-git-send-email-amit.shah@redhat.com> <1222181695-23418-2-git-send-email-amit.shah@redhat.com> <1222181695-23418-3-git-send-email-amit.shah@redhat.com> <1222181695-23418-4-git-send-email-amit.shah@redhat.com> <1222181695-23418-5-git-send-email-amit.shah@redhat.com> <1222181695-23418-6-git-send-email-amit.shah@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: avi@redhat.com, kvm@vger.kernel.org, muli@il.ibm.com, benami@il.ibm.com, weidong.han@intel.com, allen.m.kay@intel.com To: Amit Shah Return-path: Received: from ey-out-2122.google.com ([74.125.78.27]:55846 "EHLO ey-out-2122.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751524AbYIWQbg (ORCPT ); Tue, 23 Sep 2008 12:31:36 -0400 Received: by ey-out-2122.google.com with SMTP id 6so623725eyi.37 for ; Tue, 23 Sep 2008 09:31:33 -0700 (PDT) In-Reply-To: <1222181695-23418-6-git-send-email-amit.shah@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: Amit Shah wrote: > From: Or Sagi > From: Nir Peleg > From: Amit Shah > From: Ben-Ami Yassour > From: Weidong Han > From: Glauber de Oliveira Costa > > With this patch, we can assign a device on the host machine to a > guest. > > A new command-line option, -pcidevice is added. > For example, to invoke it for a device sitting at PCI bus:dev.fn > 04:08.0, use this: > > -pcidevice host=04:08.0 > > * The host driver for the device, if any, is to be removed before > assigning the device (else device assignment will fail). > > * A device that shares IRQ with another host device cannot currently > be assigned. 
> > This works only with the in-kernel irqchip method; to use the > userspace irqchip, a kernel module (irqhook) and some extra changes > are needed. > > Signed-off-by: Amit Shah > --- > qemu/Makefile.target | 1 + > qemu/hw/device-assignment.c | 665 +++++++++++++++++++++++++++++++++++++++++++ > qemu/hw/device-assignment.h | 93 ++++++ > qemu/hw/pc.c | 9 + > qemu/hw/pci.c | 7 + > qemu/vl.c | 18 ++ > 6 files changed, 793 insertions(+), 0 deletions(-) > create mode 100644 qemu/hw/device-assignment.c > create mode 100644 qemu/hw/device-assignment.h > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > index 72f3db8..40eb273 100644 > --- a/qemu/Makefile.target > +++ b/qemu/Makefile.target > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o > OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > +OBJS+= device-assignment.o > This needs to be conditional on at least linux hosts, but probably also kvm support. > ifeq ($(USE_KVM_PIT), 1) > OBJS+= i8254-kvm.o > endif > diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c > new file mode 100644 > index 0000000..e70daf2 > --- /dev/null > +++ b/qemu/hw/device-assignment.c > @@ -0,0 +1,665 @@ > +/* > + * Copyright (c) 2007, Neocleus Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. > + * > + * > + * Assign a PCI device from the host to a guest VM. > + * > + * Adapted for KVM by Qumranet. > + * > + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) > + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) > + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) > + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) > + */ > +#include > +#include > +#include "qemu-kvm.h" > +#include > +#include "device-assignment.h" > + > +/* From linux/ioport.h */ > +#define IORESOURCE_IO 0x00000100 /* Resource type */ > +#define IORESOURCE_MEM 0x00000200 > +#define IORESOURCE_IRQ 0x00000400 > +#define IORESOURCE_DMA 0x00000800 > +#define IORESOURCE_PREFETCH 0x00001000 /* No side effects */ > + > +/* #define DEVICE_ASSIGNMENT_DEBUG */ > + > +#ifdef DEVICE_ASSIGNMENT_DEBUG > +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args) > +#else > +#define DEBUG(fmt, args...) > +#endif > Both should be in do { } while(0) to preserve statement semantics. Please use C99 variadic macros too. > +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, > + uint32_t value) > +{ > + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; > + uint32_t r_pio = (unsigned long)r_access->r_virtbase > Should be target_ulong if it's a target virtual address. > + + (addr - r_access->e_physbase); > + > + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { > + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" > + " r_virtbase=%08lx value=%08x\n", > + __func__, r_pio, (int)r_access->e_physbase, > + (unsigned long)r_access->r_virtbase, value); > + } > + iopl(3); > + outb(value, r_pio); > The formatting is wrong for this entire file. Also, you shouldn't have device specific debug. 
Should probably error check iopl(3). It's not necessary to call it every time you do an outb, just once when initialized. > +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, > + uint32_t e_phys, uint32_t e_size, int type) > +{ > + AssignedDevice *r_dev = (AssignedDevice *) pci_dev; > + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > + int first_map = (region->e_size == 0); > + int ret = 0; > + > + DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n", > + e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size, > + region_num); > + > + region->e_physbase = e_phys; > + region->e_size = e_size; > + > + /* FIXME: Add support for emulated MMIO for non-kvm guests */ > + if (kvm_enabled()) { > This doesn't work at all if kvm isn't enabled, right? You should probably bail out in the init if kvm isn't enabled. If this whole file is included conditionally based on KVM support, then you don't have to worry about using kvm_enabled() guards to conditionally compile out code. 
> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address, > + uint32_t val, int len) > +{ > + int fd, r; > + > + DEBUG("%s: (%x.%x): address=%04x val=0x%08x len=%d\n", > + __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > + (uint16_t) address, val, len); > + > + if (address == 0x4) { > + pci_default_write_config(d, address, val, len); > + /* Continue to program the card */ > + } > + > + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || > + address == 0x3c || address == 0x3d) { > + /* used for update-mappings (BAR emulation) */ > + pci_default_write_config(d, address, val, len); > + return; > + } > + DEBUG("%s: NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n", > + __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > + (uint16_t) address, val, len); > + fd = ((AssignedDevice *)d)->real_device.config_fd; > + r = lseek(fd, address, SEEK_SET); > + if (r < 0) { > + fprintf(stderr, "%s: bad seek, errno = %d\n", > + __func__, errno); > + return; > + } > +again: > + r = write(fd, &val, len); > + if (r < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + fprintf(stderr, "%s: write failed, errno = %d\n", > + __func__, errno); > + } > +} > Things may be simplified by doing pwrite/pread here. 
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address, > + int len) > +{ > + uint32_t val = 0; > + int fd, r; > + > + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || > + address == 0x3c || address == 0x3d) { > + val = pci_default_read_config(d, address, len); > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, > + len); > + return val; > + } > + > + /* vga specific, remove later */ > + if (address == 0xFC) > + goto do_log; > + > + fd = ((AssignedDevice *)d)->real_device.config_fd; > + r = lseek(fd, address, SEEK_SET); > + if (r < 0) { > + fprintf(stderr, "%s: bad seek, errno = %d\n", > + __func__, errno); > + return val; > + } > +again: > + r = read(fd, &val, len); > + if (r < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + fprintf(stderr, "%s: read failed, errno = %d\n", > + __func__, errno); > + } > +do_log: > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > + > + /* kill the special capabilities */ > + if (address == 4 && len == 4) > + val &= ~0x100000; > + else if (address == 6) > + val &= ~0x10; > + > + return val; > +} > + > +static int assigned_dev_register_regions(PCIRegion *io_regions, > + unsigned long regions_num, > + AssignedDevice *pci_dev) > +{ > + uint32_t i; > + PCIRegion *cur_region = io_regions; > + > + for (i = 0; i < regions_num; i++, cur_region++) { > + if (!cur_region->valid) > + continue; > +#ifdef DEVICE_ASSIGNMENT_DEBUG > + pci_dev->v_addrs[i].debug |= DEVICE_ASSIGNMENT_DEBUG_MMIO > + | DEVICE_ASSIGNMENT_DEBUG_PIO; > +#endif > + pci_dev->v_addrs[i].num = i; > + > + /* handle memory io regions */ > + if (cur_region->type & IORESOURCE_MEM) { > + int t = cur_region->type & IORESOURCE_PREFETCH > + ? 
PCI_ADDRESS_SPACE_MEM_PREFETCH > + : PCI_ADDRESS_SPACE_MEM; > + > + /* map physical memory */ > + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; > + pci_dev->v_addrs[i].r_virtbase = > + mmap(NULL, > + (cur_region->size + 0xFFF) & 0xFFFFF000, > + PROT_WRITE | PROT_READ, MAP_SHARED, > + cur_region->resource_fd, (off_t) 0); > + > + if ((void *) -1 == pci_dev->v_addrs[i].r_virtbase) { > + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!" > + "\n", __func__, > + (uint32_t) (cur_region->base_addr)); > + return -1; > + } > + pci_dev->v_addrs[i].r_size = cur_region->size; > + pci_dev->v_addrs[i].e_size = 0; > + > + /* add offset */ > + pci_dev->v_addrs[i].r_virtbase += > + (cur_region->base_addr & 0xFFF); > + > + pci_register_io_region((PCIDevice *) pci_dev, i, > + cur_region->size, t, > + assigned_dev_iomem_map); > + continue; > + } > + /* handle port io regions */ > + pci_register_io_region((PCIDevice *) pci_dev, i, > + cur_region->size, PCI_ADDRESS_SPACE_IO, > + assigned_dev_ioport_map); > + > + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; > + pci_dev->v_addrs[i].r_virtbase = > + (void *)(long)cur_region->base_addr; > + /* not relevant for port io */ > + pci_dev->v_addrs[i].memory_index = 0; > + } > + > + /* success */ > + return 0; > +} > + > +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + char dir[128], name[128], comp[16]; > + int fd, r = 0; > + FILE *f; > + unsigned long long start, end, size, flags; > + PCIRegion *rp; > + PCIDevRegions *dev = &pci_dev->real_device; > + > + dev->region_number = 0; > + > + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%x/", > + r_bus, r_dev, r_func); > snprintf() > + strcpy(name, dir); > + strcat(name, "config"); > snprintf() > + fd = open(name, O_RDWR); > + if (fd == -1) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + dev->config_fd = fd; > +again: > + r = read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config); > 
Please use parens with sizeof(). > + if (r < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + fprintf(stderr, "%s: read failed, errno = %d\n", > + __func__, errno); > + } > + strcpy(name, dir); > + strcat(name, "resource"); > snprintf() > + f = fopen(name, "r"); > + if (f == NULL) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + for (r = 0; fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3; > + r++) { > Please make this a while loop. > + rp = dev->regions + r; > + rp->valid = 0; > + size = end - start + 1; > + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; > + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) > + continue; > + if (flags & IORESOURCE_MEM) { > + flags &= ~IORESOURCE_IO; > + sprintf(comp, "resource%d", r); > + strcpy(name, dir); > + strcat(name, comp); > snprintf() > + fd = open(name, O_RDWR); > + if (fd == -1) > + continue; /* probably ROM */ > + rp->resource_fd = fd; > + } else > + flags &= ~IORESOURCE_PREFETCH; > + > + rp->type = flags; > + rp->valid = 1; > + rp->base_addr = start; > + rp->size = size; > + DEBUG("%s: region %d size %d start 0x%x type %d " > + "resource_fd %d\n", __func__, r, rp->size, start, > + rp->type, rp->resource_fd); > + } > + fclose(f); > + > + dev->region_number = r; > + return 0; > +} > + > +#define MAX_ASSIGNED_DEVS 4 > +struct { > + char name[15]; > + int bus; > + int dev; > + int func; > + AssignedDevice *assigned_dev; > +} assigned_devices[MAX_ASSIGNED_DEVS]; > Any reason not to just use a list here? sys-queue.h makes that very easy. 
> +int nr_assigned_devices; > +static int disable_iommu; > + > +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn) > +{ > + return (uint32_t)bus << 8 | (uint32_t)devfn; > +} > + > +static AssignedDevice *register_real_device(PCIBus *e_bus, > + const char *e_dev_name, > + int e_devfn, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + int r; > + AssignedDevice *pci_dev; > + uint8_t e_device, e_intx; > + > + DEBUG("%s: Registering real physical device %s (devfn=0x%x)\n", > + __func__, e_dev_name, e_devfn); > + > + pci_dev = (AssignedDevice *) > + pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice), > + e_devfn, assigned_dev_pci_read_config, > + assigned_dev_pci_write_config); > + if (NULL == pci_dev) { > + fprintf(stderr, "%s: Error: Couldn't register real device %s\n", > + __func__, e_dev_name); > + return NULL; > + } > + if (get_real_device(pci_dev, r_bus, r_dev, r_func)) { > + fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n", > + __func__, e_dev_name); > + goto out; > + } > + > + /* handle real device's MMIO/PIO BARs */ > + if (assigned_dev_register_regions(pci_dev->real_device.regions, > + pci_dev->real_device.region_number, > + pci_dev)) > + goto out; > + > + /* handle interrupt routing */ > + e_device = (pci_dev->dev.devfn >> 3) & 0x1f; > + e_intx = pci_dev->dev.config[0x3d] - 1; > + pci_dev->intpin = e_intx; > + pci_dev->run = 0; > + pci_dev->girq = 0; > + pci_dev->h_busnr = r_bus; > + pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func); > + > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > + if (kvm_enabled()) { > + struct kvm_assigned_pci_dev assigned_dev_data; > + > + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data)); > + assigned_dev_data.assigned_dev_id = > + calc_assigned_dev_id(pci_dev->h_busnr, > + (uint32_t)pci_dev->h_devfn); > + assigned_dev_data.busnr = pci_dev->h_busnr; > + assigned_dev_data.devfn = pci_dev->h_devfn; > + > +#ifdef KVM_CAP_IOMMU > + /* We always enable the IOMMU if present > + * (or when not disabled 
on the command line) > + */ > + r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU); > + if (r && !disable_iommu) > + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU; > +#endif > + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data); > + if (r < 0) { > + fprintf(stderr, "Could not notify kernel about " > + "assigned device \"%s\"\n", e_dev_name); > + perror("pt-ioctl"); > + goto out; > + } > + } > +#endif > + fprintf(stderr, "Registered host PCI device %02x:%02x.%1x " > + "(\"%s\") as guest device %02x:%02x.%1x\n", > + r_bus, r_dev, r_func, e_dev_name, > + pci_bus_num(e_bus), e_device, r_func); > Please don't fprintf() unconditionally. A lot more checks are needed here to see if things can succeed. We definitely should bail out if they can't. > + > + return pci_dev; > +out: > + pci_unregister_device(&pci_dev->dev); > + return NULL; > +} > + > +extern int get_param_value(char *buf, int buf_size, > + const char *tag, const char *str); > +extern int piix_get_irq(int); > Don't do this in C files. > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > +/* The pci config space got updated. 
Check if irq numbers have changed > + * for our devices > + */ > +void assigned_dev_update_irq(PCIDevice *d) > +{ > + int i, irq, r; > + AssignedDevice *assigned_dev; > + > + for (i = 0; i < nr_assigned_devices; i++) { > + assigned_dev = assigned_devices[i].assigned_dev; > + if (assigned_dev == NULL) > + continue; > + > + irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin); > + irq = piix_get_irq(irq); > + > + if (irq != assigned_dev->girq) { > + struct kvm_assigned_irq assigned_irq_data; > + > + memset(&assigned_irq_data, 0, sizeof assigned_irq_data); > + assigned_irq_data.assigned_dev_id = > + calc_assigned_dev_id(assigned_dev->h_busnr, > + (uint8_t) > + assigned_dev->h_devfn); > + assigned_irq_data.guest_irq = irq; > + assigned_irq_data.host_irq = > + assigned_dev->real_device.irq; > + r = kvm_assign_irq(kvm_context, &assigned_irq_data); > + if (r < 0) { > + perror("assigned_dev_update_irq"); > + fprintf(stderr, "Are you assigning a device " > + "that shares IRQ with some other " > + "device?\n"); > + pci_unregister_device(&assigned_dev->dev); > + continue; > + } > + assigned_dev->girq = irq; > + } > + } > +} > +#endif > + > +static int init_device_assignment(void) > +{ > + /* Do we have any devices to be assigned? 
*/ > + if (nr_assigned_devices == 0) > + return -1; > + iopl(3); > + return 0; > +} > + > +struct PCIDevice *init_assigned_device(PCIBus *bus, int *index) > +{ > + AssignedDevice *dev = NULL; > + int i; > + > + if (*index == -1) { > + if (init_device_assignment() < 0) > + return NULL; > + > + *index = nr_assigned_devices - 1; > + } > + i = *index; > + dev = register_real_device(bus, assigned_devices[i].name, -1, > + assigned_devices[i].bus, > + assigned_devices[i].dev, > + assigned_devices[i].func); > + if (dev == NULL) { > + fprintf(stderr, "Error: Couldn't register device \"%s\"\n", > + assigned_devices[i].name); > + } > + assigned_devices[i].assigned_dev = dev; > + > + --*index; > + return &dev->dev; > +} > + > +/* > + * Syntax to assign device: > + * > + * -pcidevice dev=bus:dev.func,dma=dma > + * > + * Example: > + * -pcidevice host=00:13.0,dma=pvdma > + * > + * dma can currently only be 'none' to disable iommu support. > Does it actually work if you disable iommu support? > + */ > +void add_assigned_device(const char *arg) > +{ > + char *cp, *cp1; > + char device[8]; > + char dma[6]; > + int r; > + > + if (nr_assigned_devices >= MAX_ASSIGNED_DEVS) { > + fprintf(stderr, "Too many assigned devices (max %d)\n", > + MAX_ASSIGNED_DEVS); > + return; > + } > + memset(&assigned_devices[nr_assigned_devices], 0, > + sizeof assigned_devices[nr_assigned_devices]); > + r = get_param_value(device, sizeof device, "host", arg); > + > + r = get_param_value(assigned_devices[nr_assigned_devices].name, > + sizeof assigned_devices[nr_assigned_devices].name, > + "name", arg); > + if (!r) > + strncpy(assigned_devices[nr_assigned_devices].name, device, 8); > + > +#ifdef KVM_CAP_IOMMU > + r = get_param_value(dma, sizeof dma, "dma", arg); > + if (r && !strncmp(dma, "none", 4)) > + disable_iommu = 1; > +#endif > + cp = device; > + assigned_devices[nr_assigned_devices].bus = strtoul(cp, &cp1, 16); > + if (*cp1 != ':') > + goto bad; > + cp = cp1 + 1; > + > + 
assigned_devices[nr_assigned_devices].dev = strtoul(cp, &cp1, 16); > + if (*cp1 != '.') > + goto bad; > + cp = cp1 + 1; > + > + assigned_devices[nr_assigned_devices].func = strtoul(cp, &cp1, 16); > + > + nr_assigned_devices++; > + return; > +bad: > + fprintf(stderr, "pcidevice argument parse error; " > + "please check the help text for usage\n"); > +} > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h > new file mode 100644 > index 0000000..b77e484 > --- /dev/null > +++ b/qemu/hw/device-assignment.h > @@ -0,0 +1,93 @@ > +/* > + * Copyright (c) 2007, Neocleus Corporation. > + * Copyright (c) 2007, Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. > + * > + * Data structures for storing PCI state > + * > + * Adapted to kvm by Qumranet > + * > + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) > + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) > + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) > + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) > + */ > + > +#ifndef __DEVICE_ASSIGNMENT_H__ > +#define __DEVICE_ASSIGNMENT_H__ > + > +#include > Don't think this is needed here. > +#include "qemu-common.h" > +#include "pci.h" > +#include > Nor this. 
> + > +#define DEVICE_ASSIGNMENT_DEBUG_PIO (0x01) > +#define DEVICE_ASSIGNMENT_DEBUG_MMIO (0x02) > + > +/* From include/linux/pci.h in the kernel sources */ > +#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) > + > +typedef uint32_t pciaddr_t; > + > +#define MAX_IO_REGIONS (6) > + > +typedef struct pci_region_s { > typedef struct PCIRegion > + int type; /* Memory or port I/O */ > + int valid; > + pciaddr_t base_addr; > + pciaddr_t size; /* size of the region */ > ram_addr_t. > + int resource_fd; > +} PCIRegion; > + > +typedef struct pci_dev_s { > typedef struct PCIDevRegions > + uint8_t bus, dev, func; /* Bus inside domain, device and function */ > + int irq; /* IRQ number */ > + uint16_t region_number; /* number of active regions */ > + > + /* Port I/O or MMIO Regions */ > + PCIRegion regions[MAX_IO_REGIONS]; > + int config_fd; > +} PCIDevRegions; > + > +typedef struct assigned_dev_region_s { > + target_phys_addr_t e_physbase; > + uint32_t memory_index; > + void *r_virtbase; /* mmapped access address */ > + int num; /* our index within v_addrs[] */ > + uint32_t e_size; /* emulated size of region in bytes */ > + uint32_t r_size; /* real size of region in bytes */ > + uint32_t debug; > +} AssignedDevRegion; > + > +typedef struct assigned_dev_s { > + PCIDevice dev; > + int intpin; > + uint8_t debug_flags; > + AssignedDevRegion v_addrs[PCI_NUM_REGIONS]; > + PCIDevRegions real_device; > + int run; > + int girq; > + unsigned char h_busnr; > + unsigned int h_devfn; > + int bound; > +} AssignedDevice; > + > +/* Initialization functions */ > +PCIDevice *init_assigned_device(PCIBus *bus, int *index); > +void add_assigned_device(const char *arg); > +void assigned_dev_set_vector(int irq, int vector); > +void assigned_dev_ack_mirq(int vector); > + > +#endif /* __DEVICE_ASSIGNMENT_H__ */ > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c > index 6053103..4a611cc 100644 > --- a/qemu/hw/pc.c > +++ b/qemu/hw/pc.c > @@ -32,6 +32,7 @@ > #include "smbus.h" > #include 
"boards.h" > #include "console.h" > +#include "device-assignment.h" > > #include "qemu-kvm.h" > > @@ -1006,6 +1007,14 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, > } > } > > + /* Initialize assigned devices */ > + if (pci_enabled) { > + int r = -1; > + do { > + init_assigned_device(pci_bus, &r); > Why pass r by reference instead of just returning it? At any rate, you should detect when this fails and gracefully terminate QEMU. > + } while (r >= 0); > + } > + > rtc_state = rtc_init(0x70, i8259[8]); > > qemu_register_boot_set(pc_boot_set, rtc_state); > diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c > index 61ff0f6..e4e8386 100644 > --- a/qemu/hw/pci.c > +++ b/qemu/hw/pci.c > @@ -50,6 +50,7 @@ struct PCIBus { > > static void pci_update_mappings(PCIDevice *d); > static void pci_set_irq(void *opaque, int irq_num, int level); > +void assigned_dev_update_irq(PCIDevice *d); > > target_phys_addr_t pci_mem_base; > static int pci_irq_index; > @@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d, > val >>= 8; > } > > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > + if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() && > + address >= 0x60 && address <= 0x63) > + assigned_dev_update_irq(d); > +#endif > + > end = address + len; > if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) { > /* if the command register is modified, we must modify the mappings */ > diff --git a/qemu/vl.c b/qemu/vl.c > index 2fb8552..83f28c5 100644 > --- a/qemu/vl.c > +++ b/qemu/vl.c > @@ -37,6 +37,7 @@ > #include "qemu-char.h" > #include "block.h" > #include "audio/audio.h" > +#include "hw/device-assignment.h" > #include "migration.h" > #include "balloon.h" > #include "qemu-kvm.h" > @@ -8469,6 +8470,12 @@ static void help(int exitcode) > #endif > "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n" > "-no-kvm-pit disable KVM kernel mode PIT\n" > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + "-pcidevice 
host=bus:dev.func[,dma=none][,name=\"string\"]\n" > + " expose a PCI device to the guest OS.\n" > + " dma=none: don't perform any dma translations (default is to use an iommu)\n" > + " 'string' is used in log output.\n" > +#endif > #endif > #ifdef TARGET_I386 > "-std-vga simulate a standard VGA card with VESA Bochs Extensions\n" > @@ -8592,6 +8599,9 @@ enum { > QEMU_OPTION_no_kvm, > QEMU_OPTION_no_kvm_irqchip, > QEMU_OPTION_no_kvm_pit, > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + QEMU_OPTION_pcidevice, > +#endif > QEMU_OPTION_no_reboot, > QEMU_OPTION_no_shutdown, > QEMU_OPTION_show_cursor, > @@ -8680,6 +8690,9 @@ const QEMUOption qemu_options[] = { > #endif > { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip }, > { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit }, > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice }, > +#endif > #endif > #if defined(TARGET_PPC) || defined(TARGET_SPARC) > { "g", 1, QEMU_OPTION_g }, > @@ -9586,6 +9599,11 @@ int main(int argc, char **argv) > kvm_pit = 0; > break; > } > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + case QEMU_OPTION_pcidevice: > + add_assigned_device(optarg); > You should copy into an array, then in pc.c, iterate through the array and call into add_assigned_device. Regards, Anthony Liguori > + break; > +#endif > #endif > case QEMU_OPTION_usb: > usb_enabled = 1; >