From mboxrd@z Thu Jan 1 00:00:00 1970 From: Anthony Liguori Subject: Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests Date: Tue, 28 Oct 2008 10:45:57 -0500 Message-ID: <490733B5.5010102@codemonkey.ws> References: <1225188410-2222-1-git-send-email-muli@il.ibm.com> <1225188410-2222-2-git-send-email-muli@il.ibm.com> <1225188410-2222-3-git-send-email-muli@il.ibm.com> <1225188410-2222-4-git-send-email-muli@il.ibm.com> <1225188410-2222-5-git-send-email-muli@il.ibm.com> <1225188410-2222-6-git-send-email-muli@il.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: avi@redhat.com, kvm@vger.kernel.org, weidong.han@intel.com, benami@il.ibm.com, amit.shah@redhat.com, allen.m.kay@intel.com To: muli@il.ibm.com Return-path: Received: from mail-gx0-f18.google.com ([209.85.217.18]:55047 "EHLO mail-gx0-f18.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752852AbYJ1PqD (ORCPT ); Tue, 28 Oct 2008 11:46:03 -0400 Received: by gxk11 with SMTP id 11so1690871gxk.13 for ; Tue, 28 Oct 2008 08:46:01 -0700 (PDT) In-Reply-To: <1225188410-2222-6-git-send-email-muli@il.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: muli@il.ibm.com wrote: > Signed-off-by: Amit Shah > Signed-off-by: Muli Ben-Yehuda > --- > qemu/Makefile.target | 3 + > qemu/hw/device-assignment.c | 641 +++++++++++++++++++++++++++++++++++++++++++ > qemu/hw/device-assignment.h | 117 ++++++++ > qemu/hw/pc.c | 16 + > qemu/hw/pci.c | 7 + > qemu/qemu-kvm.c | 14 + > qemu/qemu-kvm.h | 8 + > qemu/vl.c | 28 ++ > 8 files changed, 834 insertions(+), 0 deletions(-) > create mode 100644 qemu/hw/device-assignment.c > create mode 100644 qemu/hw/device-assignment.h > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > index d9bdeca..5d44e08 100644 > --- a/qemu/Makefile.target > +++ b/qemu/Makefile.target > @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > OBJS+= fdc.o mc146818rtc.o serial.o 
i8259.o i8254.o pcspk.o pc.o > OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > +ifeq ($(USE_KVM), 1) > +OBJS+= device-assignment.o > +endif > I don't think you want to build this on PPC so I think you need a stronger check. > +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr, > + uint32_t value) > +{ > + AssignedDevRegion *r_access = opaque; > + uint32_t r_pio = guest_to_host_ioport(r_access, addr); > + > + DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n", > + r_pio, (int)r_access->e_physbase, > + (unsigned long)r_access->r_virtbase, value); > The format doesn't match the parameter count. > +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, > + uint32_t e_phys, uint32_t e_size, int type) > +{ > + AssignedDevice *r_dev = (AssignedDevice *) pci_dev; > + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > + uint32_t old_ephys = region->e_physbase; > + uint32_t old_esize = region->e_size; > + int first_map = (region->e_size == 0); > + int ret = 0; > + > + DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n", > + e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num); > + > + region->e_physbase = e_phys; > + region->e_size = e_size; > + > + if (!first_map) > + kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize); > + > + if (e_size > 0) > + ret = kvm_register_phys_mem(kvm_context, e_phys, > + region->u.r_virtbase, e_size, 0); > + if (ret != 0) { > + fprintf(stderr, "%s: Error: create new mapping failed\n", __func__); > + exit(1); > + } > +} > + > +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num, > + uint32_t addr, uint32_t size, int type) > +{ > + AssignedDevice *r_dev = (AssignedDevice *) pci_dev; > + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; > + uint32_t old_port = region->u.r_baseport; > + uint32_t old_num = region->e_size; > + int first_map = (old_num == 0); > + struct 
ioperm_data data; > + int i; > + > + region->e_physbase = addr; > + region->e_size = size; > + > + DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n", > + addr, region->u.r_baseport, type, size, region_num); > + > + memset(&data, 0, sizeof(data)); > + > + if (!first_map) { > + data.start_port = old_port; > + data.num = old_num; > + data.turn_on = 0; > + > + for (i = 0; i < smp_cpus; ++i) > + kvm_ioperm(qemu_kvm_cpu_env(i), &data); > How does this interact with VCPU hot-plug? > + } > + > + data.start_port = region->u.r_baseport; > + data.num = size; > + data.turn_on = 1; > + > + for (i = 0; i < smp_cpus; ++i) > + kvm_ioperm(qemu_kvm_cpu_env(i), &data); > + > + register_ioport_read(addr, size, 1, assigned_dev_ioport_readb, > + (r_dev->v_addrs + region_num)); > + register_ioport_read(addr, size, 2, assigned_dev_ioport_readw, > + (r_dev->v_addrs + region_num)); > + register_ioport_read(addr, size, 4, assigned_dev_ioport_readl, > + (r_dev->v_addrs + region_num)); > + register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb, > + (r_dev->v_addrs + region_num)); > + register_ioport_write(addr, size, 2, assigned_dev_ioport_writew, > + (r_dev->v_addrs + region_num)); > + register_ioport_write(addr, size, 4, assigned_dev_ioport_writel, > + (r_dev->v_addrs + region_num)); > +} > + > +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address, > + uint32_t val, int len) > +{ > + int fd; > + ssize_t ret; > + > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > + (uint16_t) address, val, len); > + > + if (address == 0x4) { > + pci_default_write_config(d, address, val, len); > + /* Continue to program the card */ > + } > + > + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || > + address == 0x3c || address == 0x3d) { > + /* used for update-mappings (BAR emulation) */ > + pci_default_write_config(d, address, val, len); > + return; > + } > + > + DEBUG("NON BAR (%x.%x): address=%04x 
val=0x%08x len=%d\n", > + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), > + (uint16_t) address, val, len); > + > + fd = ((AssignedDevice *)d)->real_device.config_fd; > + > +again: > + ret = pwrite(fd, &val, len, address); > + if (ret != len) { > + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) > + goto again; > + > + fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n", > + __func__, ret, errno); > + > + exit(1); > + } > +} > + > +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address, > + int len) > +{ > + uint32_t val = 0; > + int fd; > + ssize_t ret; > + > + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || > + address == 0x3c || address == 0x3d) { > + val = pci_default_read_config(d, address, len); > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > + return val; > + } > + > + /* vga specific, remove later */ > + if (address == 0xFC) > + goto do_log; > + > + fd = ((AssignedDevice *)d)->real_device.config_fd; > + > +again: > + ret = pread(fd, &val, len, address); > + if (ret != len) { > + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) > + goto again; > + > + fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n", > + __func__, ret, errno); > + > + exit(1); > + } > + > +do_log: > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); > + > + /* kill the special capabilities */ > + if (address == 4 && len == 4) > + val &= ~0x100000; > + else if (address == 6) > + val &= ~0x10; > + > + return val; > +} > + > +static int assigned_dev_register_regions(PCIRegion *io_regions, > + unsigned long regions_num, > + AssignedDevice *pci_dev) > +{ > + uint32_t i; > + PCIRegion *cur_region = io_regions; > + > + for (i = 0; i < regions_num; i++, cur_region++) { > + if (!cur_region->valid) > + continue; > + pci_dev->v_addrs[i].num = i; > + > + /* handle memory io regions */ > + if 
(cur_region->type & IORESOURCE_MEM) { > + int t = cur_region->type & IORESOURCE_PREFETCH > + ? PCI_ADDRESS_SPACE_MEM_PREFETCH > + : PCI_ADDRESS_SPACE_MEM; > + > + /* map physical memory */ > + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; > + pci_dev->v_addrs[i].u.r_virtbase = > + mmap(NULL, > + (cur_region->size + 0xFFF) & 0xFFFFF000, > + PROT_WRITE | PROT_READ, MAP_SHARED, > + cur_region->resource_fd, (off_t) 0); > + > + if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) { > + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!" > + "\n", __func__, > + (uint32_t) (cur_region->base_addr)); > + return -1; > + } > + pci_dev->v_addrs[i].r_size = cur_region->size; > + pci_dev->v_addrs[i].e_size = 0; > + > + /* add offset */ > + pci_dev->v_addrs[i].u.r_virtbase += > + (cur_region->base_addr & 0xFFF); > + > + pci_register_io_region((PCIDevice *) pci_dev, i, > + cur_region->size, t, > + assigned_dev_iomem_map); > + continue; > + } > + /* handle port io regions */ > + pci_register_io_region((PCIDevice *) pci_dev, i, > + cur_region->size, PCI_ADDRESS_SPACE_IO, > + assigned_dev_ioport_map); > + > + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; > + pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr; > + /* not relevant for port io */ > + pci_dev->v_addrs[i].memory_index = 0; > + } > + > + /* success */ > + return 0; > +} > + > +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + char dir[128], name[128]; > + int fd, r = 0; > + FILE *f; > + unsigned long long start, end, size, flags; > + PCIRegion *rp; > + PCIDevRegions *dev = &pci_dev->real_device; > + > + dev->region_number = 0; > + > + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/", > + r_bus, r_dev, r_func); > + > + snprintf(name, sizeof(name), "%sconfig", dir); > + > + fd = open(name, O_RDWR); > + if (fd == -1) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + dev->config_fd = fd; > +again: 
> + r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config)); > + if (r < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno); > + } > + > + snprintf(name, sizeof(name), "%sresource", dir); > + > + f = fopen(name, "r"); > + if (f == NULL) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + r = -1; > + while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) { > + r++; > + rp = dev->regions + r; > + rp->valid = 0; > + size = end - start + 1; > + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; > + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) > + continue; > + if (flags & IORESOURCE_MEM) { > + flags &= ~IORESOURCE_IO; > + snprintf(name, sizeof(name), "%sresource%d", dir, r); > + fd = open(name, O_RDWR); > + if (fd == -1) > + continue; /* probably ROM */ > + rp->resource_fd = fd; > + } else > + flags &= ~IORESOURCE_PREFETCH; > + > + rp->type = flags; > + rp->valid = 1; > + rp->base_addr = start; > + rp->size = size; > + DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n", > + r, rp->size, start, rp->type, rp->resource_fd); > + } > + fclose(f); > + > + dev->region_number = r; > + return 0; > +} > + > +static int disable_iommu; > +int nr_assigned_devices; > +static LIST_HEAD(, AssignedDevInfo) adev_head; > + > +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn) > +{ > + return (uint32_t)bus << 8 | (uint32_t)devfn; > +} > + > +static AssignedDevice *register_real_device(PCIBus *e_bus, > + const char *e_dev_name, > + int e_devfn, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + int r; > + AssignedDevice *pci_dev; > + uint8_t e_device, e_intx; > + > + DEBUG("Registering real physical device %s (devfn=0x%x)\n", > + e_dev_name, e_devfn); > + > + pci_dev = (AssignedDevice *) > + pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice), > + e_devfn, assigned_dev_pci_read_config, > + 
assigned_dev_pci_write_config); > + if (NULL == pci_dev) { > + fprintf(stderr, "%s: Error: Couldn't register real device %s\n", > + __func__, e_dev_name); > + return NULL; > + } > + if (get_real_device(pci_dev, r_bus, r_dev, r_func)) { > + fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n", > + __func__, e_dev_name); > + goto out; > + } > + > + /* handle real device's MMIO/PIO BARs */ > + if (assigned_dev_register_regions(pci_dev->real_device.regions, > + pci_dev->real_device.region_number, > + pci_dev)) > + goto out; > + > + /* handle interrupt routing */ > + e_device = (pci_dev->dev.devfn >> 3) & 0x1f; > + e_intx = pci_dev->dev.config[0x3d] - 1; > + pci_dev->intpin = e_intx; > + pci_dev->run = 0; > + pci_dev->girq = 0; > + pci_dev->h_busnr = r_bus; > + pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func); > + > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > + if (kvm_enabled()) { > + struct kvm_assigned_pci_dev assigned_dev_data; > + > + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data)); > + assigned_dev_data.assigned_dev_id = > + calc_assigned_dev_id(pci_dev->h_busnr, > + (uint32_t)pci_dev->h_devfn); > + assigned_dev_data.busnr = pci_dev->h_busnr; > + assigned_dev_data.devfn = pci_dev->h_devfn; > + > +#ifdef KVM_CAP_IOMMU > + /* We always enable the IOMMU if present > + * (or when not disabled on the command line) > + */ > + r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU); > + if (r && !disable_iommu) > + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU; > +#endif > + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data); > + if (r < 0) { > + fprintf(stderr, "Could not notify kernel about " > + "assigned device \"%s\"\n", e_dev_name); > + perror("register_real_device"); > + goto out; > + } > + } > You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined? That means a newer userspace compiled on an older kernel will silently fail if they try to do device assignment. 
There's probably no reason to build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the in-kernel PIT gets conditionally built depending on whether that cap is available). > +#endif > + term_printf("Registered host PCI device %02x:%02x.%1x " > + "(\"%s\") as guest device %02x:%02x.%1x\n", > + r_bus, r_dev, r_func, e_dev_name, > + pci_bus_num(e_bus), e_device, r_func); > > If I read the code correctly, this term_printf() happens regardless of whether this is being done for PCI hotplug or for command-line assignment? That's a problem as it'll print garbage on the monitor when you start QEMU which could break management applications. > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c > index d559f0c..5fdb726 100644 > --- a/qemu/hw/pc.c > +++ b/qemu/hw/pc.c > @@ -33,6 +33,7 @@ > #include "boards.h" > #include "console.h" > #include "fw_cfg.h" > +#include "device-assignment.h" > > #include "qemu-kvm.h" > > @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, > > if (pci_enabled) > virtio_balloon_init(pci_bus); > + > + if (kvm_enabled() && device_assignment_enabled) { > + int i; > Stray tab. > + for (i = 0; i < assigned_devices_index; i++) { > + if (add_assigned_device(assigned_devices[i]) < 0) { > + fprintf(stderr, "Warning: could not add assigned device %s\n", > + assigned_devices[i]); > + } > + } > + > + if (init_all_assigned_devices(pci_bus)) { > + fprintf(stderr, "Failed to initialize assigned devices\n"); > + exit (1); > + } > + } > } > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + case QEMU_OPTION_pcidevice: > + device_assignment_enabled = 1; > + if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) { > + fprintf(stderr, "Too many assigned devices\n"); > + exit(1); > + } > + assigned_devices[assigned_devices_index] = optarg; > + assigned_devices_index++; > + break; > Tab damage. Regards, Anthony Liguori