From mboxrd@z Thu Jan 1 00:00:00 1970 From: Mark McLoughlin Subject: Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests Date: Tue, 28 Oct 2008 16:55:22 +0000 Message-ID: <1225212922.11515.85.camel@blaa> References: <1225188410-2222-1-git-send-email-muli@il.ibm.com> <1225188410-2222-2-git-send-email-muli@il.ibm.com> <1225188410-2222-3-git-send-email-muli@il.ibm.com> <1225188410-2222-4-git-send-email-muli@il.ibm.com> <1225188410-2222-5-git-send-email-muli@il.ibm.com> <1225188410-2222-6-git-send-email-muli@il.ibm.com> Reply-To: Mark McLoughlin Mime-Version: 1.0 Content-Type: text/plain Content-Transfer-Encoding: 7bit Cc: avi@redhat.com, kvm@vger.kernel.org, anthony@codemonkey.ws, weidong.han@intel.com, benami@il.ibm.com, amit.shah@redhat.com, allen.m.kay@intel.com To: muli@il.ibm.com Return-path: Received: from mx2.redhat.com ([66.187.237.31]:41311 "EHLO mx2.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752563AbYJ1RAQ (ORCPT ); Tue, 28 Oct 2008 13:00:16 -0400 In-Reply-To: <1225188410-2222-6-git-send-email-muli@il.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: On Tue, 2008-10-28 at 12:06 +0200, muli@il.ibm.com wrote: ... > +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + char dir[128], name[128]; > + int fd, r = 0; > + FILE *f; > + unsigned long long start, end, size, flags; > + PCIRegion *rp; > + PCIDevRegions *dev = &pci_dev->real_device; > + > + dev->region_number = 0; > + > + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/", > + r_bus, r_dev, r_func); > + > + snprintf(name, sizeof(name), "%sconfig", dir); > + > + fd = open(name, O_RDWR); > + if (fd == -1) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + dev->config_fd = fd; > +again: > + r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config)); > + if (r < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno); > + } > + > + snprintf(name, sizeof(name), "%sresource", dir); > + > + f = fopen(name, "r"); > + if (f == NULL) { > + fprintf(stderr, "%s: %s: %m\n", __func__, name); > + return 1; > + } > + r = -1; > + while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) { > + r++; > + rp = dev->regions + r; Could, in theory, overflow dev->regions here. Suggest: + for (r = 0; r < MAX_IO_REGIONS; r++) { + if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) + break; > + rp->valid = 0; > + size = end - start + 1; > + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; > + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) > + continue; > + if (flags & IORESOURCE_MEM) { > + flags &= ~IORESOURCE_IO; > + snprintf(name, sizeof(name), "%sresource%d", dir, r); > + fd = open(name, O_RDWR); > + if (fd == -1) > + continue; /* probably ROM */ > + rp->resource_fd = fd; > + } else > + flags &= ~IORESOURCE_PREFETCH; > + > + rp->type = flags; > + rp->valid = 1; > + rp->base_addr = start; > + rp->size = size; > + DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n", > + r, rp->size, start, rp->type, rp->resource_fd); > + } > + fclose(f); > + > + dev->region_number = r; > + return 0; > +} > + > +static int disable_iommu; Why is this global? The flag is set per-device on the command-line and only affects whether we pass KVM_DEV_ASSIGN_ENABLE_IOMMU to kvm_assign_pci_device() > +int nr_assigned_devices; > +static LIST_HEAD(, AssignedDevInfo) adev_head; > + > +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn) > +{ > + return (uint32_t)bus << 8 | (uint32_t)devfn; > +} > + > +static AssignedDevice *register_real_device(PCIBus *e_bus, > + const char *e_dev_name, > + int e_devfn, uint8_t r_bus, > + uint8_t r_dev, uint8_t r_func) > +{ > + int r; > + AssignedDevice *pci_dev; > + uint8_t e_device, e_intx; > + > + DEBUG("Registering real physical device %s (devfn=0x%x)\n", > + e_dev_name, e_devfn); > + > + pci_dev = (AssignedDevice *) > + pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice), > + e_devfn, assigned_dev_pci_read_config, > + assigned_dev_pci_write_config); > + if (NULL == pci_dev) { > + fprintf(stderr, "%s: Error: Couldn't register real device %s\n", > + __func__, e_dev_name); > + return NULL; > + } > + if (get_real_device(pci_dev, r_bus, r_dev, r_func)) { > + fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n", > + __func__, e_dev_name); > + goto out; > + } > + > + /* handle real device's MMIO/PIO BARs */ > + if (assigned_dev_register_regions(pci_dev->real_device.regions, > + pci_dev->real_device.region_number, > + pci_dev)) > + goto out; > + > + /* handle interrupt routing */ > + e_device = (pci_dev->dev.devfn >> 3) & 0x1f; > + e_intx = pci_dev->dev.config[0x3d] - 1; > + pci_dev->intpin = e_intx; > + pci_dev->run = 0; > + pci_dev->girq = 0; > + pci_dev->h_busnr = r_bus; > + pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func); > + > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > + if (kvm_enabled()) { > + struct kvm_assigned_pci_dev assigned_dev_data; > + > + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data)); > + assigned_dev_data.assigned_dev_id = > + calc_assigned_dev_id(pci_dev->h_busnr, > + (uint32_t)pci_dev->h_devfn); > + assigned_dev_data.busnr = pci_dev->h_busnr; > + assigned_dev_data.devfn = pci_dev->h_devfn; > + > +#ifdef KVM_CAP_IOMMU > + /* We always enable the IOMMU if present > + * (or when not disabled on the command line) > + */ > + r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU); > + if (r && !disable_iommu) > + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU; > +#endif > + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data); > + if (r < 0) { > + fprintf(stderr, "Could not notify kernel about " > + "assigned device \"%s\"\n", e_dev_name); > + perror("register_real_device"); > + goto out; > + } > + } > +#endif > + term_printf("Registered host PCI device %02x:%02x.%1x " > + "(\"%s\") as guest device %02x:%02x.%1x\n", > + r_bus, r_dev, r_func, e_dev_name, > + pci_bus_num(e_bus), e_device, r_func); > + > + return pci_dev; > +out: > +/* pci_unregister_device(&pci_dev->dev); */ > + return NULL; > +} > + > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > +/* The pci config space got updated. Check if irq numbers have changed > + * for our devices > + */ > +void assigned_dev_update_irq(PCIDevice *d) > +{ > + int irq, r; > + AssignedDevice *assigned_dev; > + AssignedDevInfo *adev; > + > + LIST_FOREACH(adev, &adev_head, next) { > + assigned_dev = adev->assigned_dev; > + irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin); > + irq = piix_get_irq(irq); > + > + if (irq != assigned_dev->girq) { > + struct kvm_assigned_irq assigned_irq_data; > + > + memset(&assigned_irq_data, 0, sizeof(assigned_irq_data)); > + assigned_irq_data.assigned_dev_id = > + calc_assigned_dev_id(assigned_dev->h_busnr, > + (uint8_t) assigned_dev->h_devfn); > + assigned_irq_data.guest_irq = irq; > + assigned_irq_data.host_irq = assigned_dev->real_device.irq; > + r = kvm_assign_irq(kvm_context, &assigned_irq_data); > + if (r < 0) { > + perror("assigned_dev_update_irq"); > + fprintf(stderr, "Are you assigning a device " > + "that shares IRQ with some other device?\n"); > + pci_unregister_device(&assigned_dev->dev); > + /* FIXME: Delete node from list */ > + continue; > + } > + assigned_dev->girq = irq; > + } > + } > +} > +#endif > + > +struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus) > +{ > + adev->assigned_dev = register_real_device(bus, > + adev->name, -1, > + adev->bus, > + adev->dev, > + adev->func); > + return &adev->assigned_dev->dev; > +} This looks unnecessary - register_real_device() isn't used anywhere else. Why not just move register_real_device() into init_assigned_device() ? > +int init_all_assigned_devices(PCIBus *bus) > +{ > + struct AssignedDevInfo *adev; > + > + LIST_FOREACH(adev, &adev_head, next) > + if (init_assigned_device(adev, bus) == NULL) > + return -1; > + > + return 0; > +} > + > +/* > + * Syntax to assign device: > + * > + * -pcidevice dev=bus:dev.func,dma=dma ^^^ ^^^ Should be: -pcidevice host=bus:dev.func[,dma=none][,name=string] > + * > + * Example: > + * -pcidevice host=00:13.0,dma=pvdma ^^^^^ Should be: -pcidevice host=00:13.0,dma=none,name=Foo > + * > + * dma can currently only be 'none' to disable iommu support. > + */ > +AssignedDevInfo *add_assigned_device(const char *arg) > +{ > + char *cp, *cp1; > + char device[8]; > + char dma[6]; > + int r; > + AssignedDevInfo *adev; > + > + adev = qemu_mallocz(sizeof(AssignedDevInfo)); > + if (adev == NULL) { > + fprintf(stderr, "%s: Out of memory\n", __func__); > + return NULL; > + } > + r = get_param_value(device, sizeof(device), "host", arg); > + r = get_param_value(adev->name, sizeof(adev->name), "name", arg); > + if (!r) > + snprintf(adev->name, sizeof(adev->name), "%s", device); > + > +#ifdef KVM_CAP_IOMMU > + r = get_param_value(dma, sizeof(dma), "dma", arg); > + if (r && !strncmp(dma, "none", 4)) > + disable_iommu = 1; > +#endif > + cp = device; > + adev->bus = strtoul(cp, &cp1, 16); > + if (*cp1 != ':') > + goto bad; > + cp = cp1 + 1; > + > + adev->dev = strtoul(cp, &cp1, 16); > + if (*cp1 != '.') > + goto bad; > + cp = cp1 + 1; > + > + adev->func = strtoul(cp, &cp1, 16); > + > + nr_assigned_devices++; nr_assigned_devices isn't actually used anywhere. > + LIST_INSERT_HEAD(&adev_head, adev, next); > + return adev; > +bad: > + fprintf(stderr, "pcidevice argument parse error; " > + "please check the help text for usage\n"); > + qemu_free(adev); > + return NULL; > +} > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h > new file mode 100644 > index 0000000..ebc0b50 > --- /dev/null > +++ b/qemu/hw/device-assignment.h > @@ -0,0 +1,117 @@ > +/* > + * Copyright (c) 2007, Neocleus Corporation. > + * Copyright (c) 2007, Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. > + * > + * Data structures for storing PCI state > + * > + * Adapted to kvm by Qumranet > + * > + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) > + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) > + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) > + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) > + */ > + > +#ifndef __DEVICE_ASSIGNMENT_H__ > +#define __DEVICE_ASSIGNMENT_H__ > + > +#include > +#include "qemu-common.h" > +#include "sys-queue.h" > +#include "pci.h" > + > +/* From include/linux/pci.h in the kernel sources */ > +#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) > + > +#define MAX_IO_REGIONS (6) Perhaps a comment to say this is the number of BARs in the config space header? > +typedef struct { > + int type; /* Memory or port I/O */ > + int valid; > + uint32_t base_addr; > + uint32_t size; /* size of the region */ > + int resource_fd; > +} PCIRegion; > + > +typedef struct { > + uint8_t bus, dev, func; /* Bus inside domain, device and function */ > + int irq; /* IRQ number */ > + uint16_t region_number; /* number of active regions */ > + > + /* Port I/O or MMIO Regions */ > + PCIRegion regions[MAX_IO_REGIONS]; > + int config_fd; > +} PCIDevRegions; > + > +typedef struct { > + target_phys_addr_t e_physbase; > + uint32_t memory_index; > + union { > + void *r_virtbase; /* mmapped access address for memory regions */ > + uint32_t r_baseport; /* the base guest port for I/O regions */ > + } u; > + int num; /* our index within v_addrs[] */ > + uint32_t e_size; /* emulated size of region in bytes */ > + uint32_t r_size; /* real size of region in bytes */ > +} AssignedDevRegion; > + > +typedef struct { > + PCIDevice dev; > + int intpin; > + uint8_t debug_flags; > + AssignedDevRegion v_addrs[PCI_NUM_REGIONS]; > + PCIDevRegions real_device; > + int run; > + int girq; > + unsigned char h_busnr; > + unsigned int h_devfn; > + int bound; > +} AssignedDevice; > + > +typedef struct AssignedDevInfo AssignedDevInfo; > + > +struct AssignedDevInfo { > + char name[15]; > + int bus; > + int dev; > + int func; > + AssignedDevice *assigned_dev; > + LIST_ENTRY(AssignedDevInfo) next; > +}; > + > +PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus); > +AssignedDevInfo *add_assigned_device(const char *arg); > +void assigned_dev_set_vector(int irq, int vector); > +void assigned_dev_ack_mirq(int vector); > + > + > +#ifdef USE_KVM > +int init_all_assigned_devices(PCIBus *bus); > +#else /* not using kvm */ > +static inline int init_all_assigned_devices(PCIBus *bus) > +{ > + return 0; > +} > +#endif /* !USE_KVM */ > + > + > +#define MAX_DEV_ASSIGN_CMDLINE 8 > + > +extern int device_assignment_enabled; > +extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE]; > +extern int assigned_devices_index; Neither of these two are implemented anywhere. > +#endif /* __DEVICE_ASSIGNMENT_H__ */ > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c > index d559f0c..5fdb726 100644 > --- a/qemu/hw/pc.c > +++ b/qemu/hw/pc.c > @@ -33,6 +33,7 @@ > #include "boards.h" > #include "console.h" > #include "fw_cfg.h" > +#include "device-assignment.h" > > #include "qemu-kvm.h" > > @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, > > if (pci_enabled) > virtio_balloon_init(pci_bus); > + > + if (kvm_enabled() && device_assignment_enabled) { The device_assignment_enabled flag looks like it shouldn't be needed. If assigned_devices_index remains zero, nothing should happen anyway. > + int i; > + for (i = 0; i < assigned_devices_index; i++) { > + if (add_assigned_device(assigned_devices[i]) < 0) { > + fprintf(stderr, "Warning: could not add assigned device %s\n", > + assigned_devices[i]); > + } > + } > + > + if (init_all_assigned_devices(pci_bus)) { > + fprintf(stderr, "Failed to initialize assigned devices\n"); > + exit (1); > + } > + } > } > > static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size, > diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c > index c82cd20..f86a8a7 100644 > --- a/qemu/hw/pci.c > +++ b/qemu/hw/pci.c > @@ -50,6 +50,7 @@ struct PCIBus { > > static void pci_update_mappings(PCIDevice *d); > static void pci_set_irq(void *opaque, int irq_num, int level); > +void assigned_dev_update_irq(PCIDevice *d); > > target_phys_addr_t pci_mem_base; > static int pci_irq_index; > @@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d, > val >>= 8; > } > > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT > + if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() && > + address >= 0x60 && address <= 0x63) > + assigned_dev_update_irq(d); > +#endif Outside of the context of piix_pci.c, it's difficult to figure out what the 0x60-0x63 register range relates to - i.e. you need to know to go digging in the PIIX spec. How about something like in qemu/hw/pc.h: +/* config space register for IRQ routing */ +#define PIIX_CONFIG_IRQ_ROUTE 0x60 then: if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() && address >= PIIX_CONFIG_IRQ_ROUTE && address < PIIX_CONFIG_IRQ_ROUTE + 4) > + > end = address + len; > if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) { > /* if the command register is modified, we must modify the mappings */ > diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c > index c5f3f29..5e66832 100644 > --- a/qemu/qemu-kvm.c > +++ b/qemu/qemu-kvm.c > @@ -20,6 +20,7 @@ int kvm_pit = 1; > #include "console.h" > #include "block.h" > #include "compatfd.h" > +#include "hw/device-assignment.h" > > #include "qemu-kvm.h" > #include > @@ -27,6 +28,7 @@ int kvm_pit = 1; > #include > #include > #include > +#include > > #define bool _Bool > #define false 0 > @@ -1047,3 +1049,15 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr, > { > return kvm_unregister_coalesced_mmio(kvm_context, addr, size); > } > + > +static void kvm_do_ioperm(void *_data) > +{ > + struct ioperm_data *data = _data; > + ioperm(data->start_port, data->num, data->turn_on); > +} > + > +void kvm_ioperm(CPUState *env, void *data) > +{ > + if (kvm_enabled() && qemu_system_ready) > + on_vcpu(env, kvm_do_ioperm, data); > +} > diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h > index a1d6646..92d921d 100644 > --- a/qemu/qemu-kvm.h > +++ b/qemu/qemu-kvm.h > @@ -93,6 +93,8 @@ int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr, > > void qemu_kvm_system_reset_request(void); > > +void kvm_ioperm(CPUState *env, void *data); > + > #ifdef TARGET_PPC > int handle_powerpc_dcr_read(int vcpu, uint32_t dcrn, uint32_t *data); > int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data); > @@ -107,6 +109,12 @@ int handle_powerpc_dcr_write(int vcpu,uint32_t dcrn, uint32_t data); > extern int kvm_allowed; > extern kvm_context_t kvm_context; > > +struct ioperm_data { > + unsigned long start_port; > + unsigned long num; > + int turn_on; > +}; > + > #define kvm_enabled() (kvm_allowed) > #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context) > #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context) > diff --git a/qemu/vl.c b/qemu/vl.c > index 388e79d..9dda2f9 100644 > --- a/qemu/vl.c > +++ b/qemu/vl.c > @@ -38,6 +38,7 @@ > #include "qemu-char.h" > #include "block.h" > #include "audio/audio.h" > +#include "hw/device-assignment.h" > #include "migration.h" > #include "balloon.h" > #include "qemu-kvm.h" > @@ -215,6 +216,9 @@ CharDriverState *parallel_hds[MAX_PARALLEL_PORTS]; > int win2k_install_hack = 0; > #endif > int usb_enabled = 0; > +int device_assignment_enabled = 0; > +const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE]; > +int assigned_devices_index; > static VLANState *first_vlan; > int smp_cpus = 1; > const char *vnc_display; > @@ -8692,6 +8696,12 @@ static void help(int exitcode) > #endif > "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n" > "-no-kvm-pit disable KVM kernel mode PIT\n" > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > + "-pcidevice host=bus:dev.func[,dma=none][,name=\"string\"]\n" name="string" isn't correct, is it? e.g. this won't work -pcidevice host=04:08.0,name="Foo Bar" but this will: -pcidevice "host=04:08.0,name=Foo Bar" ... Cheers, Mark.