From: Anthony Liguori <anthony@codemonkey.ws>
To: muli@il.ibm.com
Cc: avi@redhat.com, kvm@vger.kernel.org, weidong.han@intel.com,
benami@il.ibm.com, amit.shah@redhat.com, allen.m.kay@intel.com
Subject: Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
Date: Tue, 28 Oct 2008 10:45:57 -0500 [thread overview]
Message-ID: <490733B5.5010102@codemonkey.ws> (raw)
In-Reply-To: <1225188410-2222-6-git-send-email-muli@il.ibm.com>
muli@il.ibm.com wrote:
> Signed-off-by: Amit Shah <amit.shah@redhat.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
> qemu/Makefile.target | 3 +
> qemu/hw/device-assignment.c | 641 +++++++++++++++++++++++++++++++++++++++++++
> qemu/hw/device-assignment.h | 117 ++++++++
> qemu/hw/pc.c | 16 +
> qemu/hw/pci.c | 7 +
> qemu/qemu-kvm.c | 14 +
> qemu/qemu-kvm.h | 8 +
> qemu/vl.c | 28 ++
> 8 files changed, 834 insertions(+), 0 deletions(-)
> create mode 100644 qemu/hw/device-assignment.c
> create mode 100644 qemu/hw/device-assignment.h
>
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index d9bdeca..5d44e08 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
> OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
> OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
> OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +ifeq ($(USE_KVM), 1)
> +OBJS+= device-assignment.o
> +endif
>
I don't think you want to build this on PPC so I think you need a
stronger check.
> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
> + uint32_t value)
> +{
> + AssignedDevRegion *r_access = opaque;
> + uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> + DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> + r_pio, (int)r_access->e_physbase,
> + (unsigned long)r_access->r_virtbase, value);
>
The format string doesn't match the argument count: it has five
conversions (starting with %s) but only four arguments are passed.
> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> + uint32_t e_phys, uint32_t e_size, int type)
> +{
> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + uint32_t old_ephys = region->e_physbase;
> + uint32_t old_esize = region->e_size;
> + int first_map = (region->e_size == 0);
> + int ret = 0;
> +
> + DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
> + e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
> +
> + region->e_physbase = e_phys;
> + region->e_size = e_size;
> +
> + if (!first_map)
> + kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
> +
> + if (e_size > 0)
> + ret = kvm_register_phys_mem(kvm_context, e_phys,
> + region->u.r_virtbase, e_size, 0);
> + if (ret != 0) {
> + fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
> + exit(1);
> + }
> +}
> +
> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
> + uint32_t addr, uint32_t size, int type)
> +{
> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + uint32_t old_port = region->u.r_baseport;
> + uint32_t old_num = region->e_size;
> + int first_map = (old_num == 0);
> + struct ioperm_data data;
> + int i;
> +
> + region->e_physbase = addr;
> + region->e_size = size;
> +
> + DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
> + addr, region->u.r_baseport, type, size, region_num);
> +
> + memset(&data, 0, sizeof(data));
> +
> + if (!first_map) {
> + data.start_port = old_port;
> + data.num = old_num;
> + data.turn_on = 0;
> +
> + for (i = 0; i < smp_cpus; ++i)
> + kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>
How does this interact with VCPU hot-plug?
> + }
> +
> + data.start_port = region->u.r_baseport;
> + data.num = size;
> + data.turn_on = 1;
> +
> + for (i = 0; i < smp_cpus; ++i)
> + kvm_ioperm(qemu_kvm_cpu_env(i), &data);
> +
> + register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
> + (r_dev->v_addrs + region_num));
> + register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
> + (r_dev->v_addrs + region_num));
> + register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
> + (r_dev->v_addrs + region_num));
> +}
> +
> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
> + uint32_t val, int len)
> +{
> + int fd;
> + ssize_t ret;
> +
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> + (uint16_t) address, val, len);
> +
> + if (address == 0x4) {
> + pci_default_write_config(d, address, val, len);
> + /* Continue to program the card */
> + }
> +
> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> + address == 0x3c || address == 0x3d) {
> + /* used for update-mappings (BAR emulation) */
> + pci_default_write_config(d, address, val, len);
> + return;
> + }
> +
> + DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
> + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> + (uint16_t) address, val, len);
> +
> + fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> + ret = pwrite(fd, &val, len, address);
> + if (ret != len) {
> + if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> + goto again;
> +
> + fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> + __func__, ret, errno);
> +
> + exit(1);
> + }
> +}
> +
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
> + int len)
> +{
> + uint32_t val = 0;
> + int fd;
> + ssize_t ret;
> +
> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> + address == 0x3c || address == 0x3d) {
> + val = pci_default_read_config(d, address, len);
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> + return val;
> + }
> +
> + /* vga specific, remove later */
> + if (address == 0xFC)
> + goto do_log;
> +
> + fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> + ret = pread(fd, &val, len, address);
> + if (ret != len) {
> + if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> + goto again;
> +
> + fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
> + __func__, ret, errno);
> +
> + exit(1);
> + }
> +
> +do_log:
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> +
> + /* kill the special capabilities */
> + if (address == 4 && len == 4)
> + val &= ~0x100000;
> + else if (address == 6)
> + val &= ~0x10;
> +
> + return val;
> +}
> +
> +static int assigned_dev_register_regions(PCIRegion *io_regions,
> + unsigned long regions_num,
> + AssignedDevice *pci_dev)
> +{
> + uint32_t i;
> + PCIRegion *cur_region = io_regions;
> +
> + for (i = 0; i < regions_num; i++, cur_region++) {
> + if (!cur_region->valid)
> + continue;
> + pci_dev->v_addrs[i].num = i;
> +
> + /* handle memory io regions */
> + if (cur_region->type & IORESOURCE_MEM) {
> + int t = cur_region->type & IORESOURCE_PREFETCH
> + ? PCI_ADDRESS_SPACE_MEM_PREFETCH
> + : PCI_ADDRESS_SPACE_MEM;
> +
> + /* map physical memory */
> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> + pci_dev->v_addrs[i].u.r_virtbase =
> + mmap(NULL,
> + (cur_region->size + 0xFFF) & 0xFFFFF000,
> + PROT_WRITE | PROT_READ, MAP_SHARED,
> + cur_region->resource_fd, (off_t) 0);
> +
> + if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
> + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
> + "\n", __func__,
> + (uint32_t) (cur_region->base_addr));
> + return -1;
> + }
> + pci_dev->v_addrs[i].r_size = cur_region->size;
> + pci_dev->v_addrs[i].e_size = 0;
> +
> + /* add offset */
> + pci_dev->v_addrs[i].u.r_virtbase +=
> + (cur_region->base_addr & 0xFFF);
> +
> + pci_register_io_region((PCIDevice *) pci_dev, i,
> + cur_region->size, t,
> + assigned_dev_iomem_map);
> + continue;
> + }
> + /* handle port io regions */
> + pci_register_io_region((PCIDevice *) pci_dev, i,
> + cur_region->size, PCI_ADDRESS_SPACE_IO,
> + assigned_dev_ioport_map);
> +
> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> + pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
> + /* not relevant for port io */
> + pci_dev->v_addrs[i].memory_index = 0;
> + }
> +
> + /* success */
> + return 0;
> +}
> +
> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> + uint8_t r_dev, uint8_t r_func)
> +{
> + char dir[128], name[128];
> + int fd, r = 0;
> + FILE *f;
> + unsigned long long start, end, size, flags;
> + PCIRegion *rp;
> + PCIDevRegions *dev = &pci_dev->real_device;
> +
> + dev->region_number = 0;
> +
> + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> + r_bus, r_dev, r_func);
> +
> + snprintf(name, sizeof(name), "%sconfig", dir);
> +
> + fd = open(name, O_RDWR);
> + if (fd == -1) {
> + fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + return 1;
> + }
> + dev->config_fd = fd;
> +again:
> + r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> + if (r < 0) {
> + if (errno == EINTR || errno == EAGAIN)
> + goto again;
> + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> + }
> +
> + snprintf(name, sizeof(name), "%sresource", dir);
> +
> + f = fopen(name, "r");
> + if (f == NULL) {
> + fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + return 1;
> + }
> + r = -1;
> + while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> + r++;
> + rp = dev->regions + r;
> + rp->valid = 0;
> + size = end - start + 1;
> + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> + continue;
> + if (flags & IORESOURCE_MEM) {
> + flags &= ~IORESOURCE_IO;
> + snprintf(name, sizeof(name), "%sresource%d", dir, r);
> + fd = open(name, O_RDWR);
> + if (fd == -1)
> + continue; /* probably ROM */
> + rp->resource_fd = fd;
> + } else
> + flags &= ~IORESOURCE_PREFETCH;
> +
> + rp->type = flags;
> + rp->valid = 1;
> + rp->base_addr = start;
> + rp->size = size;
> + DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> + r, rp->size, start, rp->type, rp->resource_fd);
> + }
> + fclose(f);
> +
> + dev->region_number = r;
> + return 0;
> +}
> +
> +static int disable_iommu;
> +int nr_assigned_devices;
> +static LIST_HEAD(, AssignedDevInfo) adev_head;
> +
> +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> +{
> + return (uint32_t)bus << 8 | (uint32_t)devfn;
> +}
> +
> +static AssignedDevice *register_real_device(PCIBus *e_bus,
> + const char *e_dev_name,
> + int e_devfn, uint8_t r_bus,
> + uint8_t r_dev, uint8_t r_func)
> +{
> + int r;
> + AssignedDevice *pci_dev;
> + uint8_t e_device, e_intx;
> +
> + DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> + e_dev_name, e_devfn);
> +
> + pci_dev = (AssignedDevice *)
> + pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> + e_devfn, assigned_dev_pci_read_config,
> + assigned_dev_pci_write_config);
> + if (NULL == pci_dev) {
> + fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> + __func__, e_dev_name);
> + return NULL;
> + }
> + if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> + fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> + __func__, e_dev_name);
> + goto out;
> + }
> +
> + /* handle real device's MMIO/PIO BARs */
> + if (assigned_dev_register_regions(pci_dev->real_device.regions,
> + pci_dev->real_device.region_number,
> + pci_dev))
> + goto out;
> +
> + /* handle interrupt routing */
> + e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> + e_intx = pci_dev->dev.config[0x3d] - 1;
> + pci_dev->intpin = e_intx;
> + pci_dev->run = 0;
> + pci_dev->girq = 0;
> + pci_dev->h_busnr = r_bus;
> + pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> + if (kvm_enabled()) {
> + struct kvm_assigned_pci_dev assigned_dev_data;
> +
> + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> + assigned_dev_data.assigned_dev_id =
> + calc_assigned_dev_id(pci_dev->h_busnr,
> + (uint32_t)pci_dev->h_devfn);
> + assigned_dev_data.busnr = pci_dev->h_busnr;
> + assigned_dev_data.devfn = pci_dev->h_devfn;
> +
> +#ifdef KVM_CAP_IOMMU
> + /* We always enable the IOMMU if present
> + * (or when not disabled on the command line)
> + */
> + r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> + if (r && !disable_iommu)
> + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +#endif
> + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> + if (r < 0) {
> + fprintf(stderr, "Could not notify kernel about "
> + "assigned device \"%s\"\n", e_dev_name);
> + perror("register_real_device");
> + goto out;
> + }
> + }
>
You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined? That
means a newer userspace compiled on an older kernel will silently fail
if they try to do device assignment. There's probably no reason to
build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the
in-kernel PIT gets conditionally built depending on whether that cap is
available).
> +#endif
> + term_printf("Registered host PCI device %02x:%02x.%1x "
> + "(\"%s\") as guest device %02x:%02x.%1x\n",
> + r_bus, r_dev, r_func, e_dev_name,
> + pci_bus_num(e_bus), e_device, r_func);
>
>
If I read the code correctly, this term_printf() happens regardless of
whether this is being done for PCI hotplug or for command-line
assignment? That's a problem as it'll print garbage on the monitor when
you start QEMU, which could break management applications.
> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> index d559f0c..5fdb726 100644
> --- a/qemu/hw/pc.c
> +++ b/qemu/hw/pc.c
> @@ -33,6 +33,7 @@
> #include "boards.h"
> #include "console.h"
> #include "fw_cfg.h"
> +#include "device-assignment.h"
>
> #include "qemu-kvm.h"
>
> @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
>
> if (pci_enabled)
> virtio_balloon_init(pci_bus);
> +
> + if (kvm_enabled() && device_assignment_enabled) {
> + int i;
>
Stray tab.
> + for (i = 0; i < assigned_devices_index; i++) {
> + if (add_assigned_device(assigned_devices[i]) < 0) {
> + fprintf(stderr, "Warning: could not add assigned device %s\n",
> + assigned_devices[i]);
> + }
> + }
> +
> + if (init_all_assigned_devices(pci_bus)) {
> + fprintf(stderr, "Failed to initialize assigned devices\n");
> + exit (1);
> + }
> + }
> }
> +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
> + case QEMU_OPTION_pcidevice:
> + device_assignment_enabled = 1;
> + if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
> + fprintf(stderr, "Too many assigned devices\n");
> + exit(1);
> + }
> + assigned_devices[assigned_devices_index] = optarg;
> + assigned_devices_index++;
> + break;
>
Tab damage.
Regards,
Anthony Liguori
next prev parent reply other threads:[~2008-10-28 15:46 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-28 10:06 [v8] Userspace patches for PCI device assignment muli
2008-10-28 10:06 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-28 10:06 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-28 10:06 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-28 10:06 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-28 10:06 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-28 10:06 ` [PATCH 6/6] device assignment: support for hot-plugging PCI devices muli
2008-10-28 14:10 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Han, Weidong
2008-10-28 15:32 ` Muli Ben-Yehuda
[not found] ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
2008-10-28 15:31 ` Han, Weidong
2008-10-28 15:36 ` Han, Weidong
2008-10-28 15:47 ` Muli Ben-Yehuda
2008-10-28 15:45 ` Anthony Liguori [this message]
2008-10-28 15:53 ` Muli Ben-Yehuda
2008-10-29 7:56 ` Zhang, Xiantao
2008-10-29 10:27 ` Muli Ben-Yehuda
2008-10-29 8:22 ` Han, Weidong
2008-10-29 10:25 ` Muli Ben-Yehuda
2008-10-29 10:39 ` Muli Ben-Yehuda
2008-10-28 16:55 ` Mark McLoughlin
2008-10-29 10:31 ` Muli Ben-Yehuda
2008-10-29 11:07 ` Mark McLoughlin
2008-10-29 11:15 ` Mark McLoughlin
2008-10-29 11:47 ` Muli Ben-Yehuda
2008-10-29 7:38 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support Zhang, Xiantao
-- strict thread matches above, loose matches on Subject: below --
2008-10-29 10:22 [v9] Userspace patches for PCI device assignment muli
2008-10-29 10:22 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 10:22 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 10:22 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 10:22 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 10:22 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 12:19 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 12:19 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 12:19 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 12:20 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-29 12:27 ` Mark McLoughlin
2008-10-29 14:40 ` Muli Ben-Yehuda
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=490733B5.5010102@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=allen.m.kay@intel.com \
--cc=amit.shah@redhat.com \
--cc=avi@redhat.com \
--cc=benami@il.ibm.com \
--cc=kvm@vger.kernel.org \
--cc=muli@il.ibm.com \
--cc=weidong.han@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox