From: Anthony Liguori <anthony@codemonkey.ws>
To: muli@il.ibm.com
Cc: avi@redhat.com, kvm@vger.kernel.org, weidong.han@intel.com,
benami@il.ibm.com, amit.shah@redhat.com, allen.m.kay@intel.com
Subject: Re: [PATCH 5/6] device assignment: support for assigning PCI devices to guests
Date: Tue, 28 Oct 2008 10:45:57 -0500 [thread overview]
Message-ID: <490733B5.5010102@codemonkey.ws> (raw)
In-Reply-To: <1225188410-2222-6-git-send-email-muli@il.ibm.com>
muli@il.ibm.com wrote:
> Signed-off-by: Amit Shah <amit.shah@redhat.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
> qemu/Makefile.target | 3 +
> qemu/hw/device-assignment.c | 641 +++++++++++++++++++++++++++++++++++++++++++
> qemu/hw/device-assignment.h | 117 ++++++++
> qemu/hw/pc.c | 16 +
> qemu/hw/pci.c | 7 +
> qemu/qemu-kvm.c | 14 +
> qemu/qemu-kvm.h | 8 +
> qemu/vl.c | 28 ++
> 8 files changed, 834 insertions(+), 0 deletions(-)
> create mode 100644 qemu/hw/device-assignment.c
> create mode 100644 qemu/hw/device-assignment.h
>
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index d9bdeca..5d44e08 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
> OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
> OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
> OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +ifeq ($(USE_KVM), 1)
> +OBJS+= device-assignment.o
> +endif
>
I don't think you want to build this on PPC so I think you need a
stronger check.
> +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
> + uint32_t value)
> +{
> + AssignedDevRegion *r_access = opaque;
> + uint32_t r_pio = guest_to_host_ioport(r_access, addr);
> +
> + DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
> + r_pio, (int)r_access->e_physbase,
> + (unsigned long)r_access->r_virtbase, value);
>
The format doesn't match the parameter count.
> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> + uint32_t e_phys, uint32_t e_size, int type)
> +{
> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + uint32_t old_ephys = region->e_physbase;
> + uint32_t old_esize = region->e_size;
> + int first_map = (region->e_size == 0);
> + int ret = 0;
> +
> + DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
> + e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
> +
> + region->e_physbase = e_phys;
> + region->e_size = e_size;
> +
> + if (!first_map)
> + kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
> +
> + if (e_size > 0)
> + ret = kvm_register_phys_mem(kvm_context, e_phys,
> + region->u.r_virtbase, e_size, 0);
> + if (ret != 0) {
> + fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
> + exit(1);
> + }
> +}
> +
> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
> + uint32_t addr, uint32_t size, int type)
> +{
> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + uint32_t old_port = region->u.r_baseport;
> + uint32_t old_num = region->e_size;
> + int first_map = (old_num == 0);
> + struct ioperm_data data;
> + int i;
> +
> + region->e_physbase = addr;
> + region->e_size = size;
> +
> + DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
> + addr, region->u.r_baseport, type, size, region_num);
> +
> + memset(&data, 0, sizeof(data));
> +
> + if (!first_map) {
> + data.start_port = old_port;
> + data.num = old_num;
> + data.turn_on = 0;
> +
> + for (i = 0; i < smp_cpus; ++i)
> + kvm_ioperm(qemu_kvm_cpu_env(i), &data);
>
How does this interact with VCPU hot-plug?
> + }
> +
> + data.start_port = region->u.r_baseport;
> + data.num = size;
> + data.turn_on = 1;
> +
> + for (i = 0; i < smp_cpus; ++i)
> + kvm_ioperm(qemu_kvm_cpu_env(i), &data);
> +
> + register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
> + (r_dev->v_addrs + region_num));
> + register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
> + (r_dev->v_addrs + region_num));
> + register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
> + (r_dev->v_addrs + region_num));
> + register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
> + (r_dev->v_addrs + region_num));
> +}
> +
> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
> + uint32_t val, int len)
> +{
> + int fd;
> + ssize_t ret;
> +
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> + (uint16_t) address, val, len);
> +
> + if (address == 0x4) {
> + pci_default_write_config(d, address, val, len);
> + /* Continue to program the card */
> + }
> +
> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> + address == 0x3c || address == 0x3d) {
> + /* used for update-mappings (BAR emulation) */
> + pci_default_write_config(d, address, val, len);
> + return;
> + }
> +
> + DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
> + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
> + (uint16_t) address, val, len);
> +
> + fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> + ret = pwrite(fd, &val, len, address);
> + if (ret != len) {
> + if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> + goto again;
> +
> + fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> + __func__, ret, errno);
> +
> + exit(1);
> + }
> +}
> +
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
> + int len)
> +{
> + uint32_t val = 0;
> + int fd;
> + ssize_t ret;
> +
> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> + address == 0x3c || address == 0x3d) {
> + val = pci_default_read_config(d, address, len);
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> + return val;
> + }
> +
> + /* vga specific, remove later */
> + if (address == 0xFC)
> + goto do_log;
> +
> + fd = ((AssignedDevice *)d)->real_device.config_fd;
> +
> +again:
> + ret = pread(fd, &val, len, address);
> + if (ret != len) {
> + if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> + goto again;
> +
> + fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
> + __func__, ret, errno);
> +
> + exit(1);
> + }
> +
> +do_log:
> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
> +
> + /* kill the special capabilities */
> + if (address == 4 && len == 4)
> + val &= ~0x100000;
> + else if (address == 6)
> + val &= ~0x10;
> +
> + return val;
> +}
> +
> +static int assigned_dev_register_regions(PCIRegion *io_regions,
> + unsigned long regions_num,
> + AssignedDevice *pci_dev)
> +{
> + uint32_t i;
> + PCIRegion *cur_region = io_regions;
> +
> + for (i = 0; i < regions_num; i++, cur_region++) {
> + if (!cur_region->valid)
> + continue;
> + pci_dev->v_addrs[i].num = i;
> +
> + /* handle memory io regions */
> + if (cur_region->type & IORESOURCE_MEM) {
> + int t = cur_region->type & IORESOURCE_PREFETCH
> + ? PCI_ADDRESS_SPACE_MEM_PREFETCH
> + : PCI_ADDRESS_SPACE_MEM;
> +
> + /* map physical memory */
> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> + pci_dev->v_addrs[i].u.r_virtbase =
> + mmap(NULL,
> + (cur_region->size + 0xFFF) & 0xFFFFF000,
> + PROT_WRITE | PROT_READ, MAP_SHARED,
> + cur_region->resource_fd, (off_t) 0);
> +
> + if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
> + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
> + "\n", __func__,
> + (uint32_t) (cur_region->base_addr));
> + return -1;
> + }
> + pci_dev->v_addrs[i].r_size = cur_region->size;
> + pci_dev->v_addrs[i].e_size = 0;
> +
> + /* add offset */
> + pci_dev->v_addrs[i].u.r_virtbase +=
> + (cur_region->base_addr & 0xFFF);
> +
> + pci_register_io_region((PCIDevice *) pci_dev, i,
> + cur_region->size, t,
> + assigned_dev_iomem_map);
> + continue;
> + }
> + /* handle port io regions */
> + pci_register_io_region((PCIDevice *) pci_dev, i,
> + cur_region->size, PCI_ADDRESS_SPACE_IO,
> + assigned_dev_ioport_map);
> +
> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
> + pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
> + /* not relevant for port io */
> + pci_dev->v_addrs[i].memory_index = 0;
> + }
> +
> + /* success */
> + return 0;
> +}
> +
> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> + uint8_t r_dev, uint8_t r_func)
> +{
> + char dir[128], name[128];
> + int fd, r = 0;
> + FILE *f;
> + unsigned long long start, end, size, flags;
> + PCIRegion *rp;
> + PCIDevRegions *dev = &pci_dev->real_device;
> +
> + dev->region_number = 0;
> +
> + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> + r_bus, r_dev, r_func);
> +
> + snprintf(name, sizeof(name), "%sconfig", dir);
> +
> + fd = open(name, O_RDWR);
> + if (fd == -1) {
> + fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + return 1;
> + }
> + dev->config_fd = fd;
> +again:
> + r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> + if (r < 0) {
> + if (errno == EINTR || errno == EAGAIN)
> + goto again;
> + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> + }
> +
> + snprintf(name, sizeof(name), "%sresource", dir);
> +
> + f = fopen(name, "r");
> + if (f == NULL) {
> + fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + return 1;
> + }
> + r = -1;
> + while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> + r++;
> + rp = dev->regions + r;
> + rp->valid = 0;
> + size = end - start + 1;
> + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> + continue;
> + if (flags & IORESOURCE_MEM) {
> + flags &= ~IORESOURCE_IO;
> + snprintf(name, sizeof(name), "%sresource%d", dir, r);
> + fd = open(name, O_RDWR);
> + if (fd == -1)
> + continue; /* probably ROM */
> + rp->resource_fd = fd;
> + } else
> + flags &= ~IORESOURCE_PREFETCH;
> +
> + rp->type = flags;
> + rp->valid = 1;
> + rp->base_addr = start;
> + rp->size = size;
> + DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> + r, rp->size, start, rp->type, rp->resource_fd);
> + }
> + fclose(f);
> +
> + dev->region_number = r;
> + return 0;
> +}
> +
> +static int disable_iommu;
> +int nr_assigned_devices;
> +static LIST_HEAD(, AssignedDevInfo) adev_head;
> +
> +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> +{
> + return (uint32_t)bus << 8 | (uint32_t)devfn;
> +}
> +
> +static AssignedDevice *register_real_device(PCIBus *e_bus,
> + const char *e_dev_name,
> + int e_devfn, uint8_t r_bus,
> + uint8_t r_dev, uint8_t r_func)
> +{
> + int r;
> + AssignedDevice *pci_dev;
> + uint8_t e_device, e_intx;
> +
> + DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> + e_dev_name, e_devfn);
> +
> + pci_dev = (AssignedDevice *)
> + pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> + e_devfn, assigned_dev_pci_read_config,
> + assigned_dev_pci_write_config);
> + if (NULL == pci_dev) {
> + fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> + __func__, e_dev_name);
> + return NULL;
> + }
> + if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> + fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> + __func__, e_dev_name);
> + goto out;
> + }
> +
> + /* handle real device's MMIO/PIO BARs */
> + if (assigned_dev_register_regions(pci_dev->real_device.regions,
> + pci_dev->real_device.region_number,
> + pci_dev))
> + goto out;
> +
> + /* handle interrupt routing */
> + e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> + e_intx = pci_dev->dev.config[0x3d] - 1;
> + pci_dev->intpin = e_intx;
> + pci_dev->run = 0;
> + pci_dev->girq = 0;
> + pci_dev->h_busnr = r_bus;
> + pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> +
> +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> + if (kvm_enabled()) {
> + struct kvm_assigned_pci_dev assigned_dev_data;
> +
> + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> + assigned_dev_data.assigned_dev_id =
> + calc_assigned_dev_id(pci_dev->h_busnr,
> + (uint32_t)pci_dev->h_devfn);
> + assigned_dev_data.busnr = pci_dev->h_busnr;
> + assigned_dev_data.devfn = pci_dev->h_devfn;
> +
> +#ifdef KVM_CAP_IOMMU
> + /* We always enable the IOMMU if present
> + * (or when not disabled on the command line)
> + */
> + r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> + if (r && !disable_iommu)
> + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> +#endif
> + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> + if (r < 0) {
> + fprintf(stderr, "Could not notify kernel about "
> + "assigned device \"%s\"\n", e_dev_name);
> + perror("register_real_device");
> + goto out;
> + }
> + }
>
You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined? That
means a newer userspace compiled on an older kernel will silently fail
if they try to do device assignment. There's probably no reason to
build this file if KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the
in-kernel PIT gets conditionally built depending on whether that cap is
available).
> +#endif
> + term_printf("Registered host PCI device %02x:%02x.%1x "
> + "(\"%s\") as guest device %02x:%02x.%1x\n",
> + r_bus, r_dev, r_func, e_dev_name,
> + pci_bus_num(e_bus), e_device, r_func);
>
>
If I read the code correctly, this term_printf() happens regardless of
whether this is being done for PCI hotplug or for command-line
assignment? That's a problem as it'll print garbage on the monitor when
you start QEMU, which could break management applications.
> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> index d559f0c..5fdb726 100644
> --- a/qemu/hw/pc.c
> +++ b/qemu/hw/pc.c
> @@ -33,6 +33,7 @@
> #include "boards.h"
> #include "console.h"
> #include "fw_cfg.h"
> +#include "device-assignment.h"
>
> #include "qemu-kvm.h"
>
> @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
>
> if (pci_enabled)
> virtio_balloon_init(pci_bus);
> +
> + if (kvm_enabled() && device_assignment_enabled) {
> + int i;
>
Stray tab.
> + for (i = 0; i < assigned_devices_index; i++) {
> + if (add_assigned_device(assigned_devices[i]) < 0) {
> + fprintf(stderr, "Warning: could not add assigned device %s\n",
> + assigned_devices[i]);
> + }
> + }
> +
> + if (init_all_assigned_devices(pci_bus)) {
> + fprintf(stderr, "Failed to initialize assigned devices\n");
> + exit (1);
> + }
> + }
> }
> +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
> + case QEMU_OPTION_pcidevice:
> + device_assignment_enabled = 1;
> + if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
> + fprintf(stderr, "Too many assigned devices\n");
> + exit(1);
> + }
> + assigned_devices[assigned_devices_index] = optarg;
> + assigned_devices_index++;
> + break;
>
Tab damage.
Regards,
Anthony Liguori
next prev parent reply other threads:[~2008-10-28 15:46 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-28 10:06 [v8] Userspace patches for PCI device assignment muli
2008-10-28 10:06 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-28 10:06 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-28 10:06 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-28 10:06 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-28 10:06 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-28 10:06 ` [PATCH 6/6] device assignment: support for hot-plugging PCI devices muli
2008-10-28 14:10 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests Han, Weidong
2008-10-28 15:32 ` Muli Ben-Yehuda
[not found] ` <715D42877B251141A38726ABF5CABF2C018683D874@pdsmsx503.ccr.corp.intel.com>
2008-10-28 15:31 ` Han, Weidong
2008-10-28 15:36 ` Han, Weidong
2008-10-28 15:47 ` Muli Ben-Yehuda
2008-10-28 15:45 ` Anthony Liguori [this message]
2008-10-28 15:53 ` Muli Ben-Yehuda
2008-10-29 7:56 ` Zhang, Xiantao
2008-10-29 10:27 ` Muli Ben-Yehuda
2008-10-29 8:22 ` Han, Weidong
2008-10-29 10:25 ` Muli Ben-Yehuda
2008-10-29 10:39 ` Muli Ben-Yehuda
2008-10-28 16:55 ` Mark McLoughlin
2008-10-29 10:31 ` Muli Ben-Yehuda
2008-10-29 11:07 ` Mark McLoughlin
2008-10-29 11:15 ` Mark McLoughlin
2008-10-29 11:47 ` Muli Ben-Yehuda
2008-10-29 7:38 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support Zhang, Xiantao
-- strict thread matches above, loose matches on Subject: below --
2008-10-29 10:22 [v9] Userspace patches for PCI device assignment muli
2008-10-29 10:22 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 10:22 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 10:22 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 10:22 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 10:22 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-29 12:19 [v10] Userspace patches for PCI device assignment muli
2008-10-29 12:19 ` [PATCH 1/6] device assignment: add ioctl wrappers muli
2008-10-29 12:19 ` [PATCH 2/6] device assignment: introduce pci_map_irq to get irq nr from pin number muli
2008-10-29 12:19 ` [PATCH 3/6] device assignment: introduce functions to correlate pin number and irq muli
2008-10-29 12:19 ` [PATCH 4/6] device assignment: build vtd.c for Intel IOMMU support muli
2008-10-29 12:20 ` [PATCH 5/6] device assignment: support for assigning PCI devices to guests muli
2008-10-29 12:27 ` Mark McLoughlin
2008-10-29 14:40 ` Muli Ben-Yehuda
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=490733B5.5010102@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=allen.m.kay@intel.com \
--cc=amit.shah@redhat.com \
--cc=avi@redhat.com \
--cc=benami@il.ibm.com \
--cc=kvm@vger.kernel.org \
--cc=muli@il.ibm.com \
--cc=weidong.han@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.