* [Qemu-devel] [PULL v2 1/8] vfio-pci: Add support for MSI affinity
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
@ 2013-10-10 18:43 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 2/8] vfio-pci: Test device reset capabilities Alex Williamson
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:43 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel, kvm
When MSI is accelerated through KVM the vectors are only programmed
when the guest first enables MSI support. Subsequent writes to the
vector address or data fields are ignored. Unfortunately that means
we're ignore updates done to adjust SMP affinity of the vectors.
MSI SMP affinity already works in non-KVM mode because the address
and data fields are read from their backing store on each interrupt.
This patch stores the MSIMessage programmed into KVM so that we can
determine when changes are made and update the routes.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 40 insertions(+), 7 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index a1c08fb..75a53e2 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -119,6 +119,7 @@ typedef struct VFIOINTx {
typedef struct VFIOMSIVector {
EventNotifier interrupt; /* eventfd triggered on interrupt */
struct VFIODevice *vdev; /* back pointer to device */
+ MSIMessage msg; /* cache the MSI message so we know when it changes */
int virq; /* KVM irqchip route for QEMU bypass */
bool use;
} VFIOMSIVector;
@@ -795,7 +796,6 @@ retry:
vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
for (i = 0; i < vdev->nr_vectors; i++) {
- MSIMessage msg;
VFIOMSIVector *vector = &vdev->msi_vectors[i];
vector->vdev = vdev;
@@ -805,13 +805,13 @@ retry:
error_report("vfio: Error: event_notifier_init failed");
}
- msg = msi_get_message(&vdev->pdev, i);
+ vector->msg = msi_get_message(&vdev->pdev, i);
/*
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
- vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+ vector->virq = kvm_irqchip_add_msi_route(kvm_state, vector->msg);
if (vector->virq < 0 ||
kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
NULL, vector->virq) < 0) {
@@ -917,6 +917,33 @@ static void vfio_disable_msi(VFIODevice *vdev)
vdev->host.bus, vdev->host.slot, vdev->host.function);
}
+static void vfio_update_msi(VFIODevice *vdev)
+{
+ int i;
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ VFIOMSIVector *vector = &vdev->msi_vectors[i];
+ MSIMessage msg;
+
+ if (!vector->use || vector->virq < 0) {
+ continue;
+ }
+
+ msg = msi_get_message(&vdev->pdev, i);
+
+ if (msg.address != vector->msg.address ||
+ msg.data != vector->msg.data) {
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
+ __func__, vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function, i);
+
+ kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
+ vector->msg = msg;
+ }
+ }
+}
+
/*
* IO Port/MMIO - Beware of the endians, VFIO is always little endian
*/
@@ -1834,10 +1861,16 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
is_enabled = msi_enabled(pdev);
- if (!was_enabled && is_enabled) {
- vfio_enable_msi(vdev);
- } else if (was_enabled && !is_enabled) {
- vfio_disable_msi(vdev);
+ if (!was_enabled) {
+ if (is_enabled) {
+ vfio_enable_msi(vdev);
+ }
+ } else {
+ if (!is_enabled) {
+ vfio_disable_msi(vdev);
+ } else {
+ vfio_update_msi(vdev);
+ }
}
} else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 2/8] vfio-pci: Test device reset capabilities
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
2013-10-10 18:43 ` [Qemu-devel] [PULL v2 1/8] vfio-pci: Add support for MSI affinity Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 3/8] vfio-pci: Lazy PCI option ROM loading Alex Williamson
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel, kvm
Not all resets are created equal. PM reset is not very reliable,
especially for GPUs, so we might want to opt for a bus reset if a
standard reset will only do a D3hot->D0 transition. We can also
use this to tell if the standard reset will do a bus reset (if
neither has_pm_reset or has_flr is probed, but the device still
supports reset).
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 75a53e2..ede026d 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -185,6 +185,8 @@ typedef struct VFIODevice {
bool reset_works;
bool has_vga;
bool pci_aer;
+ bool has_flr;
+ bool has_pm_reset;
} VFIODevice;
typedef struct VFIOGroup {
@@ -2513,6 +2515,42 @@ static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
return pos;
}
+static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos)
+{
+ uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
+
+ if (cap & PCI_EXP_DEVCAP_FLR) {
+ DPRINTF("%04x:%02x:%02x.%x Supports FLR via PCIe cap\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+ vdev->has_flr = true;
+ }
+}
+
+static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos)
+{
+ uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
+
+ if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
+ DPRINTF("%04x:%02x:%02x.%x Supports PM reset\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+ vdev->has_pm_reset = true;
+ }
+}
+
+static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos)
+{
+ uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
+
+ if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
+ DPRINTF("%04x:%02x:%02x.%x Supports FLR via AF cap\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+ vdev->has_flr = true;
+ }
+}
+
static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
{
PCIDevice *pdev = &vdev->pdev;
@@ -2557,13 +2595,21 @@ static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
ret = vfio_setup_msi(vdev, pos);
break;
case PCI_CAP_ID_EXP:
+ vfio_check_pcie_flr(vdev, pos);
ret = vfio_setup_pcie_cap(vdev, pos, size);
break;
case PCI_CAP_ID_MSIX:
ret = vfio_setup_msix(vdev, pos);
break;
case PCI_CAP_ID_PM:
+ vfio_check_pm_reset(vdev, pos);
vdev->pm_cap = pos;
+ ret = pci_add_capability(pdev, cap_id, pos, size);
+ break;
+ case PCI_CAP_ID_AF:
+ vfio_check_af_flr(vdev, pos);
+ ret = pci_add_capability(pdev, cap_id, pos, size);
+ break;
default:
ret = pci_add_capability(pdev, cap_id, pos, size);
break;
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 3/8] vfio-pci: Lazy PCI option ROM loading
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
2013-10-10 18:43 ` [Qemu-devel] [PULL v2 1/8] vfio-pci: Add support for MSI affinity Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 2/8] vfio-pci: Test device reset capabilities Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 4/8] vfio-pci: Cleanup error_reports Alex Williamson
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel, kvm
During vfio-pci initfn, the device is not always in a state where the
option ROM can be read. In the case of graphics cards, there's often
no per function reset, which means we have host driver state affecting
whether the option ROM is usable. Ideally we want to move reading the
option ROM past any co-assigned device resets to the point where the
guest first tries to read the ROM itself.
To accomplish this, we switch the memory region for the option rom to
an I/O region rather than a memory mapped region. This has the side
benefit that we don't waste KVM memory slots for a BAR where we don't
care about performance. This also allows us to delay loading the ROM
from the device until the first read by the guest. We then use the
PCI config space size of the ROM BAR when setting up the BAR through
QEMU PCI.
Another benefit of this approach is that previously when a user set
the ROM to a file using the romfile= option, we still probed VFIO for
the parameters of the ROM, which can result in dmesg errors about an
invalid ROM. We now only probe VFIO to get the ROM contents if the
guest actually tries to read the ROM.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 184 +++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 122 insertions(+), 62 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index ede026d..730dec5 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -166,6 +166,7 @@ typedef struct VFIODevice {
off_t config_offset; /* Offset of config space region within device fd */
unsigned int rom_size;
off_t rom_offset; /* Offset of ROM region within device fd */
+ void *rom;
int msi_cap_size;
VFIOMSIVector *msi_vectors;
VFIOMSIXInfo *msix;
@@ -1058,6 +1059,125 @@ static const MemoryRegionOps vfio_bar_ops = {
.endianness = DEVICE_LITTLE_ENDIAN,
};
+static void vfio_pci_load_rom(VFIODevice *vdev)
+{
+ struct vfio_region_info reg_info = {
+ .argsz = sizeof(reg_info),
+ .index = VFIO_PCI_ROM_REGION_INDEX
+ };
+ uint64_t size;
+ off_t off = 0;
+ size_t bytes;
+
+ if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) {
+ error_report("vfio: Error getting ROM info: %m");
+ return;
+ }
+
+ DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+ (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+ (unsigned long)reg_info.flags);
+
+ vdev->rom_size = size = reg_info.size;
+ vdev->rom_offset = reg_info.offset;
+
+ if (!vdev->rom_size) {
+ return;
+ }
+
+ vdev->rom = g_malloc(size);
+ memset(vdev->rom, 0xff, size);
+
+ while (size) {
+ bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
+ if (bytes == 0) {
+ break;
+ } else if (bytes > 0) {
+ off += bytes;
+ size -= bytes;
+ } else {
+ if (errno == EINTR || errno == EAGAIN) {
+ continue;
+ }
+ error_report("vfio: Error reading device ROM: %m");
+ break;
+ }
+ }
+}
+
+static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
+{
+ VFIODevice *vdev = opaque;
+ uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
+
+ /* Load the ROM lazily when the guest tries to read it */
+ if (unlikely(!vdev->rom)) {
+ vfio_pci_load_rom(vdev);
+ }
+
+ memcpy(&val, vdev->rom + addr,
+ (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
+ __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function, addr, size, val);
+
+ return val;
+}
+
+static const MemoryRegionOps vfio_rom_ops = {
+ .read = vfio_rom_read,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_pci_size_rom(VFIODevice *vdev)
+{
+ uint32_t orig, size = (uint32_t)PCI_ROM_ADDRESS_MASK;
+ off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
+ char name[32];
+
+ if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
+ return;
+ }
+
+ /*
+ * Use the same size ROM BAR as the physical device. The contents
+ * will get filled in later when the guest tries to read it.
+ */
+ if (pread(vdev->fd, &orig, 4, offset) != 4 ||
+ pwrite(vdev->fd, &size, 4, offset) != 4 ||
+ pread(vdev->fd, &size, 4, offset) != 4 ||
+ pwrite(vdev->fd, &orig, 4, offset) != 4) {
+ error_report("%s(%04x:%02x:%02x.%x) failed: %m",
+ __func__, vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function);
+ return;
+ }
+
+ size = ~(size & PCI_ROM_ADDRESS_MASK) + 1;
+
+ if (!size) {
+ return;
+ }
+
+ DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function, size);
+
+ snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+
+ memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
+ &vfio_rom_ops, vdev, name, size);
+
+ pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
+ PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
+
+ vdev->pdev.has_rom = true;
+}
+
static void vfio_vga_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size)
{
@@ -2638,51 +2758,6 @@ static int vfio_add_capabilities(VFIODevice *vdev)
return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}
-static int vfio_load_rom(VFIODevice *vdev)
-{
- uint64_t size = vdev->rom_size;
- char name[32];
- off_t off = 0, voff = vdev->rom_offset;
- ssize_t bytes;
- void *ptr;
-
- /* If loading ROM from file, pci handles it */
- if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
- return 0;
- }
-
- DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
- vdev->host.bus, vdev->host.slot, vdev->host.function);
-
- snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
- vdev->host.domain, vdev->host.bus, vdev->host.slot,
- vdev->host.function);
- memory_region_init_ram(&vdev->pdev.rom, OBJECT(vdev), name, size);
- ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
- memset(ptr, 0xff, size);
-
- while (size) {
- bytes = pread(vdev->fd, ptr + off, size, voff + off);
- if (bytes == 0) {
- break; /* expect that we could get back less than the ROM BAR */
- } else if (bytes > 0) {
- off += bytes;
- size -= bytes;
- } else {
- if (errno == EINTR || errno == EAGAIN) {
- continue;
- }
- error_report("vfio: Error reading device ROM: %m");
- memory_region_destroy(&vdev->pdev.rom);
- return -errno;
- }
- }
-
- pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
- vdev->pdev.has_rom = true;
- return 0;
-}
-
static int vfio_connect_container(VFIOGroup *group)
{
VFIOContainer *container;
@@ -2916,22 +2991,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
QLIST_INIT(&vdev->bars[i].quirks);
}
- reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
-
- ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
- if (ret) {
- error_report("vfio: Error getting ROM info: %m");
- goto error;
- }
-
- DPRINTF("Device %s ROM:\n", name);
- DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
- (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
- (unsigned long)reg_info.flags);
-
- vdev->rom_size = reg_info.size;
- vdev->rom_offset = reg_info.offset;
-
reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info);
@@ -3229,7 +3288,7 @@ static int vfio_initfn(PCIDevice *pdev)
memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
- vfio_load_rom(vdev);
+ vfio_pci_size_rom(vdev);
ret = vfio_early_setup_msix(vdev);
if (ret) {
@@ -3294,6 +3353,7 @@ static void vfio_exitfn(PCIDevice *pdev)
vfio_teardown_msi(vdev);
vfio_unmap_bars(vdev);
g_free(vdev->emulated_config_bits);
+ g_free(vdev->rom);
vfio_put_device(vdev);
vfio_put_group(group);
}
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 4/8] vfio-pci: Cleanup error_reports
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
` (2 preceding siblings ...)
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 3/8] vfio-pci: Lazy PCI option ROM loading Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 5/8] vfio-pci: Implement PCI hot reset Alex Williamson
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel, kvm
Remove carriage returns and tweak formatting for error_reports.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 730dec5..a73e7f5 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -3055,13 +3055,15 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
if (ret) {
/* This can fail for an old kernel or legacy PCI dev */
- DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
+ DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
ret = 0;
} else if (irq_info.count == 1) {
vdev->pci_aer = true;
} else {
- error_report("vfio: Warning: "
- "Could not enable error recovery for the device\n");
+ error_report("vfio: %04x:%02x:%02x.%x "
+ "Could not enable error recovery for the device",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
}
error:
@@ -3102,11 +3104,10 @@ static void vfio_err_notifier_handler(void *opaque)
* guest to contain the error.
*/
- error_report("%s (%04x:%02x:%02x.%x)"
- "Unrecoverable error detected...\n"
- "Please collect any data possible and then kill the guest",
- __func__, vdev->host.domain, vdev->host.bus,
- vdev->host.slot, vdev->host.function);
+ error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
+ "Please collect any data possible and then kill the guest",
+ __func__, vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function);
vm_stop(RUN_STATE_IO_ERROR);
}
@@ -3129,8 +3130,7 @@ static void vfio_register_err_notifier(VFIODevice *vdev)
}
if (event_notifier_init(&vdev->err_notifier, 0)) {
- error_report("vfio: Warning: "
- "Unable to init event notifier for error detection\n");
+ error_report("vfio: Unable to init event notifier for error detection");
vdev->pci_aer = false;
return;
}
@@ -3151,7 +3151,7 @@ static void vfio_register_err_notifier(VFIODevice *vdev)
ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
if (ret) {
- error_report("vfio: Failed to set up error notification\n");
+ error_report("vfio: Failed to set up error notification");
qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
event_notifier_cleanup(&vdev->err_notifier);
vdev->pci_aer = false;
@@ -3184,7 +3184,7 @@ static void vfio_unregister_err_notifier(VFIODevice *vdev)
ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
if (ret) {
- error_report("vfio: Failed to de-assign error fd: %d\n", ret);
+ error_report("vfio: Failed to de-assign error fd: %m");
}
g_free(irq_set);
qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 5/8] vfio-pci: Implement PCI hot reset
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
` (3 preceding siblings ...)
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 4/8] vfio-pci: Cleanup error_reports Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 6/8] vfio: Fix debug output for int128 values Alex Williamson
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel, kvm
Now that VFIO has a PCI hot reset interface, take advantage of it.
There are two modes that we need to consider. The first is when only
one device within the set of devices affected is actually assigned to
the guest. In this case the other devices are are just held by VFIO
for isolation and we can pretend they're not there, doing an entire
bus reset whenever the device reset callback is triggered. Supporting
this case separately allows us to do the best reset we can do of the
device even if the device is hotplugged.
The second mode is when multiple affected devices are all exposed to
the guest. In this case we can only do a hot reset when the entire
system is being reset. However, this also allows us to track which
individual devices are affected by a reset and only do them once.
We split our reset function into pre- and post-reset helper functions
prioritize the types of device resets available to us, and create
separate _one vs _multi reset interfaces to handle the distinct cases
above.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 300 insertions(+), 38 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index a73e7f5..0c9bb95 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -188,6 +188,7 @@ typedef struct VFIODevice {
bool pci_aer;
bool has_flr;
bool has_pm_reset;
+ bool needs_reset;
} VFIODevice;
typedef struct VFIOGroup {
@@ -2758,6 +2759,279 @@ static int vfio_add_capabilities(VFIODevice *vdev)
return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}
+static void vfio_pci_pre_reset(VFIODevice *vdev)
+{
+ PCIDevice *pdev = &vdev->pdev;
+ uint16_t cmd;
+
+ vfio_disable_interrupts(vdev);
+
+ /* Make sure the device is in D0 */
+ if (vdev->pm_cap) {
+ uint16_t pmcsr;
+ uint8_t state;
+
+ pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+ state = pmcsr & PCI_PM_CTRL_STATE_MASK;
+ if (state) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
+ /* vfio handles the necessary delay here */
+ pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+ state = pmcsr & PCI_PM_CTRL_STATE_MASK;
+ if (state) {
+ error_report("vfio: Unable to power on device, stuck in D%d\n",
+ state);
+ }
+ }
+ }
+
+ /*
+ * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
+ * Also put INTx Disable in known state.
+ */
+ cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
+ cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
+ PCI_COMMAND_INTX_DISABLE);
+ vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
+}
+
+static void vfio_pci_post_reset(VFIODevice *vdev)
+{
+ vfio_enable_intx(vdev);
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
+ PCIHostDeviceAddress *host2)
+{
+ return (host1->domain == host2->domain && host1->bus == host2->bus &&
+ host1->slot == host2->slot && host1->function == host2->function);
+}
+
+static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
+{
+ VFIOGroup *group;
+ struct vfio_pci_hot_reset_info *info;
+ struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_hot_reset *reset;
+ int32_t *fds;
+ int ret, i, count;
+ bool multi = false;
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function,
+ single ? "one" : "multi");
+
+ vfio_pci_pre_reset(vdev);
+ vdev->needs_reset = false;
+
+ info = g_malloc0(sizeof(*info));
+ info->argsz = sizeof(*info);
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+ if (ret && errno != ENOSPC) {
+ ret = -errno;
+ if (!vdev->has_pm_reset) {
+ error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
+ "no available reset mechanism.", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ }
+ goto out_single;
+ }
+
+ count = info->count;
+ info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
+ info->argsz = sizeof(*info) + (count * sizeof(*devices));
+ devices = &info->devices[0];
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+ if (ret) {
+ ret = -errno;
+ error_report("vfio: hot reset info failed: %m");
+ goto out_single;
+ }
+
+ DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+
+ /* Verify that we have all the groups required */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIODevice *tmp;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
+ host.bus, host.slot, host.function, devices[i].group_id);
+
+ if (vfio_pci_host_match(&host, &vdev->host)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ if (!vdev->has_pm_reset) {
+ error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
+ "depends on group %d which is not owned.",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function, devices[i].group_id);
+ }
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Prep dependent devices for reset and clear our marker. */
+ QLIST_FOREACH(tmp, &group->device_list, next) {
+ if (vfio_pci_host_match(&host, &tmp->host)) {
+ if (single) {
+ DPRINTF("vfio: found another in-use device "
+ "%04x:%02x:%02x.%x\n", host.domain, host.bus,
+ host.slot, host.function);
+ ret = -EINVAL;
+ goto out_single;
+ }
+ vfio_pci_pre_reset(tmp);
+ tmp->needs_reset = false;
+ multi = true;
+ break;
+ }
+ }
+ }
+
+ if (!single && !multi) {
+ DPRINTF("vfio: No other in-use devices for multi hot reset\n");
+ ret = -EINVAL;
+ goto out_single;
+ }
+
+ /* Determine how many group fds need to be passed */
+ count = 0;
+ QLIST_FOREACH(group, &group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ count++;
+ break;
+ }
+ }
+ }
+
+ reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+ reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+ fds = &reset->group_fds[0];
+
+ /* Fill in group fds */
+ QLIST_FOREACH(group, &group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ fds[reset->count++] = group->fd;
+ break;
+ }
+ }
+ }
+
+ /* Bus reset! */
+ ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+ g_free(reset);
+
+ DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function,
+ ret ? "%m" : "Success");
+
+out:
+ /* Re-enable INTx on affected devices */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIODevice *tmp;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ if (vfio_pci_host_match(&host, &vdev->host)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ break;
+ }
+
+ QLIST_FOREACH(tmp, &group->device_list, next) {
+ if (vfio_pci_host_match(&host, &tmp->host)) {
+ vfio_pci_post_reset(tmp);
+ break;
+ }
+ }
+ }
+out_single:
+ vfio_pci_post_reset(vdev);
+ g_free(info);
+
+ return ret;
+}
+
+/*
+ * We want to differentiate hot reset of mulitple in-use devices vs hot reset
+ * of a single in-use device. VFIO_DEVICE_RESET will already handle the case
+ * of doing hot resets when there is only a single device per bus. The in-use
+ * here refers to how many VFIODevices are affected. A hot reset that affects
+ * multiple devices, but only a single in-use device, means that we can call
+ * it from our bus ->reset() callback since the extent is effectively a single
+ * device. This allows us to make use of it in the hotplug path. When there
+ * are multiple in-use devices, we can only trigger the hot reset during a
+ * system reset and thus from our reset handler. We separate _one vs _multi
+ * here so that we don't overlap and do a double reset on the system reset
+ * path where both our reset handler and ->reset() callback are used. Calling
+ * _one() will only do a hot reset for the one in-use devices case, calling
+ * _multi() will do nothing if a _one() would have been sufficient.
+ */
+static int vfio_pci_hot_reset_one(VFIODevice *vdev)
+{
+ return vfio_pci_hot_reset(vdev, true);
+}
+
+static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
+{
+ return vfio_pci_hot_reset(vdev, false);
+}
+
+static void vfio_pci_reset_handler(void *opaque)
+{
+ VFIOGroup *group;
+ VFIODevice *vdev;
+
+ QLIST_FOREACH(group, &group_list, next) {
+ QLIST_FOREACH(vdev, &group->device_list, next) {
+ if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
+ vdev->needs_reset = true;
+ }
+ }
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ QLIST_FOREACH(vdev, &group->device_list, next) {
+ if (vdev->needs_reset) {
+ vfio_pci_hot_reset_multi(vdev);
+ }
+ }
+ }
+}
+
static int vfio_connect_container(VFIOGroup *group)
{
VFIOContainer *container;
@@ -2900,6 +3174,10 @@ static VFIOGroup *vfio_get_group(int groupid)
return NULL;
}
+ if (QLIST_EMPTY(&group_list)) {
+ qemu_register_reset(vfio_pci_reset_handler, NULL);
+ }
+
QLIST_INSERT_HEAD(&group_list, group, next);
return group;
@@ -2916,6 +3194,10 @@ static void vfio_put_group(VFIOGroup *group)
DPRINTF("vfio_put_group: close group->fd\n");
close(group->fd);
g_free(group);
+
+ if (QLIST_EMPTY(&group_list)) {
+ qemu_unregister_reset(vfio_pci_reset_handler, NULL);
+ }
}
static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
@@ -2954,9 +3236,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
}
vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
- if (!vdev->reset_works) {
- error_report("Warning, device %s does not support reset", name);
- }
if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
error_report("vfio: unexpected number of io regions %u",
@@ -3362,51 +3641,34 @@ static void vfio_pci_reset(DeviceState *dev)
{
PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
- uint16_t cmd;
DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
vdev->host.bus, vdev->host.slot, vdev->host.function);
- vfio_disable_interrupts(vdev);
-
- /* Make sure the device is in D0 */
- if (vdev->pm_cap) {
- uint16_t pmcsr;
- uint8_t state;
+ vfio_pci_pre_reset(vdev);
- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
- state = pmcsr & PCI_PM_CTRL_STATE_MASK;
- if (state) {
- pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
- vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
- /* vfio handles the necessary delay here */
- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
- state = pmcsr & PCI_PM_CTRL_STATE_MASK;
- if (state) {
- error_report("vfio: Unable to power on device, stuck in D%d\n",
- state);
- }
- }
+ if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
+ !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+ DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ goto post_reset;
}
- /*
- * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
- * Also put INTx Disable in known state.
- */
- cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
- cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
- PCI_COMMAND_INTX_DISABLE);
- vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
+ /* See if we can do our own bus reset */
+ if (!vfio_pci_hot_reset_one(vdev)) {
+ goto post_reset;
+ }
- if (vdev->reset_works) {
- if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
- error_report("vfio: Error unable to reset physical device "
- "(%04x:%02x:%02x.%x): %m", vdev->host.domain,
- vdev->host.bus, vdev->host.slot, vdev->host.function);
- }
+ /* If nothing else works and the device supports PM reset, use it */
+ if (vdev->reset_works && vdev->has_pm_reset &&
+ !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+ DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ goto post_reset;
}
- vfio_enable_intx(vdev);
+post_reset:
+ vfio_pci_post_reset(vdev);
}
static Property vfio_pci_dev_properties[] = {
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 6/8] vfio: Fix debug output for int128 values
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
` (4 preceding siblings ...)
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 5/8] vfio-pci: Implement PCI hot reset Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 7/8] vfio-pci: Add dummy PCI ROM write accessor Alex Williamson
2013-10-10 18:45 ` [Qemu-devel] [PULL v2 8/8] vfio-pci: Fix endian issues in vfio_pci_size_rom() Alex Williamson
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: Alexey Kardashevskiy, qemu-devel, kvm
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Memory regions can easily be 2^64 byte long and therefore overflow
for just a bit but that is enough for int128_get64() to assert.
This takes care of debug printing of huge section sizes.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
hw/misc/vfio.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 0c9bb95..68e25bd 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -2084,7 +2084,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
if (vfio_listener_skipped_section(section)) {
DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
section->offset_within_address_space,
- section->offset_within_address_space + section->size - 1);
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
return;
}
@@ -2129,7 +2130,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (vfio_listener_skipped_section(section)) {
DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
section->offset_within_address_space,
- section->offset_within_address_space + section->size - 1);
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
return;
}
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 7/8] vfio-pci: Add dummy PCI ROM write accessor
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
` (5 preceding siblings ...)
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 6/8] vfio: Fix debug output for int128 values Alex Williamson
@ 2013-10-10 18:44 ` Alex Williamson
2013-10-10 18:45 ` [Qemu-devel] [PULL v2 8/8] vfio-pci: Fix endian issues in vfio_pci_size_rom() Alex Williamson
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:44 UTC (permalink / raw)
To: anthony; +Cc: Paolo Bonzini, qemu-devel, kvm
Just to be sure we don't jump off any NULL pointer cliffs.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Paolo Bonzini <pbonzini@redhat.com>
---
hw/misc/vfio.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 68e25bd..1fbc40b 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1128,8 +1128,14 @@ static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
return val;
}
+static void vfio_rom_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+}
+
static const MemoryRegionOps vfio_rom_ops = {
.read = vfio_rom_read,
+ .write = vfio_rom_write,
.endianness = DEVICE_LITTLE_ENDIAN,
};
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [Qemu-devel] [PULL v2 8/8] vfio-pci: Fix endian issues in vfio_pci_size_rom()
2013-10-10 18:43 [Qemu-devel] [PULL v2 0/8] VFIO updates for QEMU Alex Williamson
` (6 preceding siblings ...)
2013-10-10 18:44 ` [Qemu-devel] [PULL v2 7/8] vfio-pci: Add dummy PCI ROM write accessor Alex Williamson
@ 2013-10-10 18:45 ` Alex Williamson
7 siblings, 0 replies; 9+ messages in thread
From: Alex Williamson @ 2013-10-10 18:45 UTC (permalink / raw)
To: anthony; +Cc: Alexey Kardashevskiy, qemu-devel, kvm
VFIO is always little endian so do byte swapping of our mask on the
way in and byte swapping of the size on the way out.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
hw/misc/vfio.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 1fbc40b..a2d5283 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1141,7 +1141,7 @@ static const MemoryRegionOps vfio_rom_ops = {
static void vfio_pci_size_rom(VFIODevice *vdev)
{
- uint32_t orig, size = (uint32_t)PCI_ROM_ADDRESS_MASK;
+ uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
char name[32];
@@ -1163,7 +1163,7 @@ static void vfio_pci_size_rom(VFIODevice *vdev)
return;
}
- size = ~(size & PCI_ROM_ADDRESS_MASK) + 1;
+ size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
if (!size) {
return;
^ permalink raw reply related [flat|nested] 9+ messages in thread