* [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC
@ 2015-10-21 16:52 Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 1/3] Qemu: Add pci-assign.h to share functions and struct definition with new file Lan Tianyu
` (3 more replies)
0 siblings, 4 replies; 6+ messages in thread
From: Lan Tianyu @ 2015-10-21 16:52 UTC (permalink / raw)
To: amit.shah, eblake, eddie.dong, nrupal.jani, yang.z.zhang, agraf,
kvm, pbonzini, qemu-devel, emil.s.tantilov, ehabkost, lcapitulino,
lersek, mst, quintela, rth
Cc: Lan Tianyu
This patchset is the Qemu part of live migration support for SRIOV NICs.
Information about the kernel part of the patches is at the following link.
http://marc.info/?l=kvm&m=144544635330193&w=2
Lan Tianyu (3):
Qemu: Add pci-assign.h to share functions and struct definition with
new file
Qemu: Add post_load_state() to run after restoring CPU state
Qemu: Introduce pci-sriov device type to support VF live migration
hw/i386/kvm/Makefile.objs | 2 +-
hw/i386/kvm/pci-assign.c | 113 +----------------------
hw/i386/kvm/pci-assign.h | 109 +++++++++++++++++++++++
hw/i386/kvm/sriov.c | 213 ++++++++++++++++++++++++++++++++++++++++++++
include/migration/vmstate.h | 2 +
migration/savevm.c | 15 ++++
6 files changed, 344 insertions(+), 110 deletions(-)
create mode 100644 hw/i386/kvm/pci-assign.h
create mode 100644 hw/i386/kvm/sriov.c
--
1.9.3
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Qemu-devel] [RFC PATCH 1/3] Qemu: Add pci-assign.h to share functions and struct definition with new file
2015-10-21 16:52 [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Lan Tianyu
@ 2015-10-21 16:52 ` Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 2/3] Qemu: Add post_load_state() to run after restoring CPU state Lan Tianyu
` (2 subsequent siblings)
3 siblings, 0 replies; 6+ messages in thread
From: Lan Tianyu @ 2015-10-21 16:52 UTC (permalink / raw)
To: amit.shah, eblake, eddie.dong, nrupal.jani, yang.z.zhang, agraf,
kvm, pbonzini, qemu-devel, emil.s.tantilov, ehabkost, lcapitulino,
lersek, mst, quintela, rth
Cc: Lan Tianyu
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
hw/i386/kvm/pci-assign.c | 111 ++---------------------------------------------
hw/i386/kvm/pci-assign.h | 109 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 112 insertions(+), 108 deletions(-)
create mode 100644 hw/i386/kvm/pci-assign.h
diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index 74d22f4..616532d 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -37,112 +37,7 @@
#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "kvm_i386.h"
-
-#define MSIX_PAGE_SIZE 0x1000
-
-/* From linux/ioport.h */
-#define IORESOURCE_IO 0x00000100 /* Resource type */
-#define IORESOURCE_MEM 0x00000200
-#define IORESOURCE_IRQ 0x00000400
-#define IORESOURCE_DMA 0x00000800
-#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
-#define IORESOURCE_MEM_64 0x00100000
-
-//#define DEVICE_ASSIGNMENT_DEBUG
-
-#ifdef DEVICE_ASSIGNMENT_DEBUG
-#define DEBUG(fmt, ...) \
- do { \
- fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
- } while (0)
-#else
-#define DEBUG(fmt, ...)
-#endif
-
-typedef struct PCIRegion {
- int type; /* Memory or port I/O */
- int valid;
- uint64_t base_addr;
- uint64_t size; /* size of the region */
- int resource_fd;
-} PCIRegion;
-
-typedef struct PCIDevRegions {
- uint8_t bus, dev, func; /* Bus inside domain, device and function */
- int irq; /* IRQ number */
- uint16_t region_number; /* number of active regions */
-
- /* Port I/O or MMIO Regions */
- PCIRegion regions[PCI_NUM_REGIONS - 1];
- int config_fd;
-} PCIDevRegions;
-
-typedef struct AssignedDevRegion {
- MemoryRegion container;
- MemoryRegion real_iomem;
- union {
- uint8_t *r_virtbase; /* mmapped access address for memory regions */
- uint32_t r_baseport; /* the base guest port for I/O regions */
- } u;
- pcibus_t e_size; /* emulated size of region in bytes */
- pcibus_t r_size; /* real size of region in bytes */
- PCIRegion *region;
-} AssignedDevRegion;
-
-#define ASSIGNED_DEVICE_PREFER_MSI_BIT 0
-#define ASSIGNED_DEVICE_SHARE_INTX_BIT 1
-
-#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
-#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
-
-typedef struct MSIXTableEntry {
- uint32_t addr_lo;
- uint32_t addr_hi;
- uint32_t data;
- uint32_t ctrl;
-} MSIXTableEntry;
-
-typedef enum AssignedIRQType {
- ASSIGNED_IRQ_NONE = 0,
- ASSIGNED_IRQ_INTX_HOST_INTX,
- ASSIGNED_IRQ_INTX_HOST_MSI,
- ASSIGNED_IRQ_MSI,
- ASSIGNED_IRQ_MSIX
-} AssignedIRQType;
-
-typedef struct AssignedDevice {
- PCIDevice dev;
- PCIHostDeviceAddress host;
- uint32_t dev_id;
- uint32_t features;
- int intpin;
- AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
- PCIDevRegions real_device;
- PCIINTxRoute intx_route;
- AssignedIRQType assigned_irq_type;
- struct {
-#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
-#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
- uint32_t available;
-#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
-#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
-#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
- uint32_t state;
- } cap;
- uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
- uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
- int msi_virq_nr;
- int *msi_virq;
- MSIXTableEntry *msix_table;
- hwaddr msix_table_addr;
- uint16_t msix_max;
- MemoryRegion mmio;
- char *configfd_name;
- int32_t bootindex;
-} AssignedDevice;
-
-#define TYPE_PCI_ASSIGN "kvm-pci-assign"
-#define PCI_ASSIGN(obj) OBJECT_CHECK(AssignedDevice, (obj), TYPE_PCI_ASSIGN)
+#include "pci-assign.h"
static void assigned_dev_update_irq_routing(PCIDevice *dev);
@@ -1044,7 +939,7 @@ static bool assigned_dev_msix_masked(MSIXTableEntry *entry)
* sure the physical MSI-X state tracks the guest's view, which is important
* for some VF/PF and PF/fw communication channels.
*/
-static bool assigned_dev_msix_skipped(MSIXTableEntry *entry)
+bool assigned_dev_msix_skipped(MSIXTableEntry *entry)
{
return !entry->data;
}
@@ -1114,7 +1009,7 @@ static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
return r;
}
-static void assigned_dev_update_msix(PCIDevice *pci_dev)
+void assigned_dev_update_msix(PCIDevice *pci_dev)
{
AssignedDevice *assigned_dev = PCI_ASSIGN(pci_dev);
uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
diff --git a/hw/i386/kvm/pci-assign.h b/hw/i386/kvm/pci-assign.h
new file mode 100644
index 0000000..91d00ea
--- /dev/null
+++ b/hw/i386/kvm/pci-assign.h
@@ -0,0 +1,109 @@
+/*
+ * Shared declarations for KVM PCI device assignment.
+ *
+ * These definitions were moved out of pci-assign.c so that other files in
+ * this directory can use AssignedDevice and the exported MSI-X helpers.
+ *
+ * NOTE(review): this header uses PCIDevice, MemoryRegion, pcibus_t, hwaddr
+ * and friends without including their definitions; presumably every
+ * includer pulls in "hw/pci/pci.h" first -- confirm.
+ */
+#ifndef HW_I386_KVM_PCI_ASSIGN_H
+#define HW_I386_KVM_PCI_ASSIGN_H
+
+#define MSIX_PAGE_SIZE 0x1000
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00002000  /* No side effects */
+#define IORESOURCE_MEM_64   0x00100000
+
+/* Uncomment to make DEBUG() print to stderr. */
+//#define DEVICE_ASSIGNMENT_DEBUG
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, ...)                                       \
+    do {                                                      \
+        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);  \
+    } while (0)
+#else
+#define DEBUG(fmt, ...)
+#endif
+
+/* One resource (BAR) of the host device. */
+typedef struct PCIRegion {
+    int type;                /* Memory or port I/O */
+    int valid;
+    uint64_t base_addr;
+    uint64_t size;           /* size of the region */
+    int resource_fd;
+} PCIRegion;
+
+/* Host-side view of the device: address, IRQ and its regions. */
+typedef struct PCIDevRegions {
+    uint8_t bus, dev, func;  /* Bus inside domain, device and function */
+    int irq;                 /* IRQ number */
+    uint16_t region_number;  /* number of active regions */
+
+    /* Port I/O or MMIO Regions */
+    PCIRegion regions[PCI_NUM_REGIONS - 1];
+    int config_fd;
+} PCIDevRegions;
+
+/* Guest-side mapping of one host region. */
+typedef struct AssignedDevRegion {
+    MemoryRegion container;
+    MemoryRegion real_iomem;
+    union {
+        uint8_t *r_virtbase; /* mmapped access address for memory regions */
+        uint32_t r_baseport; /* the base guest port for I/O regions */
+    } u;
+    pcibus_t e_size;         /* emulated size of region in bytes */
+    pcibus_t r_size;         /* real size of region in bytes */
+    PCIRegion *region;
+} AssignedDevRegion;
+
+#define ASSIGNED_DEVICE_PREFER_MSI_BIT  0
+#define ASSIGNED_DEVICE_SHARE_INTX_BIT  1
+
+#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
+#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT)
+
+/* Four 32-bit words per MSI-X table entry. */
+typedef struct MSIXTableEntry {
+    uint32_t addr_lo;
+    uint32_t addr_hi;
+    uint32_t data;
+    uint32_t ctrl;
+} MSIXTableEntry;
+
+typedef enum AssignedIRQType {
+    ASSIGNED_IRQ_NONE = 0,
+    ASSIGNED_IRQ_INTX_HOST_INTX,
+    ASSIGNED_IRQ_INTX_HOST_MSI,
+    ASSIGNED_IRQ_MSI,
+    ASSIGNED_IRQ_MSIX
+} AssignedIRQType;
+
+/* Per-device state of an assigned PCI device. */
+typedef struct AssignedDevice {
+    PCIDevice dev;
+    PCIHostDeviceAddress host;
+    uint32_t dev_id;
+    uint32_t features;
+    int intpin;
+    AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1];
+    PCIDevRegions real_device;
+    PCIINTxRoute intx_route;
+    AssignedIRQType assigned_irq_type;
+    struct {
+#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
+#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
+        uint32_t available;
+#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
+#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
+#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
+        uint32_t state;
+    } cap;
+    uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
+    uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
+    int msi_virq_nr;
+    int *msi_virq;
+    MSIXTableEntry *msix_table;
+    hwaddr msix_table_addr;
+    uint16_t msix_max;
+    MemoryRegion mmio;
+    char *configfd_name;
+    int32_t bootindex;
+} AssignedDevice;
+
+#define TYPE_PCI_ASSIGN "kvm-pci-assign"
+#define PCI_ASSIGN(obj) OBJECT_CHECK(AssignedDevice, (obj), TYPE_PCI_ASSIGN)
+
+/* Defined in pci-assign.c. */
+bool assigned_dev_msix_skipped(MSIXTableEntry *entry);
+void assigned_dev_update_msix(PCIDevice *pci_dev);
+
+#endif /* HW_I386_KVM_PCI_ASSIGN_H */
--
1.9.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [Qemu-devel] [RFC PATCH 2/3] Qemu: Add post_load_state() to run after restoring CPU state
2015-10-21 16:52 [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 1/3] Qemu: Add pci-assign.h to share functions and struct definition with new file Lan Tianyu
@ 2015-10-21 16:52 ` Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 3/3] Qemu: Introduce pci-sriov device type to support VF live migration Lan Tianyu
2015-10-21 18:39 ` [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Alex Williamson
3 siblings, 0 replies; 6+ messages in thread
From: Lan Tianyu @ 2015-10-21 16:52 UTC (permalink / raw)
To: amit.shah, eblake, eddie.dong, nrupal.jani, yang.z.zhang, agraf,
kvm, pbonzini, qemu-devel, emil.s.tantilov, ehabkost, lcapitulino,
lersek, mst, quintela, rth
Cc: Lan Tianyu
After migration, Qemu needs to trigger the mailbox irq to notify the VF
driver in the guest about the status change. Irq delivery only starts
working again after the CPU state has been restored. This patch adds a new
callback that runs after restoring the CPU state, which provides a way to
trigger the mailbox irq at that point.
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
include/migration/vmstate.h | 2 ++
migration/savevm.c | 15 +++++++++++++++
2 files changed, 17 insertions(+)
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 0695d7c..dc681a6 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -56,6 +56,8 @@ typedef struct SaveVMHandlers {
int (*save_live_setup)(QEMUFile *f, void *opaque);
uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
+ /* This runs after restoring CPU related state */
+ void (*post_load_state)(void *opaque);
LoadStateHandler *load_state;
} SaveVMHandlers;
diff --git a/migration/savevm.c b/migration/savevm.c
index 9e0e286..48b6223 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -702,6 +702,20 @@ bool qemu_savevm_state_blocked(Error **errp)
return false;
}
+/*
+ * Run the optional post_load_state() callback of every registered savevm
+ * handler, passing each entry's opaque pointer.
+ *
+ * The function has no prototype in any header and its only caller lives in
+ * this file, so it is given internal linkage.
+ */
+static void qemu_savevm_post_load(void)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        /* Entries without ops, or without this callback, are skipped. */
+        if (se->ops && se->ops->post_load_state) {
+            se->ops->post_load_state(se->opaque);
+        }
+    }
+}
+
+
void qemu_savevm_state_header(QEMUFile *f)
{
trace_savevm_state_header();
@@ -1140,6 +1154,7 @@ int qemu_loadvm_state(QEMUFile *f)
}
cpu_synchronize_all_post_init();
+ qemu_savevm_post_load();
ret = 0;
--
1.9.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [Qemu-devel] [RFC PATCH 3/3] Qemu: Introduce pci-sriov device type to support VF live migration
2015-10-21 16:52 [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 1/3] Qemu: Add pci-assign.h to share functions and struct definition with new file Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 2/3] Qemu: Add post_load_state() to run after restoring CPU state Lan Tianyu
@ 2015-10-21 16:52 ` Lan Tianyu
2015-10-21 18:39 ` [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Alex Williamson
3 siblings, 0 replies; 6+ messages in thread
From: Lan Tianyu @ 2015-10-21 16:52 UTC (permalink / raw)
To: amit.shah, eblake, eddie.dong, nrupal.jani, yang.z.zhang, agraf,
kvm, pbonzini, qemu-devel, emil.s.tantilov, ehabkost, lcapitulino,
lersek, mst, quintela, rth
Cc: Lan Tianyu
This patch migrates the VF state between the source and target machines
during migration.
There are three kinds of VF state involved:
1) PCI configuration space registers
2) MSI-X configuration
3) VF state kept in the PF driver
The PCI configuration space registers and the MSI-X configuration are
already stored in Qemu.
The VF state kept in the PF driver can be saved and restored via the new
sysfs node state_in_pf under the VF sysfs directory.
Qemu fakes the PCI configuration space register "0xF0" to let the VF driver
know the migration status. Qemu sets reg "0xF0" to 1 when migration starts
and sets it to 0 when migration completes. The VF driver tells Qemu it is
ready for migration by writing 1 to reg "0xF1".
Qemu notifies the VF driver about migration status changes by writing to the
new sysfs node notify_vf, which sends a mailbox msg to the VF driver.
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
hw/i386/kvm/Makefile.objs | 2 +-
hw/i386/kvm/pci-assign.c | 2 +-
hw/i386/kvm/sriov.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 215 insertions(+), 2 deletions(-)
create mode 100644 hw/i386/kvm/sriov.c
diff --git a/hw/i386/kvm/Makefile.objs b/hw/i386/kvm/Makefile.objs
index d8bce20..09324e9 100644
--- a/hw/i386/kvm/Makefile.objs
+++ b/hw/i386/kvm/Makefile.objs
@@ -1 +1 @@
-obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
+obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o sriov.o
diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index 616532d..84c5ff5 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -1770,7 +1770,7 @@ static void assign_class_init(ObjectClass *klass, void *data)
k->config_read = assigned_dev_pci_read_config;
k->config_write = assigned_dev_pci_write_config;
dc->props = assigned_dev_properties;
- dc->vmsd = &vmstate_assigned_device;
+// dc->vmsd = &vmstate_assigned_device;
dc->reset = reset_assigned_device;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
dc->desc = "KVM-based PCI passthrough";
diff --git a/hw/i386/kvm/sriov.c b/hw/i386/kvm/sriov.c
new file mode 100644
index 0000000..ac37035
--- /dev/null
+++ b/hw/i386/kvm/sriov.c
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include "hw/hw.h"
+#include "hw/i386/pc.h"
+#include "pci-assign.h"
+
+
+#define TYPE_PCI_SRIOV "pci-sriov"
+
+#define SRIOV_LM_SETUP 0x01
+#define SRIOV_LM_COMPLETE 0x02
+
+/*
+ * Read the VF state kept by the PF driver from the "state_in_pf" sysfs node
+ * of the assigned host device.
+ *
+ * @pdev: assigned device; its host address selects the sysfs directory
+ * @buf:  receives a g_malloc()ed buffer with the data.  The caller owns it
+ *        and must g_free() it.  Set to NULL on failure.
+ * @len:  receives the number of bytes read (at most 4096, 0 on failure)
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int pt_save_pf_buf(struct PCIDevice *pdev, unsigned char **buf,
+                          int *len)
+{
+    enum { PF_STATE_BUF_SIZE = 4096 };
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    unsigned char *data;
+    char file[128];
+    size_t nread;
+    FILE *f;
+
+    *buf = NULL;
+    *len = 0;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    f = fopen(file, "rb");
+    if (!f) {
+        /* Report the real cause rather than the unrelated EEXIST. */
+        return errno ? -errno : -ENOENT;
+    }
+
+    data = g_malloc(PF_STATE_BUF_SIZE);
+    nread = fread(data, 1, PF_STATE_BUF_SIZE, f);
+    if (ferror(f)) {
+        /* A short read caused by an I/O error must not pass as valid state. */
+        fclose(f);
+        g_free(data);
+        return -EIO;
+    }
+    fclose(f);
+
+    *buf = data;
+    *len = (int)nread;
+    return 0;
+}
+
+/*
+ * Write previously saved VF state back to the PF driver through the
+ * "state_in_pf" sysfs node of the assigned host device.
+ *
+ * @pdev: assigned device; its host address selects the sysfs directory
+ * @buf:  state bytes to write
+ * @len:  number of bytes in @buf; nothing is written when it is <= 0
+ *
+ * The function cannot return an error, so failures are reported on stderr.
+ */
+static void pt_restore_pf_buf(struct PCIDevice *pdev, unsigned char *buf, int len)
+{
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    FILE *f;
+
+    /* A negative length would turn into a huge size_t in fwrite(). */
+    if (!buf || len <= 0) {
+        return;
+    }
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("path: %s\n", file);
+    f = fopen(file, "wb");
+    if (!f) {
+        perror(file);
+        return;
+    }
+
+    if (fwrite(buf, 1, len, f) != (size_t)len) {
+        fprintf(stderr, "%s: short write of PF state\n", file);
+    }
+    /* stdio buffers the data, so the real write error surfaces at fclose(). */
+    if (fclose(f) != 0) {
+        perror(file);
+    }
+}
+
+/*
+ * post_load_state handler: write "1" to the "notify_vf" sysfs node of the
+ * assigned host device.
+ *
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * The callback returns void, so failures are reported on stderr instead of
+ * being dropped silently.
+ */
+static void assign_dev_post_load(void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    size_t written;
+    FILE *f;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("notify path %s\n", file);
+    f = fopen(file, "wb");
+    if (!f) {
+        perror(file);
+        return;
+    }
+
+    written = fwrite("1", 1, 1, f);
+    /* The buffered byte is flushed by fclose(), so check both results. */
+    if (fclose(f) != 0 || written != 1) {
+        fprintf(stderr, "%s: failed to notify VF\n", file);
+    }
+}
+
+/*
+ * load_state handler for the assigned VF.
+ *
+ * A section whose first byte is not SRIOV_LM_COMPLETE carries no further
+ * data for this handler and is accepted as-is.  Otherwise the section holds,
+ * in order: the PCI device state, a 32-bit count of MSI-X table entries
+ * followed by that many entries, and a 32-bit length followed by the PF
+ * driver state blob.
+ *
+ * @f:          migration stream
+ * @opaque:     the PCIDevice registered together with the handlers
+ * @version_id: unused
+ *
+ * Returns 0 on success or a negative errno value.  The entry count and blob
+ * length come from the stream and are validated before use.
+ */
+static int assign_dev_load(QEMUFile *f, void *opaque, int version_id)
+{
+    /* pt_save_pf_buf() never produces more than this many bytes. */
+    enum { PF_STATE_MAX_LEN = 4096 };
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    unsigned char *buf = NULL;
+    int ret, len, num;
+
+    if (qemu_get_byte(f) != SRIOV_LM_COMPLETE) {
+        return 0;
+    }
+
+    ret = pci_device_load(pdev, f);
+    if (ret) {
+        printf("pci config error %d\n", ret);
+        return ret;
+    }
+
+    qemu_get_sbe32s(f, &num);
+    /* Never copy more entries than the local MSI-X table can hold. */
+    if (num < 0 || num > adev->msix_max || (num > 0 && !adev->msix_table)) {
+        printf("invalid msix entry count %d\n", num);
+        return -EINVAL;
+    }
+    if (num > 0) {
+        qemu_get_buffer(f, (unsigned char *)adev->msix_table,
+                        num * PCI_MSIX_ENTRY_SIZE);
+    }
+    assigned_dev_update_msix(pdev);
+
+    len = qemu_get_be32(f);
+    /* The value is unsigned on the wire; reject wrapped or oversized ones. */
+    if (len < 0 || len > PF_STATE_MAX_LEN) {
+        printf("invalid pf state size %d\n", len);
+        return -EINVAL;
+    }
+    if (len) {
+        buf = g_malloc(len);
+        qemu_get_buffer(f, buf, len);
+        pt_restore_pf_buf(pdev, buf, len);
+        g_free(buf);
+    }
+
+    /* Clear the fake migration-status bytes at 0xf0 and 0xf1. */
+    pci_default_write_config(pdev, 0xf0, 0x00, 1);
+    pci_default_write_config(pdev, 0xf1, 0x00, 1);
+    return 0;
+}
+
+/*
+ * save_live_complete handler for the assigned VF.
+ *
+ * Emits, in order: the SRIOV_LM_COMPLETE marker, the PCI device state, a
+ * 32-bit count of MSI-X table entries followed by that many entries, and a
+ * 32-bit length followed by the PF driver state blob.
+ *
+ * The entries are sent as the leading part of the table, up to and including
+ * the last entry that is not skipped.  Sending only as many entries as there
+ * are non-skipped ones would send the wrong entries whenever a skipped entry
+ * precedes a used one, because the loader copies them back starting at
+ * index 0.
+ *
+ * @f:      migration stream
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * Returns 0 on success, -EFAULT if the PF state cannot be read.
+ */
+static int assign_dev_save_complete(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    int len = 0, entries_nr = 0;
+    unsigned char *buf = NULL;
+    int i;
+
+    qemu_put_byte(f, SRIOV_LM_COMPLETE);
+    pci_device_save(pdev, f);
+
+    for (i = 0; adev->msix_table && i < adev->msix_max; i++) {
+        if (!assigned_dev_msix_skipped(&adev->msix_table[i])) {
+            entries_nr = i + 1;
+        }
+    }
+
+    qemu_put_sbe32s(f, &entries_nr);
+    if (entries_nr > 0) {
+        qemu_put_buffer(f, (unsigned char *)adev->msix_table,
+                        entries_nr * PCI_MSIX_ENTRY_SIZE);
+    }
+
+    if (pt_save_pf_buf(pdev, &buf, &len)) {
+        return -EFAULT;
+    }
+
+    qemu_put_be32(f, len);
+    if (len) {
+        printf("pf state saved, size %d\n", len);
+        qemu_put_buffer(f, buf, len);
+    }
+    /* pt_save_pf_buf() hands over ownership of the buffer. */
+    g_free(buf);
+
+    return 0;
+}
+
+/*
+ * save_live_setup handler for the assigned VF.
+ *
+ * Sets the fake config byte 0xf0 to 1, writes "1" to the "notify_vf" sysfs
+ * node of the host device and emits the SRIOV_LM_SETUP marker.
+ *
+ * @f:      migration stream
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * Returns 0 on success, -EFAULT if the notification cannot be delivered.
+ * A failed write is treated like a failed open, because the pending handler
+ * waits on config byte 0xf1.
+ */
+static int assign_dev_setup(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    size_t written;
+    FILE *notify;
+
+    pci_default_write_config(pdev, 0xf0, 0x01, 1);
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    notify = fopen(file, "wb");
+    if (!notify) {
+        return -EFAULT;
+    }
+
+    written = fwrite("1", 1, 1, notify);
+    /* The buffered byte is flushed by fclose(), so check both results. */
+    if (fclose(notify) != 0 || written != 1) {
+        return -EFAULT;
+    }
+
+    printf("notify path %s\n", file);
+    qemu_put_byte(f, SRIOV_LM_SETUP);
+    return 0;
+}
+
+/*
+ * save_live_pending handler: reads the fake config byte 0xf1 and returns 0
+ * once it is non-zero, or max_size while it is still zero.
+ */
+static uint64_t assign_dev_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    struct PCIDevice *dev = (struct PCIDevice *)opaque;
+
+    if (pci_default_read_config(dev, 0xf1, 1)) {
+        return 0;
+    }
+    return max_size;
+}
+
+/* Migration callbacks registered for each pci-sriov device. */
+static SaveVMHandlers savevm_pt_handlers = {
+    /* save side */
+    .save_live_setup    = assign_dev_setup,
+    .save_live_pending  = assign_dev_save_pending,
+    .save_live_complete = assign_dev_save_complete,
+    /* load side */
+    .load_state         = assign_dev_load,
+    .post_load_state    = assign_dev_post_load,
+};
+
+/*
+ * Instance initializer: registers the live-migration handlers with the
+ * device itself as the opaque pointer.
+ *
+ * NOTE(review): the id string "pci-assign" and the two numeric arguments
+ * are the same for every instance -- confirm that this cannot collide when
+ * more than one pci-sriov device is created.
+ */
+static void sriov_pci_instance_init(Object *obj)
+{
+    register_savevm_live(NULL, "pci-assign", 1, 1,
+                         &savevm_pt_handlers, PCI_DEVICE(obj));
+}
+
+/* QOM type "pci-sriov", derived from the kvm-pci-assign device type. */
+static const TypeInfo sriov_pci_type_info = {
+    .parent        = TYPE_PCI_ASSIGN,
+    .name          = TYPE_PCI_SRIOV,
+    .instance_init = sriov_pci_instance_init,
+};
+
+/* Registers the pci-sriov type; hooked up through type_init() below. */
+static void sriov_register_types(void)
+{
+    type_register_static(&sriov_pci_type_info);
+}
+type_init(sriov_register_types)
--
1.9.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC
2015-10-21 16:52 [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Lan Tianyu
` (2 preceding siblings ...)
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 3/3] Qemu: Introduce pci-sriov device type to support VF live migration Lan Tianyu
@ 2015-10-21 18:39 ` Alex Williamson
2015-10-23 3:10 ` Lan Tianyu
3 siblings, 1 reply; 6+ messages in thread
From: Alex Williamson @ 2015-10-21 18:39 UTC (permalink / raw)
To: Lan Tianyu
Cc: emil.s.tantilov, kvm, mst, lersek, rth, quintela, eddie.dong,
agraf, qemu-devel, yang.z.zhang, nrupal.jani, amit.shah, pbonzini,
lcapitulino, ehabkost
On Thu, 2015-10-22 at 00:52 +0800, Lan Tianyu wrote:
> This patchset is Qemu part for live migration support for SRIOV NIC.
> kernel part patch information is in the following link.
> http://marc.info/?l=kvm&m=144544635330193&w=2
>
>
> Lan Tianyu (3):
> Qemu: Add pci-assign.h to share functions and struct definition with
> new file
> Qemu: Add post_load_state() to run after restoring CPU state
> Qemu: Introduce pci-sriov device type to support VF live migration
>
> hw/i386/kvm/Makefile.objs | 2 +-
> hw/i386/kvm/pci-assign.c | 113 +----------------------
> hw/i386/kvm/pci-assign.h | 109 +++++++++++++++++++++++
> hw/i386/kvm/sriov.c | 213 ++++++++++++++++++++++++++++++++++++++++++++
> include/migration/vmstate.h | 2 +
> migration/savevm.c | 15 ++++
> 6 files changed, 344 insertions(+), 110 deletions(-)
> create mode 100644 hw/i386/kvm/pci-assign.h
> create mode 100644 hw/i386/kvm/sriov.c
>
Hi Lan,
Seems like there are a couple immediate problems with this approach.
The first is that you're modifying legacy KVM device assignment, which
is deprecated upstream and not even enabled by some distros. VFIO is
the supported mechanism for doing PCI device assignment now and any
features like this need to be added there first. It's not only more
secure than legacy KVM device assignment, but it also doesn't limit this
to an x86-only solution. Surely you want to support 82599 VF migration
on other platforms as well.
Using sysfs to interact with the PF is also problematic since that means
that libvirt needs to grant qemu access to these files, adding one more
layer to the stack. If we were to use VFIO, we could potentially enable
this through a save-state region on the device file descriptor and if
necessary, virtual interrupt channels for the device as well. This of
course implies that the kernel internal channels are made as general as
possible in order to support any PF driver.
That said, there are some nice features here. Using unused PCI config
bytes to communicate with the guest driver and enable guest-based page
dirtying is a nice hack. However, if we want to add this capability to
other devices, we're not always going to be able to use fixed addresses
0xf0 and 0xf1. I would suggest that we probably want to create a
virtual capability in the config space of the VF, perhaps a Vendor
Specific capability. Obviously some devices won't have room for a full
capability in the standard config space, so we may need to optionally
expose it in extended config space. Those devices would be limited to
only supporting migration in PCI-e configurations in the guest. Also,
plenty of devices make use of undefined PCI config space, so we may not
be able to simply add a capability to a region we think is unused, maybe
it needs to happen through reserved space in another capability or
perhaps defining a virtual BAR that unenlightened guest drivers would
ignore. The point is that we somehow need to standardize that, rather
than implicitly knowing that it's at 0xf0/0xf1 on 82599 VFs.
Also, I haven't looked at the kernel-side patches yet, but the saved
state received from and loaded into the PF driver needs to be versioned
and maybe we need some way to know whether versions are compatible.
Migration version information is difficult enough for QEMU, it's a
completely foreign concept in the kernel. Thanks,
Alex
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC
2015-10-21 18:39 ` [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Alex Williamson
@ 2015-10-23 3:10 ` Lan Tianyu
0 siblings, 0 replies; 6+ messages in thread
From: Lan Tianyu @ 2015-10-23 3:10 UTC (permalink / raw)
To: Alex Williamson
Cc: emil.s.tantilov, kvm, mst, lersek, rth, quintela, eddie.dong,
agraf, qemu-devel, yang.z.zhang, nrupal.jani, amit.shah, pbonzini,
lcapitulino, ehabkost
[-- Attachment #1: Type: text/plain, Size: 3949 bytes --]
On 2015年10月22日 02:39, Alex Williamson wrote:
> On Thu, 2015-10-22 at 00:52 +0800, Lan Tianyu wrote:
>> This patchset is Qemu part for live migration support for SRIOV NIC.
>> kernel part patch information is in the following link.
>> http://marc.info/?l=kvm&m=144544635330193&w=2
>>
>>
>> Lan Tianyu (3):
>> Qemu: Add pci-assign.h to share functions and struct definition with
>> new file
>> Qemu: Add post_load_state() to run after restoring CPU state
>> Qemu: Introduce pci-sriov device type to support VF live migration
>>
>> hw/i386/kvm/Makefile.objs | 2 +-
>> hw/i386/kvm/pci-assign.c | 113 +----------------------
>> hw/i386/kvm/pci-assign.h | 109 +++++++++++++++++++++++
>> hw/i386/kvm/sriov.c | 213 ++++++++++++++++++++++++++++++++++++++++++++
>> include/migration/vmstate.h | 2 +
>> migration/savevm.c | 15 ++++
>> 6 files changed, 344 insertions(+), 110 deletions(-)
>> create mode 100644 hw/i386/kvm/pci-assign.h
>> create mode 100644 hw/i386/kvm/sriov.c
>>
> Hi Lan,
Hi Alex:
Thanks a lot for your comments. It's very helpful.
>
> Seems like there are a couple immediate problems with this approach.
> The first is that you're modifying legacy KVM device assignment, which
> is deprecated upstream and not even enabled by some distros. VFIO is
> the supported mechanism for doing PCI device assignment now and any
> features like this need to be added there first. It's not only more
> secure than legacy KVM device assignment, but it also doesn't limit this
> to an x86-only solution. Surely you want to support 82599 VF migration
> on other platforms as well.
Yes, we will switch to VFIO; we just used the legacy mode to show our
idea as soon as possible.
>
> Using sysfs to interact with the PF is also problematic since that means
> that libvirt needs to grant qemu access to these files, adding one more
> layer to the stack. If we were to use VFIO, we could potentially enable
> this through a save-state region on the device file descriptor and if
> necessary, virtual interrupt channels for the device as well. This of
> course implies that the kernel internal channels are made as general as
> possible in order to support any PF driver.
This sounds reasonable.
>
> That said, there are some nice features here. Using unused PCI config
> bytes to communicate with the guest driver and enable guest-based page
> dirtying is a nice hack. However, if we want to add this capability to
> other devices, we're not always going to be able to use fixed addresses
> 0xf0 and 0xf1. I would suggest that we probably want to create a
> virtual capability in the config space of the VF, perhaps a Vendor
> Specific capability. Obviously some devices won't have room for a full
> capability in the standard config space, so we may need to optionally
> expose it in extended config space. Those device would be limited to
> only supporting migration in PCI-e configurations in the guest. Also,
> plenty of devices make use of undefined PCI config space, so we may not
> be able to simply add a capability to a region we think is unused, maybe
> it needs to happen through reserved space in another capability or
> perhaps defining a virtual BAR that unenlightened guest drivers would
> ignore. The point is that we somehow need to standardize that so that
> rather than implicitly know that it's at 0xf0/0xf1 on 82599 VFs.
Yes, we used "0xF0" and "0xF1" to show the idea, and it needs more
effort to find a suitable place. We will research this more.
>
> Also, I haven't looked at the kernel-side patches yet, but the saved
> state received from and loaded into the PF driver needs to be versioned
> and maybe we need some way to know whether versions are compatible.
> Migration version information is difficult enough for QEMU, it's a
> completely foreign concept in the kernel. Thanks,
Good point. Will add it into next version.
>
> Alex
>
--
Best regards
Tianyu Lan
[-- Attachment #2: Type: text/html, Size: 5213 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-10-23 3:21 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-21 16:52 [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 1/3] Qemu: Add pci-assign.h to share functions and struct definition with new file Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 2/3] Qemu: Add post_load_state() to run after restoring CPU state Lan Tianyu
2015-10-21 16:52 ` [Qemu-devel] [RFC PATCH 3/3] Qemu: Introduce pci-sriov device type to support VF live migration Lan Tianyu
2015-10-21 18:39 ` [Qemu-devel] [RFC PATCH 0/3] Qemu/IXGBE: Add live migration support for SRIOV NIC Alex Williamson
2015-10-23 3:10 ` Lan Tianyu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).