[Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2
@ 2016-04-05 11:41 Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
                   ` (11 more replies)
  0 siblings, 12 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

v5-v6:
   1. register the resume handler in both QEMU and the kernel to ensure
      the resets happen in order.
   2. fix patches 6/12 and 7/12 as suggested by Alex and MST.

v4-v5:
   1. add back the common function 0 hotplug code in pci core.
   2. fix a sporadic problem where a device gets stuck in D3 during AER recovery.
   3. fix patches 5/12 ~ 9/12 as suggested by Alex.

v3-v4:
   1. rebase patchset to fit latest master branch.
   2. modify patches 5/10 ~ 8/10 as suggested by Alex (thanks).

v2-v3:
   1. fix patches 4/9 and 5/9 as suggested by Alex.
   2. patches 5/9 ~ 8/9 now enforce that all vfio functions are
      combined in the same way as on the host.

v1-v2:
   1. in patch 5/11, require that all devices on the same bus in the guest
      are also on the same bus in the host.
   2. patches 05/11 ~ 09/11 have been changed.


Chen Fan (12):
  vfio: extract vfio_get_hot_reset_info as a single function
  vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  vfio: add pcie extended capability support
  vfio: add aer support for vfio device
  vfio: refine function vfio_pci_host_match
  vfio: add check host bus reset is support or not
  pci: add a pci_function_is_valid callback to check function if valid
  vfio: add check aer functionality for hotplug device
  vfio: vote the function 0 to do host bus reset when aer occurred
  vfio-pci: pass the aer error to guest
  vfio: register aer resume notification handler for aer resume
  vfio: add 'aer' property to expose aercap

 hw/pci/pci.c               |  49 +++
 hw/vfio/pci.c              | 738 +++++++++++++++++++++++++++++++++++++++++----
 hw/vfio/pci.h              |   7 +
 include/hw/pci/pci.h       |   1 +
 linux-headers/linux/vfio.h |   1 +
 5 files changed, 730 insertions(+), 66 deletions(-)

-- 
1.9.3

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 01/12] vfio: extract vfio_get_hot_reset_info as a single function
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

This function is used to get the devices affected by a bus reset.
Extract it here so that it can also be used for AER soon.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 66 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d091d8c..cf40f9e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1701,6 +1701,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+/*
+ * return negative with errno, return 0 on success.
+ * if success, the point of ret_info fill with the affected device reset info.
+ *
+ */
+static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
+                                   struct vfio_pci_hot_reset_info **ret_info)
+{
+    struct vfio_pci_hot_reset_info *info;
+    int ret, count;
+
+    *ret_info = NULL;
+
+    info = g_malloc0(sizeof(*info));
+    info->argsz = sizeof(*info);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret && errno != ENOSPC) {
+        ret = -errno;
+        goto error;
+    }
+
+    count = info->count;
+
+    info = g_realloc(info, sizeof(*info) +
+                     (count * sizeof(struct vfio_pci_dependent_device)));
+    info->argsz = sizeof(*info) +
+                  (count * sizeof(struct vfio_pci_dependent_device));
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret) {
+        ret = -errno;
+        error_report("vfio: hot reset info failed: %m");
+        goto error;
+    }
+
+    *ret_info = info;
+    info = NULL;
+
+    return 0;
+error:
+    g_free(info);
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1842,7 +1887,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
-    struct vfio_pci_hot_reset_info *info;
+    struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
     struct vfio_pci_hot_reset *reset;
     int32_t *fds;
@@ -1854,12 +1899,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     vfio_pci_pre_reset(vdev);
     vdev->vbasedev.needs_reset = false;
 
-    info = g_malloc0(sizeof(*info));
-    info->argsz = sizeof(*info);
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret && errno != ENOSPC) {
-        ret = -errno;
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
         if (!vdev->has_pm_reset) {
             error_report("vfio: Cannot reset device %s, "
                          "no available reset mechanism.", vdev->vbasedev.name);
@@ -1867,18 +1908,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    count = info->count;
-    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
-    info->argsz = sizeof(*info) + (count * sizeof(*devices));
     devices = &info->devices[0];
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret) {
-        ret = -errno;
-        error_report("vfio: hot reset info failed: %m");
-        goto out_single;
-    }
-
     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 
     /* Verify that we have all the groups required */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 03/12] vfio: add pcie extended capability support Cao jin
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Factor out vfio_pci_do_hot_reset so it can do a host bus reset during AER recovery.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index cf40f9e..1ad47ef 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1746,6 +1746,48 @@ error:
     return ret;
 }
 
+static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
+                                 struct vfio_pci_hot_reset_info *info)
+{
+    VFIOGroup *group;
+    struct vfio_pci_hot_reset *reset;
+    int32_t *fds;
+    int ret, i, count;
+    struct vfio_pci_dependent_device *devices;
+
+    /* Determine how many group fds need to be passed */
+    count = 0;
+    devices = &info->devices[0];
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                count++;
+                break;
+            }
+        }
+    }
+
+    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+    fds = &reset->group_fds[0];
+
+    /* Fill in group fds */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                fds[reset->count++] = group->fd;
+                break;
+            }
+        }
+    }
+
+    /* Bus reset! */
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+    g_free(reset);
+
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1889,9 +1931,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     VFIOGroup *group;
     struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
-    struct vfio_pci_hot_reset *reset;
-    int32_t *fds;
-    int ret, i, count;
+    int ret, i;
     bool multi = false;
 
     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
@@ -1969,34 +2009,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    /* Determine how many group fds need to be passed */
-    count = 0;
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                count++;
-                break;
-            }
-        }
-    }
-
-    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
-    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
-    fds = &reset->group_fds[0];
-
-    /* Fill in group fds */
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                fds[reset->count++] = group->fd;
-                break;
-            }
-        }
-    }
-
-    /* Bus reset! */
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
-    g_free(reset);
+    ret = vfio_pci_do_hot_reset(vdev, info);
 
     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                     ret ? "%m" : "Success");
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 03/12] vfio: add pcie extended capability support
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 04/12] vfio: add aer support for vfio device Cao jin
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For a vfio PCIe device, we can expose the extended capabilities on a
PCIe bus. Because a new PCIe capability is always added at the tail of
the chain, we parse the extended caps from a copy of the config space
to avoid overwriting it, and then rebuild the PCIe extended config space.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1ad47ef..ff14af0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1528,6 +1528,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
     return next - pos;
 }
 
+
+static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
+{
+    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
+
+    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
+        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
+        if (tmp > pos && tmp < next) {
+            next = tmp;
+        }
+    }
+
+    return next - pos;
+}
+
 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
 {
     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
@@ -1862,16 +1877,71 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint32_t header;
+    uint16_t cap_id, next, size;
+    uint8_t cap_ver;
+    uint8_t *config;
+
+    /*
+     * pcie_add_capability always inserts the new capability at the tail
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we cache the config space to avoid overwriting
+     * the original config space when we parse the extended capabilities.
+     */
+    config = g_memdup(pdev->config, vdev->config_size);
+
+    for (next = PCI_CONFIG_SPACE_SIZE; next;
+         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
+        header = pci_get_long(config + next);
+        cap_id = PCI_EXT_CAP_ID(header);
+        cap_ver = PCI_EXT_CAP_VER(header);
+
+        /*
+         * If it becomes important to configure extended capabilities to their
+         * actual size, use this as the default when it's something we don't
+         * recognize. Since QEMU doesn't actually handle many of the config
+         * accesses, exact size doesn't seem worthwhile.
+         */
+        size = vfio_ext_cap_max_size(config, next);
+
+        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+
+        /* Use emulated next pointer to allow dropping extended caps */
+        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
+                                   PCI_EXT_CAP_NEXT_MASK);
+    }
+
+    g_free(config);
+    return 0;
+}
+
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
+    int ret;
 
     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
         !pdev->config[PCI_CAPABILITY_LIST]) {
         return 0; /* Nothing to add */
     }
 
-    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    if (ret) {
+        return ret;
+    }
+
+    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
+    if (!pci_is_express(pdev) ||
+        !pci_bus_is_express(pdev->bus) ||
+        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+        return 0;
+    }
+
+    return vfio_add_ext_cap(vdev);
 }
 
 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 04/12] vfio: add aer support for vfio device
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (2 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 03/12] vfio: add pcie extended capability support Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 05/12] vfio: refine function vfio_pci_host_match Cao jin
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Call pcie_aer_init to initialize the AER-related registers for the
vfio device, then reload the related physical registers to expose
the device capability.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/vfio/pci.h |  3 +++
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ff14af0..0516d94 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1877,6 +1877,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
+                          int pos, uint16_t size)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    PCIDevice *dev_iter;
+    uint8_t type;
+    uint32_t errcap;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
+                            cap_ver, pos, size);
+        return 0;
+    }
+
+    dev_iter = pci_bridge_get_device(pdev->bus);
+    if (!dev_iter) {
+        goto error;
+    }
+
+    while (dev_iter) {
+        if (!pci_is_express(dev_iter)) {
+            goto error;
+        }
+
+        type = pcie_cap_get_type(dev_iter);
+        if ((type != PCI_EXP_TYPE_ROOT_PORT &&
+             type != PCI_EXP_TYPE_UPSTREAM &&
+             type != PCI_EXP_TYPE_DOWNSTREAM)) {
+            goto error;
+        }
+
+        if (!dev_iter->exp.aer_cap) {
+            goto error;
+        }
+
+        dev_iter = pci_bridge_get_device(dev_iter->bus);
+    }
+
+    errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
+    /*
+     * The ability to record multiple headers is depending on
+     * the state of the Multiple Header Recording Capable bit and
+     * enabled by the Multiple Header Recording Enable bit.
+     */
+    if ((errcap & PCI_ERR_CAP_MHRC) &&
+        (errcap & PCI_ERR_CAP_MHRE)) {
+        pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+    } else {
+        pdev->exp.aer_log.log_max = 0;
+    }
+
+    pcie_cap_deverr_init(pdev);
+    return pcie_aer_init(pdev, pos, size);
+
+error:
+    error_report("vfio: Unable to enable AER for device %s, parent bus "
+                 "does not support AER signaling", vdev->vbasedev.name);
+    return -1;
+}
+
 static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1884,6 +1944,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
     uint16_t cap_id, next, size;
     uint8_t cap_ver;
     uint8_t *config;
+    int ret = 0;
 
     /*
      * pcie_add_capability always inserts the new capability at the tail
@@ -1907,16 +1968,29 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
          */
         size = vfio_ext_cap_max_size(config, next);
 
-        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
-        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+        switch (cap_id) {
+        case PCI_EXT_CAP_ID_ERR:
+            ret = vfio_setup_aer(vdev, cap_ver, next, size);
+            break;
+        default:
+            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+            break;
+        }
+
+        if (ret) {
+            goto out;
+        }
+
+        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
 
         /* Use emulated next pointer to allow dropping extended caps */
         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                    PCI_EXT_CAP_NEXT_MASK);
     }
 
+out:
     g_free(config);
-    return 0;
+    return ret;
 }
 
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
@@ -2673,6 +2747,11 @@ static int vfio_initfn(PCIDevice *pdev)
         goto out_teardown;
     }
 
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        !pdev->exp.aer_cap) {
+        goto out_teardown;
+    }
+
     /* QEMU emulates all of MSI & MSIX */
     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3976f68..7b3924e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -128,6 +129,8 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_AER_BIT 2
+#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
     int32_t bootindex;
     uint8_t pm_cap;
     bool has_vga;
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 05/12] vfio: refine function vfio_pci_host_match
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (3 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 04/12] vfio: add aer support for vfio device Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 06/12] vfio: add check host bus reset is support or not Cao jin
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0516d94..5b23a86 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+    if (strlen(name) != 12 ||
+        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+               &addr->bus, &addr->slot, &addr->function) != 4) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 {
-    char tmp[13];
+    PCIHostDeviceAddress tmp;
 
-    sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
-            addr->bus, addr->slot, addr->function);
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
 
-    return (strcmp(tmp, name) == 0);
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot && tmp.function == addr->function);
 }
 
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 06/12] vfio: add check host bus reset is support or not
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (4 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 05/12] vfio: refine function vfio_pci_host_match Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error.  QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM.  A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restrictions on the VM:

 - All physical devices affected by a bus reset must be assigned to
   the VM with AER enabled on each and be configured on the same
   virtual bus in the VM.

 - No devices unaffected by the bus reset, be they physical, emulated,
   or paravirtual, may be configured on the same virtual bus as a
   device supporting AER signaling through vfio.

In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device.  The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/vfio/pci.h |   1 +
 2 files changed, 258 insertions(+), 23 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5b23a86..d94e643 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1716,6 +1716,41 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+    if (strlen(name) != 12 ||
+        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+               &addr->bus, &addr->slot, &addr->function) != 4) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot && tmp.function == addr->function);
+}
+
+static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr, const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot);
+}
+
 /*
  * return negative with errno, return 0 on success.
  * if success, the point of ret_info fill with the affected device reset info.
@@ -1877,6 +1912,203 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+/*
+ * Calculate the max function number on specified bus.
+ * if the bridge is not pcie bridge or support ARI, return
+ * 255, otherwise return 8.
+ */
+static int vfio_pci_bus_devfn_limit(PCIBus *bus)
+{
+    PCIDevice *br;
+
+    br = pci_bridge_get_device(bus);
+    if (!br ||
+        !pci_bus_is_express(bus) ||
+        pcie_cap_is_arifwd_enabled(br)) {
+        return 255;
+    }
+
+    return 8;
+}
+
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+    PCIBus *bus = vdev->pdev.bus;
+    struct vfio_pci_hot_reset_info *info = NULL;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+    int ret, i, devfn, devfn_limit;
+
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
+        error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                   " device does not support hot reset.",
+                   vdev->vbasedev.name);
+        return;
+    }
+
+    /* List all affected devices by bus reset */
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+        VFIOPCIDevice *tmp;
+        VFIODevice *vbasedev_iter;
+        bool found = false;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        /* Skip the current device */
+        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+            continue;
+        }
+
+        /* Ensure we own the group of the affected device */
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "depends on group %d which is not owned.",
+                       vdev->vbasedev.name, devices[i].group_id);
+            goto out;
+        }
+
+        /* Ensure affected devices for reset on the same bus */
+        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                /*
+                 * AER errors may be broadcast to all functions of a multi-
+                 * function endpoint.  If any of those sibling functions are
+                 * also assigned, they need to have AER enabled or else an
+                 * error may continue to cause a vm_stop condition.  IOW,
+                 * AER setup of this function would be pointless.
+                 */
+                if (vfio_pci_host_match_slot(&host, vdev->vbasedev.name) &&
+                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot"
+                               " the dependent device %s which does not enable AER.",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+
+                if (tmp->pdev.bus != bus) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                               "the dependent device %s is not on the same bus",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+                found = true;
+                break;
+            }
+        }
+
+        /* Ensure all affected devices assigned to VM */
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "the dependent device %04x:%02x:%02x.%x "
+                       "is not assigned to VM.",
+                       vdev->vbasedev.name, host.domain, host.bus,
+                       host.slot, host.function);
+            goto out;
+        }
+    }
+
+    /*
+     * The above code verified that all devices affected by a bus reset
+     * exist on the same bus in the VM.  To further simplify, we also
+     * require that there are no additional devices beyond those existing on
+     * the VM bus.
+     */
+    devfn_limit = vfio_pci_bus_devfn_limit(bus);
+    for (devfn = 0; devfn < devfn_limit; devfn++) {
+        VFIOPCIDevice *tmp;
+        PCIDevice *dev;
+        bool found = false;
+
+        dev = pci_find_device(bus, pci_bus_num(bus), devfn);
+
+        if (!dev) {
+            continue;
+        }
+
+        if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, device"
+                             " %s: VM address %02x.%d cannot be configured"
+                             " on the same virtual bus",
+                             vdev->vbasedev.name, dev->name,
+                             PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+            goto out;
+        }
+
+        tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+        for (i = 0; i < info->count; i++) {
+            PCIHostDeviceAddress host;
+
+            host.domain = devices[i].segment;
+            host.bus = devices[i].bus;
+            host.slot = PCI_SLOT(devices[i].devfn);
+            host.function = PCI_FUNC(devices[i].devfn);
+
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, vfio-pci"
+                             " device %s at VM address %02x.%d cannot be"
+                             " configured on the same virtual bus",
+                             vdev->vbasedev.name, tmp->vbasedev.name,
+                             PCI_SLOT(tmp->pdev.devfn), PCI_FUNC(tmp->pdev.devfn));
+            goto out;
+        }
+    }
+
+out:
+    g_free(info);
+    return;
+}
+
+static void vfio_aer_check_host_bus_reset(Error **errp)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+    VFIOPCIDevice *vdev;
+    Error *local_err = NULL;
+
+    /* Check All vfio-pci devices if have bus reset capability */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+                vfio_check_hot_bus_reset(vdev, &local_err);
+                if (local_err) {
+                    error_propagate(errp, local_err);
+                    return;
+                }
+            }
+        }
+    }
+
+    return;
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
                           int pos, uint16_t size)
 {
@@ -2060,29 +2292,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
-static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
-{
-    if (strlen(name) != 12 ||
-        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
-               &addr->bus, &addr->slot, &addr->function) != 4) {
-        return -EINVAL;
-    }
-
-    return 0;
-}
-
-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
-{
-    PCIHostDeviceAddress tmp;
-
-    if (vfio_pci_name_to_addr(name, &tmp)) {
-        return false;
-    }
-
-    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
-            tmp.slot == addr->slot && tmp.function == addr->function);
-}
-
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
@@ -2589,6 +2798,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
+{
+    Error *local_err = NULL;
+
+    vfio_aer_check_host_bus_reset(&local_err);
+    if (local_err) {
+        fprintf(stderr, "%s\n", error_get_pretty(local_err));
+        error_free(local_err);
+        exit(1);
+    }
+}
+
+static Notifier machine_notifier = {
+    .notify = vfio_pci_machine_done_notify,
+};
+
 static int vfio_initfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -2934,6 +3159,15 @@ static const TypeInfo vfio_pci_dev_info = {
 static void register_vfio_pci_dev_type(void)
 {
     type_register_static(&vfio_pci_dev_info);
+
+    /*
+     * The AER configuration may depend on multiple devices, so we cannot
+     * validate consistency after each device is initialized.  We can only
+     * depend on function initialization order (function 0 last) for hotplug
+     * devices, therefore a machine-init-done notifier is used to validate
+     * the configuration after all cold-plug devices are processed.
+     */
+     qemu_add_machine_init_done_notifier(&machine_notifier);
 }
 
 type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 7b3924e..db7c6d5 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (5 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 06/12] vfio: add check host bus reset is support or not Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 08/12] vfio: add check aer functionality for hotplug device Cao jin
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

PCI hotplug requires that function 0 be added last to close the
slot.  Since vfio now supports AER, and AER requires that the VM bus
contain the same set of devices as the host bus, we can perform an
AER validation test whenever function 0 is hot-added in the VM.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/pci/pci.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci/pci.h |  1 +
 2 files changed, 50 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e67664d..9dcd7d5 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1840,6 +1840,40 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
     return bus->devices[devfn];
 }
 
+/* Run is_valid_func for each populated function starting at d->devfn. */
+static void pci_functions_validate(PCIBus *bus, PCIDevice *d, Error **errp)
+{
+    PCIDevice *br = pci_bridge_get_device(bus);
+    PCIDeviceClass *pc;
+    PCIDevice *dev;
+    int devfn;
+    int limit = d->devfn + PCI_FUNC_MAX;
+    Error *local_err = NULL;
+
+    if (br && pci_bus_is_express(bus) && pcie_cap_is_arifwd_enabled(br)) {
+        limit = ARRAY_SIZE(bus->devices);
+    }
+
+    for (devfn = d->devfn; devfn < limit; devfn++) {
+        dev = pci_find_device(bus, pci_bus_num(bus), devfn);
+        if (!dev) {
+            continue;
+        }
+
+        pc = PCI_DEVICE_GET_CLASS(dev);
+        if (!pc->is_valid_func) {
+            continue;
+        }
+
+        /* The hook belongs to dev's class, so it must be handed dev, not d */
+        pc->is_valid_func(dev, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
     PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1882,6 +1916,21 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
         pci_qdev_unrealize(DEVICE(pci_dev), NULL);
         return;
     }
+
+    /*
+     * Hot-adding function 0 indicates the closure of the slot, which
+     * gives us the chance to check whether all functions on the same
+     * device are valid.
+     */
+    if (DEVICE(pci_dev)->hotplugged &&
+        pci_get_function_0(pci_dev) == pci_dev) {
+        pci_functions_validate(bus, pci_dev, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+            return;
+        }
+    }
 }
 
 static void pci_default_realize(PCIDevice *dev, Error **errp)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 0be07c8..4a2f7d4 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
 
     void (*realize)(PCIDevice *dev, Error **errp);
     int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
+    void (*is_valid_func)(PCIDevice *dev, Error **errp);
     PCIUnregisterFunc *exit;
     PCIConfigReadFunc *config_read;
     PCIConfigWriteFunc *config_write;
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 08/12] vfio: add check aer functionality for hotplug device
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (6 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
@ 2016-04-05 11:41 ` Cao jin
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

When function 0 is hot-added, we can check whether the vfio device
supports hot bus reset.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d94e643..16a4a3b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3088,6 +3088,19 @@ post_reset:
     vfio_pci_post_reset(vdev);
 }
 
+static void vfio_pci_is_valid(PCIDevice *dev, Error **errp)
+{
+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+    Error *local_err = NULL;
+
+    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+        vfio_check_hot_bus_reset(vdev, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+        }
+    }
+}
+
 static void vfio_instance_init(Object *obj)
 {
     PCIDevice *pci_dev = PCI_DEVICE(obj);
@@ -3142,6 +3155,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->init = vfio_initfn;
     pdc->exit = vfio_exitfn;
+    pdc->is_valid_func = vfio_pci_is_valid;
     pdc->config_read = vfio_pci_read_config;
     pdc->config_write = vfio_pci_write_config;
     pdc->is_express = 1; /* We might be */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 09/12] vfio: vote the function 0 to do host bus reset when aer occurred
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (7 preceding siblings ...)
  2016-04-05 11:41 ` [Qemu-devel] [patch v6 08/12] vfio: add check aer functionality for hotplug device Cao jin
@ 2016-04-05 11:42 ` Cao jin
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 10/12] vfio-pci: pass the aer error to guest Cao jin
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

When AER is enabled, all devices are assigned to the VM in the same
way as on the host, so we can easily do the hot reset by selecting
function #0 to perform it.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 14 ++++++++++++++
 hw/vfio/pci.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 16a4a3b..5142707 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1950,6 +1950,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
     /* List all affected devices by bus reset */
     devices = &info->devices[0];
 
+    vdev->single_depend_dev = (info->count == 1);
+
     /* Verify that we have all the groups required */
     for (i = 0; i < info->count; i++) {
         PCIHostDeviceAddress host;
@@ -3059,6 +3061,18 @@ static void vfio_pci_reset(DeviceState *dev)
 
     trace_vfio_pci_reset(vdev->vbasedev.name);
 
+    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+        PCIDevice *br = pci_bridge_get_device(pdev->bus);
+
+        if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
+             PCI_BRIDGE_CTL_BUS_RESET)) {
+            if (pci_get_function_0(pdev) == pdev) {
+                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+            }
+            return;
+        }
+    }
+
     vfio_pci_pre_reset(vdev);
 
     if (vdev->resetfn && !vdev->resetfn(vdev)) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index db7c6d5..9fb0206 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -143,6 +143,7 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_intx;
     bool no_kvm_msi;
     bool no_kvm_msix;
+    bool single_depend_dev;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 10/12] vfio-pci: pass the aer error to guest
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (8 preceding siblings ...)
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
@ 2016-04-05 11:42 ` Cao jin
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume Cao jin
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 12/12] vfio: add 'aer' property to expose aercap Cao jin
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, resulting in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and let the guest driver
recover from the error.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5142707..691ff5e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2610,18 +2610,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    Error *local_err = NULL;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
     }
 
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        goto stop;
+    }
+
+    /*
+     * in case the real hardware configuration has been changed,
+     * here we should recheck the bus reset capability.
+     */
+    vfio_check_hot_bus_reset(vdev, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        goto stop;
+    }
+
+    /*
+     * we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+    if (dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        /*
+         * if the error is not emitted by this device, we can
+         * just ignore it.
+         */
+        if (!(uncor_status & ~0UL)) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
     /*
-     * TBD. Retrieve the error details and decide what action
-     * needs to be taken. One of the actions could be to pass
-     * the error to the guest and have the guest driver recover
-     * from the error. This requires that PCIe capabilities be
-     * exposed to the guest. For now, we just terminate the
-     * guest to contain the error.
+     * If the aer capability is not exposed to the guest, we just
+     * terminate the guest to contain the error.
      */
 
     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (9 preceding siblings ...)
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 10/12] vfio-pci: pass the aer error to guest Cao jin
@ 2016-04-05 11:42 ` Cao jin
  2016-04-11 21:38   ` Alex Williamson
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 12/12] vfio: add 'aer' property to expose aercap Cao jin
  11 siblings, 1 reply; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

To support AER recovery, the host and the guest both run the same AER
recovery code, which performs a secondary bus reset if the error is
fatal.  The AER recovery process is:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

This means that the host performs a secondary bus reset on the
physical devices under the bus in step 2, which leaves the devices in
D3 state for a short time.  QEMU, however, registers an error-detected
handler that is invoked when the host broadcasts the error-detected
event in step 1.  If the guest were to perform reset_link while the
host is performing reset_link at the same time, a fatal error may
result.  To avoid this, we introduce a resume notifier to ensure that
the host reset has completed, and only then inject the AER error into
the guest.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c              | 157 +++++++++++++++++++++++++++++++++++----------
 hw/vfio/pci.h              |   2 +
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 126 insertions(+), 34 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 691ff5e..d79fb3d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2610,12 +2610,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
-    PCIDevice *dev = &vdev->pdev;
     Error *local_err = NULL;
-    PCIEAERMsg msg = {
-        .severity = 0,
-        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
-    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
@@ -2636,35 +2631,7 @@ static void vfio_err_notifier_handler(void *opaque)
         goto stop;
     }
 
-    /*
-     * we should read the error details from the real hardware
-     * configuration spaces, here we only need to do is signaling
-     * to guest an uncorrectable error has occurred.
-     */
-    if (dev->exp.aer_cap) {
-        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
-        uint32_t uncor_status;
-        bool isfatal;
-
-        uncor_status = vfio_pci_read_config(dev,
-                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
-
-        /*
-         * if the error is not emitted by this device, we can
-         * just ignore it.
-         */
-        if (!(uncor_status & ~0UL)) {
-            return;
-        }
-
-        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
-
-        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
-                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
-
-        pcie_aer_msg(dev, &msg);
-        return;
-    }
+    return;
 
 stop:
     /*
@@ -2757,6 +2724,126 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
     event_notifier_cleanup(&vdev->err_notifier);
 }
 
+static void vfio_resume_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
+
+    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
+        return;
+    }
+
+    /*
+     * we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+    if (dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        /*
+         * if the error is not emitted by this device, we can
+         * just ignore it.
+         */
+        if (!(uncor_status & ~0UL)) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+    }
+}
+
+static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        return;
+    }
+
+    if (event_notifier_init(&vdev->resume_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for"
+                     " resume notification");
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
+    qemu_set_fd_handler(*pfd, vfio_resume_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up resume notification");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->resume_notifier);
+    } else {
+        vdev->resume_enabled = true;
+    }
+    g_free(irq_set);
+}
+
+/* Detach the resume eventfd from the device and tear down its handler. */
+static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    if (!vdev->resume_enabled) {
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1; /* -1 de-assigns the eventfd */
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign resume fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->resume_notifier);
+
+    vdev->resume_enabled = false;
+}
+
 static void vfio_req_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
@@ -3062,6 +3149,7 @@ static int vfio_initfn(PCIDevice *pdev)
     }
 
     vfio_register_err_notifier(vdev);
+    vfio_register_aer_resume_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
@@ -3092,6 +3180,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 
     vfio_unregister_req_notifier(vdev);
+    vfio_unregister_aer_resume_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
     vfio_disable_interrupts(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 9fb0206..3ebc58f 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
     VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
+    EventNotifier resume_notifier;
     EventNotifier req_notifier;
     int (*resetfn)(struct VFIOPCIDevice *);
     uint32_t vendor_id;
@@ -144,6 +145,7 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_msi;
     bool no_kvm_msix;
     bool single_depend_dev;
+    bool resume_enabled;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 15e096c..6d1826d 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -345,6 +345,7 @@ enum {
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_RESUME_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [patch v6 12/12] vfio: add 'aer' property to expose aercap
  2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (10 preceding siblings ...)
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume Cao jin
@ 2016-04-05 11:42 ` Cao jin
  11 siblings, 0 replies; 26+ messages in thread
From: Cao jin @ 2016-04-05 11:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chen Fan, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Add an 'aer' property that lets the user decide whether to expose
the AER capability.  The AER feature is disabled by default because
it imposes configuration restrictions.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d79fb3d..b78c0fa 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3281,6 +3281,8 @@ static Property vfio_pci_dev_properties[] = {
                        sub_vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                        sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
+                    VFIO_FEATURE_ENABLE_AER_BIT, false),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-05 11:42 ` [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume Cao jin
@ 2016-04-11 21:38   ` Alex Williamson
  2016-04-14  1:02     ` Chen Fan
  0 siblings, 1 reply; 26+ messages in thread
From: Alex Williamson @ 2016-04-11 21:38 UTC (permalink / raw)
  To: Cao jin; +Cc: qemu-devel, mst, izumi.taku, Chen Fan

On Tue, 5 Apr 2016 19:42:02 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> for supporting aer recovery, host and guest would run the same aer
> recovery code, that would do the secondary bus reset if the error
> is fatal, the aer recovery process:
>   1. error_detected
>   2. reset_link (if fatal)
>   3. slot_reset/mmio_enabled
>   4. resume
> 
> it indicates that host will do secondary bus reset to reset
> the physical devices under bus in step 2, that would cause
> devices in D3 status in a short time. but in qemu, we register
> an error detected handler, that would be invoked as host broadcasts
> the error-detected event in step 1, in order to avoid guest do
> reset_link when host do reset_link simultaneously. it may cause
> fatal error. we introduce a resmue notifier to assure host reset
> completely. then do guest aer injection.

Why is it safe to continue running the VM between the error detected
notification and the resume notification?  We're just pushing back the
point at which we inject the AER into the guest, potentially negating
any benefit by allowing the VM to consume bad data.  Shouldn't we
instead be immediately notifying the VM on error detected, but stalling
any access to the device until resume is signaled?  How do we know that
resume will ever be signaled?  We have both the problem that we may be
running on an older kernel that won't support a resume notification and
the problem that seeing a resume notification depends on the host being
able to successfully complete a link reset after fatal error.  We can
detect support for resume notification, but we still need a strategy
for never receiving it.  Thanks,

Alex

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c              | 157 +++++++++++++++++++++++++++++++++++----------
>  hw/vfio/pci.h              |   2 +
>  linux-headers/linux/vfio.h |   1 +
>  3 files changed, 126 insertions(+), 34 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 691ff5e..d79fb3d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2610,12 +2610,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>  static void vfio_err_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> -    PCIDevice *dev = &vdev->pdev;
>      Error *local_err = NULL;
> -    PCIEAERMsg msg = {
> -        .severity = 0,
> -        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> -    };
>  
>      if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>          return;
> @@ -2636,35 +2631,7 @@ static void vfio_err_notifier_handler(void *opaque)
>          goto stop;
>      }
>  
> -    /*
> -     * we should read the error details from the real hardware
> -     * configuration spaces, here we only need to do is signaling
> -     * to guest an uncorrectable error has occurred.
> -     */
> -    if (dev->exp.aer_cap) {
> -        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> -        uint32_t uncor_status;
> -        bool isfatal;
> -
> -        uncor_status = vfio_pci_read_config(dev,
> -                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> -
> -        /*
> -         * if the error is not emitted by this device, we can
> -         * just ignore it.
> -         */
> -        if (!(uncor_status & ~0UL)) {
> -            return;
> -        }
> -
> -        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> -
> -        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> -                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> -
> -        pcie_aer_msg(dev, &msg);
> -        return;
> -    }
> +    return;
>  
>  stop:
>      /*
> @@ -2757,6 +2724,126 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
>      event_notifier_cleanup(&vdev->err_notifier);
>  }
>  
> +static void vfio_resume_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = 0,
> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> +    };
> +
> +    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
> +        return;
> +    }
> +
> +    /*
> +     * we should read the error details from the real hardware
> +     * configuration spaces, here we only need to do is signaling
> +     * to guest an uncorrectable error has occurred.
> +     */
> +    if (dev->exp.aer_cap) {
> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +
> +        /*
> +         * if the error is not emitted by this device, we can
> +         * just ignore it.
> +         */
> +        if (!(uncor_status & ~0UL)) {
> +            return;
> +        }
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +
> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> +
> +        pcie_aer_msg(dev, &msg);
> +    }
> +}
> +
> +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> +        return;
> +    }
> +
> +    if (event_notifier_init(&vdev->resume_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for"
> +                     " resume notification");
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_resume_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up resume notification");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->resume_notifier);
> +    } else {
> +        vdev->resume_enabled = true;
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    if (!vdev->resume_enabled) {
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->resume_notifier);
> +
> +    vdev->resume_enabled = false;
> +}
> +
>  static void vfio_req_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> @@ -3062,6 +3149,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      }
>  
>      vfio_register_err_notifier(vdev);
> +    vfio_register_aer_resume_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
>  
> @@ -3092,6 +3180,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>  
>      vfio_unregister_req_notifier(vdev);
> +    vfio_unregister_aer_resume_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
>      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>      vfio_disable_interrupts(vdev);
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 9fb0206..3ebc58f 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
>      VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
> +    EventNotifier resume_notifier;
>      EventNotifier req_notifier;
>      int (*resetfn)(struct VFIOPCIDevice *);
>      uint32_t vendor_id;
> @@ -144,6 +145,7 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msi;
>      bool no_kvm_msix;
>      bool single_depend_dev;
> +    bool resume_enabled;
>  } VFIOPCIDevice;
>  
>  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 15e096c..6d1826d 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -345,6 +345,7 @@ enum {
>  	VFIO_PCI_MSIX_IRQ_INDEX,
>  	VFIO_PCI_ERR_IRQ_INDEX,
>  	VFIO_PCI_REQ_IRQ_INDEX,
> +	VFIO_PCI_RESUME_IRQ_INDEX,
>  	VFIO_PCI_NUM_IRQS
>  };
>  

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-11 21:38   ` Alex Williamson
@ 2016-04-14  1:02     ` Chen Fan
  2016-04-26  3:39       ` Chen Fan
  0 siblings, 1 reply; 26+ messages in thread
From: Chen Fan @ 2016-04-14  1:02 UTC (permalink / raw)
  To: Alex Williamson, Cao jin; +Cc: qemu-devel, mst, izumi.taku


On 04/12/2016 05:38 AM, Alex Williamson wrote:
> On Tue, 5 Apr 2016 19:42:02 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> for supporting aer recovery, host and guest would run the same aer
>> recovery code, that would do the secondary bus reset if the error
>> is fatal, the aer recovery process:
>>    1. error_detected
>>    2. reset_link (if fatal)
>>    3. slot_reset/mmio_enabled
>>    4. resume
>>
>> it indicates that host will do secondary bus reset to reset
>> the physical devices under bus in step 2, that would cause
>> devices in D3 status in a short time. but in qemu, we register
>> an error detected handler, that would be invoked as host broadcasts
>> the error-detected event in step 1, in order to avoid guest do
>> reset_link when host do reset_link simultaneously. it may cause
>> fatal error. we introduce a resmue notifier to assure host reset
>> completely. then do guest aer injection.
> Why is it safe to continue running the VM between the error detected
> notification and the resume notification?  We're just pushing back the
> point at which we inject the AER into the guest, potentially negating
> any benefit by allowing the VM to consume bad data.  Shouldn't we
> instead be immediately notifying the VM on error detected, but stalling
> any access to the device until resume is signaled?  How do we know that
> resume will ever be signaled?  We have both the problem that we may be
> running on an older kernel that won't support a resume notification and
> the problem that seeing a resume notification depends on the host being
> able to successfully complete a link reset after fatal error.  We can
> detect support for resume notification, but we still need a strategy
> for never receiving it.  Thanks,
That makes sense, but I haven't come up with a good idea yet. Do you have
any ideas, Alex?

Thanks,
Chen


>
> Alex
>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c              | 157 +++++++++++++++++++++++++++++++++++----------
>>   hw/vfio/pci.h              |   2 +
>>   linux-headers/linux/vfio.h |   1 +
>>   3 files changed, 126 insertions(+), 34 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 691ff5e..d79fb3d 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2610,12 +2610,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>   static void vfio_err_notifier_handler(void *opaque)
>>   {
>>       VFIOPCIDevice *vdev = opaque;
>> -    PCIDevice *dev = &vdev->pdev;
>>       Error *local_err = NULL;
>> -    PCIEAERMsg msg = {
>> -        .severity = 0,
>> -        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> -    };
>>   
>>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>           return;
>> @@ -2636,35 +2631,7 @@ static void vfio_err_notifier_handler(void *opaque)
>>           goto stop;
>>       }
>>   
>> -    /*
>> -     * we should read the error details from the real hardware
>> -     * configuration spaces, here we only need to do is signaling
>> -     * to guest an uncorrectable error has occurred.
>> -     */
>> -    if (dev->exp.aer_cap) {
>> -        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> -        uint32_t uncor_status;
>> -        bool isfatal;
>> -
>> -        uncor_status = vfio_pci_read_config(dev,
>> -                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> -
>> -        /*
>> -         * if the error is not emitted by this device, we can
>> -         * just ignore it.
>> -         */
>> -        if (!(uncor_status & ~0UL)) {
>> -            return;
>> -        }
>> -
>> -        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>> -
>> -        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>> -                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>> -
>> -        pcie_aer_msg(dev, &msg);
>> -        return;
>> -    }
>> +    return;
>>   
>>   stop:
>>       /*
>> @@ -2757,6 +2724,126 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
>>       event_notifier_cleanup(&vdev->err_notifier);
>>   }
>>   
>> +static void vfio_resume_notifier_handler(void *opaque)
>> +{
>> +    VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = 0,
>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> +    };
>> +
>> +    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * we should read the error details from the real hardware
>> +     * configuration spaces, here we only need to do is signaling
>> +     * to guest an uncorrectable error has occurred.
>> +     */
>> +    if (dev->exp.aer_cap) {
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +
>> +        /*
>> +         * if the error is not emitted by this device, we can
>> +         * just ignore it.
>> +         */
>> +        if (!(uncor_status & ~0UL)) {
>> +            return;
>> +        }
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>> +
>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>> +
>> +        pcie_aer_msg(dev, &msg);
>> +    }
>> +}
>> +
>> +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
>> +{
>> +    int ret;
>> +    int argsz;
>> +    struct vfio_irq_set *irq_set;
>> +    int32_t *pfd;
>> +
>> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
>> +        return;
>> +    }
>> +
>> +    if (event_notifier_init(&vdev->resume_notifier, 0)) {
>> +        error_report("vfio: Unable to init event notifier for"
>> +                     " resume notification");
>> +        return;
>> +    }
>> +
>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>> +
>> +    irq_set = g_malloc0(argsz);
>> +    irq_set->argsz = argsz;
>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
>> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
>> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
>> +    irq_set->start = 0;
>> +    irq_set->count = 1;
>> +    pfd = (int32_t *)&irq_set->data;
>> +
>> +    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
>> +    qemu_set_fd_handler(*pfd, vfio_resume_notifier_handler, NULL, vdev);
>> +
>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
>> +    if (ret) {
>> +        error_report("vfio: Failed to set up resume notification");
>> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
>> +        event_notifier_cleanup(&vdev->resume_notifier);
>> +    } else {
>> +        vdev->resume_enabled = true;
>> +    }
>> +    g_free(irq_set);
>> +}
>> +
>> +static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
>> +{
>> +    int argsz;
>> +    struct vfio_irq_set *irq_set;
>> +    int32_t *pfd;
>> +    int ret;
>> +
>> +    if (!vdev->resume_enabled) {
>> +        return;
>> +    }
>> +
>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>> +
>> +    irq_set = g_malloc0(argsz);
>> +    irq_set->argsz = argsz;
>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
>> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
>> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
>> +    irq_set->start = 0;
>> +    irq_set->count = 1;
>> +    pfd = (int32_t *)&irq_set->data;
>> +    *pfd = -1;
>> +
>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
>> +    if (ret) {
>> +        error_report("vfio: Failed to de-assign error fd: %m");
>> +    }
>> +    g_free(irq_set);
>> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
>> +                        NULL, NULL, vdev);
>> +    event_notifier_cleanup(&vdev->resume_notifier);
>> +
>> +    vdev->resume_enabled = false;
>> +}
>> +
>>   static void vfio_req_notifier_handler(void *opaque)
>>   {
>>       VFIOPCIDevice *vdev = opaque;
>> @@ -3062,6 +3149,7 @@ static int vfio_initfn(PCIDevice *pdev)
>>       }
>>   
>>       vfio_register_err_notifier(vdev);
>> +    vfio_register_aer_resume_notifier(vdev);
>>       vfio_register_req_notifier(vdev);
>>       vfio_setup_resetfn_quirk(vdev);
>>   
>> @@ -3092,6 +3180,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>   
>>       vfio_unregister_req_notifier(vdev);
>> +    vfio_unregister_aer_resume_notifier(vdev);
>>       vfio_unregister_err_notifier(vdev);
>>       pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>>       vfio_disable_interrupts(vdev);
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 9fb0206..3ebc58f 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
>>       VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
>>       PCIHostDeviceAddress host;
>>       EventNotifier err_notifier;
>> +    EventNotifier resume_notifier;
>>       EventNotifier req_notifier;
>>       int (*resetfn)(struct VFIOPCIDevice *);
>>       uint32_t vendor_id;
>> @@ -144,6 +145,7 @@ typedef struct VFIOPCIDevice {
>>       bool no_kvm_msi;
>>       bool no_kvm_msix;
>>       bool single_depend_dev;
>> +    bool resume_enabled;
>>   } VFIOPCIDevice;
>>   
>>   uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
>> index 15e096c..6d1826d 100644
>> --- a/linux-headers/linux/vfio.h
>> +++ b/linux-headers/linux/vfio.h
>> @@ -345,6 +345,7 @@ enum {
>>   	VFIO_PCI_MSIX_IRQ_INDEX,
>>   	VFIO_PCI_ERR_IRQ_INDEX,
>>   	VFIO_PCI_REQ_IRQ_INDEX,
>> +	VFIO_PCI_RESUME_IRQ_INDEX,
>>   	VFIO_PCI_NUM_IRQS
>>   };
>>   
>
>
> .
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-14  1:02     ` Chen Fan
@ 2016-04-26  3:39       ` Chen Fan
  2016-04-26 14:48         ` Alex Williamson
  0 siblings, 1 reply; 26+ messages in thread
From: Chen Fan @ 2016-04-26  3:39 UTC (permalink / raw)
  To: Alex Williamson, Cao jin; +Cc: izumi.taku, qemu-devel, mst


On 04/14/2016 09:02 AM, Chen Fan wrote:
>
> On 04/12/2016 05:38 AM, Alex Williamson wrote:
>> On Tue, 5 Apr 2016 19:42:02 +0800
>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>
>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>
>>> for supporting aer recovery, host and guest would run the same aer
>>> recovery code, that would do the secondary bus reset if the error
>>> is fatal, the aer recovery process:
>>>    1. error_detected
>>>    2. reset_link (if fatal)
>>>    3. slot_reset/mmio_enabled
>>>    4. resume
>>>
>>> it indicates that host will do secondary bus reset to reset
>>> the physical devices under bus in step 2, that would cause
>>> devices in D3 status in a short time. but in qemu, we register
>>> an error detected handler, that would be invoked as host broadcasts
>>> the error-detected event in step 1, in order to avoid guest do
>>> reset_link when host do reset_link simultaneously. it may cause
>>> fatal error. we introduce a resmue notifier to assure host reset
>>> completely. then do guest aer injection.
>> Why is it safe to continue running the VM between the error detected
>> notification and the resume notification?  We're just pushing back the
>> point at which we inject the AER into the guest, potentially negating
>> any benefit by allowing the VM to consume bad data.  Shouldn't we
>> instead be immediately notifying the VM on error detected, but stalling
>> any access to the device until resume is signaled?  How do we know that
>> resume will ever be signaled?  We have both the problem that we may be
>> running on an older kernel that won't support a resume notification and
>> the problem that seeing a resume notification depends on the host being
>> able to successfully complete a link reset after fatal error. We can
>> detect support for resume notification, but we still need a strategy
>> for never receiving it.  Thanks,
> That's make sense, but I haven't came up with a good idea. do you have
> any idea, Alex?
>
ping...

> Thanks,
> Chen
>
>
>>
>> Alex
>>
>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>> ---
>>>   hw/vfio/pci.c              | 157 
>>> +++++++++++++++++++++++++++++++++++----------
>>>   hw/vfio/pci.h              |   2 +
>>>   linux-headers/linux/vfio.h |   1 +
>>>   3 files changed, 126 insertions(+), 34 deletions(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index 691ff5e..d79fb3d 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -2610,12 +2610,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>>   static void vfio_err_notifier_handler(void *opaque)
>>>   {
>>>       VFIOPCIDevice *vdev = opaque;
>>> -    PCIDevice *dev = &vdev->pdev;
>>>       Error *local_err = NULL;
>>> -    PCIEAERMsg msg = {
>>> -        .severity = 0,
>>> -        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>>> -    };
>>>         if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>           return;
>>> @@ -2636,35 +2631,7 @@ static void vfio_err_notifier_handler(void 
>>> *opaque)
>>>           goto stop;
>>>       }
>>>   -    /*
>>> -     * we should read the error details from the real hardware
>>> -     * configuration spaces, here we only need to do is signaling
>>> -     * to guest an uncorrectable error has occurred.
>>> -     */
>>> -    if (dev->exp.aer_cap) {
>>> -        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>>> -        uint32_t uncor_status;
>>> -        bool isfatal;
>>> -
>>> -        uncor_status = vfio_pci_read_config(dev,
>>> -                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 
>>> 4);
>>> -
>>> -        /*
>>> -         * if the error is not emitted by this device, we can
>>> -         * just ignore it.
>>> -         */
>>> -        if (!(uncor_status & ~0UL)) {
>>> -            return;
>>> -        }
>>> -
>>> -        isfatal = uncor_status & pci_get_long(aer_cap + 
>>> PCI_ERR_UNCOR_SEVER);
>>> -
>>> -        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>> - PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>> -
>>> -        pcie_aer_msg(dev, &msg);
>>> -        return;
>>> -    }
>>> +    return;
>>>     stop:
>>>       /*
>>> @@ -2757,6 +2724,126 @@ static void 
>>> vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
>>>       event_notifier_cleanup(&vdev->err_notifier);
>>>   }
>>>   +static void vfio_resume_notifier_handler(void *opaque)
>>> +{
>>> +    VFIOPCIDevice *vdev = opaque;
>>> +    PCIDevice *dev = &vdev->pdev;
>>> +    PCIEAERMsg msg = {
>>> +        .severity = 0,
>>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>>> +    };
>>> +
>>> +    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * we should read the error details from the real hardware
>>> +     * configuration spaces, here we only need to do is signaling
>>> +     * to guest an uncorrectable error has occurred.
>>> +     */
>>> +    if (dev->exp.aer_cap) {
>>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>>> +        uint32_t uncor_status;
>>> +        bool isfatal;
>>> +
>>> +        uncor_status = vfio_pci_read_config(dev,
>>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 
>>> 4);
>>> +
>>> +        /*
>>> +         * if the error is not emitted by this device, we can
>>> +         * just ignore it.
>>> +         */
>>> +        if (!(uncor_status & ~0UL)) {
>>> +            return;
>>> +        }
>>> +
>>> +        isfatal = uncor_status & pci_get_long(aer_cap + 
>>> PCI_ERR_UNCOR_SEVER);
>>> +
>>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>> + PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>> +
>>> +        pcie_aer_msg(dev, &msg);
>>> +    }
>>> +}
>>> +
>>> +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
>>> +{
>>> +    int ret;
>>> +    int argsz;
>>> +    struct vfio_irq_set *irq_set;
>>> +    int32_t *pfd;
>>> +
>>> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (event_notifier_init(&vdev->resume_notifier, 0)) {
>>> +        error_report("vfio: Unable to init event notifier for"
>>> +                     " resume notification");
>>> +        return;
>>> +    }
>>> +
>>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>>> +
>>> +    irq_set = g_malloc0(argsz);
>>> +    irq_set->argsz = argsz;
>>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
>>> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
>>> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
>>> +    irq_set->start = 0;
>>> +    irq_set->count = 1;
>>> +    pfd = (int32_t *)&irq_set->data;
>>> +
>>> +    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
>>> +    qemu_set_fd_handler(*pfd, vfio_resume_notifier_handler, NULL, 
>>> vdev);
>>> +
>>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
>>> +    if (ret) {
>>> +        error_report("vfio: Failed to set up resume notification");
>>> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
>>> + event_notifier_cleanup(&vdev->resume_notifier);
>>> +    } else {
>>> +        vdev->resume_enabled = true;
>>> +    }
>>> +    g_free(irq_set);
>>> +}
>>> +
>>> +static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
>>> +{
>>> +    int argsz;
>>> +    struct vfio_irq_set *irq_set;
>>> +    int32_t *pfd;
>>> +    int ret;
>>> +
>>> +    if (!vdev->resume_enabled) {
>>> +        return;
>>> +    }
>>> +
>>> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
>>> +
>>> +    irq_set = g_malloc0(argsz);
>>> +    irq_set->argsz = argsz;
>>> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
>>> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
>>> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
>>> +    irq_set->start = 0;
>>> +    irq_set->count = 1;
>>> +    pfd = (int32_t *)&irq_set->data;
>>> +    *pfd = -1;
>>> +
>>> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
>>> +    if (ret) {
>>> +        error_report("vfio: Failed to de-assign error fd: %m");
>>> +    }
>>> +    g_free(irq_set);
>>> + qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
>>> +                        NULL, NULL, vdev);
>>> +    event_notifier_cleanup(&vdev->resume_notifier);
>>> +
>>> +    vdev->resume_enabled = false;
>>> +}
>>> +
>>>   static void vfio_req_notifier_handler(void *opaque)
>>>   {
>>>       VFIOPCIDevice *vdev = opaque;
>>> @@ -3062,6 +3149,7 @@ static int vfio_initfn(PCIDevice *pdev)
>>>       }
>>>         vfio_register_err_notifier(vdev);
>>> +    vfio_register_aer_resume_notifier(vdev);
>>>       vfio_register_req_notifier(vdev);
>>>       vfio_setup_resetfn_quirk(vdev);
>>>   @@ -3092,6 +3180,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>>>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>         vfio_unregister_req_notifier(vdev);
>>> +    vfio_unregister_aer_resume_notifier(vdev);
>>>       vfio_unregister_err_notifier(vdev);
>>>       pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>>>       vfio_disable_interrupts(vdev);
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index 9fb0206..3ebc58f 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
>>>       VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
>>>       PCIHostDeviceAddress host;
>>>       EventNotifier err_notifier;
>>> +    EventNotifier resume_notifier;
>>>       EventNotifier req_notifier;
>>>       int (*resetfn)(struct VFIOPCIDevice *);
>>>       uint32_t vendor_id;
>>> @@ -144,6 +145,7 @@ typedef struct VFIOPCIDevice {
>>>       bool no_kvm_msi;
>>>       bool no_kvm_msix;
>>>       bool single_depend_dev;
>>> +    bool resume_enabled;
>>>   } VFIOPCIDevice;
>>>     uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, 
>>> int len);
>>> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
>>> index 15e096c..6d1826d 100644
>>> --- a/linux-headers/linux/vfio.h
>>> +++ b/linux-headers/linux/vfio.h
>>> @@ -345,6 +345,7 @@ enum {
>>>       VFIO_PCI_MSIX_IRQ_INDEX,
>>>       VFIO_PCI_ERR_IRQ_INDEX,
>>>       VFIO_PCI_REQ_IRQ_INDEX,
>>> +    VFIO_PCI_RESUME_IRQ_INDEX,
>>>       VFIO_PCI_NUM_IRQS
>>>   };
>>
>>
>> .
>>
>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-26  3:39       ` Chen Fan
@ 2016-04-26 14:48         ` Alex Williamson
  2016-05-06  1:38           ` Chen Fan
  2016-05-24 10:49           ` Michael S. Tsirkin
  0 siblings, 2 replies; 26+ messages in thread
From: Alex Williamson @ 2016-04-26 14:48 UTC (permalink / raw)
  To: Chen Fan; +Cc: Cao jin, izumi.taku, qemu-devel, mst

On Tue, 26 Apr 2016 11:39:02 +0800
Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:

> On 04/14/2016 09:02 AM, Chen Fan wrote:
> >
> > On 04/12/2016 05:38 AM, Alex Williamson wrote:  
> >> On Tue, 5 Apr 2016 19:42:02 +0800
> >> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >>  
> >>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>
> >>> for supporting aer recovery, host and guest would run the same aer
> >>> recovery code, that would do the secondary bus reset if the error
> >>> is fatal, the aer recovery process:
> >>>    1. error_detected
> >>>    2. reset_link (if fatal)
> >>>    3. slot_reset/mmio_enabled
> >>>    4. resume
> >>>
> >>> it indicates that host will do secondary bus reset to reset
> >>> the physical devices under bus in step 2, that would cause
> >>> devices in D3 status in a short time. but in qemu, we register
> >>> an error detected handler, that would be invoked as host broadcasts
> >>> the error-detected event in step 1, in order to avoid guest do
> >>> reset_link when host do reset_link simultaneously. it may cause
> >>> fatal error. we introduce a resmue notifier to assure host reset
> >>> completely. then do guest aer injection.  
> >> Why is it safe to continue running the VM between the error detected
> >> notification and the resume notification?  We're just pushing back the
> >> point at which we inject the AER into the guest, potentially negating
> >> any benefit by allowing the VM to consume bad data.  Shouldn't we
> >> instead be immediately notifying the VM on error detected, but stalling
> >> any access to the device until resume is signaled?  How do we know that
> >> resume will ever be signaled?  We have both the problem that we may be
> >> running on an older kernel that won't support a resume notification and
> >> the problem that seeing a resume notification depends on the host being
> >> able to successfully complete a link reset after fatal error. We can
> >> detect support for resume notification, but we still need a strategy
> >> for never receiving it.  Thanks,  
> > That's make sense, but I haven't came up with a good idea. do you have
> > any idea, Alex?

I don't know that there are any good solutions here.  We need to
respond to the current error notifier interrupt and not regress from
our support there.  I think that means that if we want to switch from a
simple halt-on-error to a mechanism for the guest to handle recovery,
we need to disable access to the device between being notified that the
error occurred and being notified to resume.  We can do that by
disabling mmaps to the device and preventing access via the slow path
handlers.  I don't know what the best solution is for preventing access:
do we block and pause the VM, or do we drop writes and return -1 for
reads?  That's something that needs to be determined.  We also need to
inject the AER into the VM at the point we're notified of an error
because the VM needs to know as soon as possible to stop using the
device or trusting any data from it.  The next coordination point would
be something like the resume notifier that you've added and there are
numerous questions around the interaction of that with the guest
handling.  Clearly we can't do a guest directed bus reset until we get
the resume notifier, so do we block that execution path in QEMU until
the resume notification is received?  What happens if we don't get that
notification?  Is there any way that we can rely on the host having
done a bus reset to the point where we don't need to act on the guest
directed reset?  These are all things that need to be figured out.
Thanks,

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-26 14:48         ` Alex Williamson
@ 2016-05-06  1:38           ` Chen Fan
  2016-05-06 16:39             ` Alex Williamson
  2016-05-24 10:49           ` Michael S. Tsirkin
  1 sibling, 1 reply; 26+ messages in thread
From: Chen Fan @ 2016-05-06  1:38 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Cao jin, izumi.taku, qemu-devel, mst, zhoujie2011


On 04/26/2016 10:48 PM, Alex Williamson wrote:
> On Tue, 26 Apr 2016 11:39:02 +0800
> Chen Fan<chen.fan.fnst@cn.fujitsu.com>  wrote:
>
>> On 04/14/2016 09:02 AM, Chen Fan wrote:
>>> On 04/12/2016 05:38 AM, Alex Williamson wrote:
>>>> On Tue, 5 Apr 2016 19:42:02 +0800
>>>> Cao jin<caoj.fnst@cn.fujitsu.com>  wrote:
>>>>   
>>>>> From: Chen Fan<chen.fan.fnst@cn.fujitsu.com>
>>>>>
>>>>> for supporting aer recovery, host and guest would run the same aer
>>>>> recovery code, that would do the secondary bus reset if the error
>>>>> is fatal, the aer recovery process:
>>>>>     1. error_detected
>>>>>     2. reset_link (if fatal)
>>>>>     3. slot_reset/mmio_enabled
>>>>>     4. resume
>>>>>
>>>>> it indicates that host will do secondary bus reset to reset
>>>>> the physical devices under bus in step 2, that would cause
>>>>> devices in D3 status in a short time. but in qemu, we register
>>>>> an error detected handler, that would be invoked as host broadcasts
>>>>> the error-detected event in step 1, in order to avoid guest do
>>>>> reset_link when host do reset_link simultaneously. it may cause
>>>>> fatal error. we introduce a resmue notifier to assure host reset
>>>>> completely. then do guest aer injection.
>>>> Why is it safe to continue running the VM between the error detected
>>>> notification and the resume notification?  We're just pushing back the
>>>> point at which we inject the AER into the guest, potentially negating
>>>> any benefit by allowing the VM to consume bad data.  Shouldn't we
>>>> instead be immediately notifying the VM on error detected, but stalling
>>>> any access to the device until resume is signaled?  How do we know that
>>>> resume will ever be signaled?  We have both the problem that we may be
>>>> running on an older kernel that won't support a resume notification and
>>>> the problem that seeing a resume notification depends on the host being
>>>> able to successfully complete a link reset after fatal error. We can
>>>> detect support for resume notification, but we still need a strategy
>>>> for never receiving it.  Thanks,
>>> That's make sense, but I haven't came up with a good idea. do you have
>>> any idea, Alex?
> I don't know that there are any good solutions here.  We need to
> respond to the current error notifier interrupt and not regress from
> our support there.  I think that means that if we want to switch from a
> simple halt-on-error to a mechanism for the guest to handle recovery,
> we need to disable access to the device between being notified that the
> error occurred and being notified to resume.  We can do that by
> disabling mmaps to the device and preventing access via the slow path
> handlers.  I don't know what the best solution is for preventing access,
> do we block and pause the VM or do we drop writes and return -1 for
> reads, that's something that needs to be determined.  We also need to
> inject the AER into the VM at the point we're notified of an error
> because the VM needs to know as soon as possible to stop using the
> device or trusting any data from it.  The next coordination point would
> be something like the resume notifier that you've added and there are
> numerous questions around the interaction of that with the guest
> handling.  Clearly we can't do a guest directed bus reset until we get
> the resume notifier, so do we block that execution path in QEMU until
> the resume notification is received?  What happens if we don't get that
> notification?  Is there any way that we can rely on the host having
> done a bus reset to the point where we don't need to act on the guest
> directed reset?  These are all things that need to be figured out.
> Thanks,
Maybe we can simply pause the vCPUs to prevent the VM from accessing the
device, and add two flags to VFIO_DEVICE_GET_INFO so that QEMU can query
whether the vfio-pci driver has a resume notifier. If the resume notifier
flags are not present, we can simply refuse to start the VM when aer is
enabled. Otherwise, we wait for the resume notification to arrive before
restarting the vCPUs. As for the problem of the bus reset being duplicated
by host and guest, I think QEMU can decide whether a bus reset is needed
for the guest according to whether the error is fatal or non-fatal. I
think that is not critical and could be resolved later.

Thanks,
Chen


> Alex
>
>
> .
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-06  1:38           ` Chen Fan
@ 2016-05-06 16:39             ` Alex Williamson
  2016-05-11  3:11               ` Zhou Jie
  0 siblings, 1 reply; 26+ messages in thread
From: Alex Williamson @ 2016-05-06 16:39 UTC (permalink / raw)
  To: Chen Fan; +Cc: Cao jin, izumi.taku, qemu-devel, mst, zhoujie2011

On Fri, 6 May 2016 09:38:41 +0800
Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:

> On 04/26/2016 10:48 PM, Alex Williamson wrote:
> > On Tue, 26 Apr 2016 11:39:02 +0800
> > Chen Fan<chen.fan.fnst@cn.fujitsu.com>  wrote:
> >  
> >> On 04/14/2016 09:02 AM, Chen Fan wrote:  
> >>> On 04/12/2016 05:38 AM, Alex Williamson wrote:  
> >>>> On Tue, 5 Apr 2016 19:42:02 +0800
> >>>> Cao jin<caoj.fnst@cn.fujitsu.com>  wrote:
> >>>>     
> >>>>> From: Chen Fan<chen.fan.fnst@cn.fujitsu.com>
> >>>>>
> >>>>> for supporting aer recovery, host and guest would run the same aer
> >>>>> recovery code, that would do the secondary bus reset if the error
> >>>>> is fatal, the aer recovery process:
> >>>>>     1. error_detected
> >>>>>     2. reset_link (if fatal)
> >>>>>     3. slot_reset/mmio_enabled
> >>>>>     4. resume
> >>>>>
> >>>>> it indicates that host will do secondary bus reset to reset
> >>>>> the physical devices under bus in step 2, that would cause
> >>>>> devices in D3 status in a short time. but in qemu, we register
> >>>>> an error detected handler, that would be invoked as host broadcasts
> >>>>> the error-detected event in step 1, in order to avoid guest do
> >>>>> reset_link when host do reset_link simultaneously. it may cause
> >>>>> fatal error. we introduce a resmue notifier to assure host reset
> >>>>> completely. then do guest aer injection.  
> >>>> Why is it safe to continue running the VM between the error detected
> >>>> notification and the resume notification?  We're just pushing back the
> >>>> point at which we inject the AER into the guest, potentially negating
> >>>> any benefit by allowing the VM to consume bad data.  Shouldn't we
> >>>> instead be immediately notifying the VM on error detected, but stalling
> >>>> any access to the device until resume is signaled?  How do we know that
> >>>> resume will ever be signaled?  We have both the problem that we may be
> >>>> running on an older kernel that won't support a resume notification and
> >>>> the problem that seeing a resume notification depends on the host being
> >>>> able to successfully complete a link reset after fatal error. We can
> >>>> detect support for resume notification, but we still need a strategy
> >>>> for never receiving it.  Thanks,  
> >>> That's make sense, but I haven't came up with a good idea. do you have
> >>> any idea, Alex?  
> > I don't know that there are any good solutions here.  We need to
> > respond to the current error notifier interrupt and not regress from
> > our support there.  I think that means that if we want to switch from a
> > simple halt-on-error to a mechanism for the guest to handle recovery,
> > we need to disable access to the device between being notified that the
> > error occurred and being notified to resume.  We can do that by
> > disabling mmaps to the device and preventing access via the slow path
> > handlers.  I don't know what the best solution is for preventing access,
> > do we block and pause the VM or do we drop writes and return -1 for
> > reads, that's something that needs to be determined.  We also need to
> > inject the AER into the VM at the point we're notified of an error
> > because the VM needs to know as soon as possible to stop using the
> > device or trusting any data from it.  The next coordination point would
> > be something like the resume notifier that you've added and there are
> > numerous questions around the interaction of that with the guest
> > handling.  Clearly we can't do a guest directed bus reset until we get
> > the resume notifier, so do we block that execution path in QEMU until
> > the resume notification is received?  What happens if we don't get that
> > notification?  Is there any way that we can rely on the host having
> > done a bus reset to the point where we don't need to act on the guest
> > directed reset?  These are all things that need to be figured out.
> > Thanks,  
> Maybe we can simply pause the vcpu running and avoid the VM to
> access the device. and add two flags in VFIO_DEVICE_GET_INFO to query
> whether the vfio pci driver has a resume notifier,
> if it does not have resume notifier flags, we can directly fail to boot 
> up VM
> as with aer enabled.

We can already tell if a resume interrupt is supported between the IRQ
count in vfio_device_info and a probe with vfio_irq_info, what would
additional flags in vfio_device_info tell us beyond a resume interrupt
being supported?  Is pausing the VM acceptable from a service guarantee
perspective to users?  A bus reset can take a full second and I imagine
deeper PCI hierarchies can push that out depending on what level the
error occurs.  A second of downtime may be enough to trigger failovers
to other systems.  If we were to disable mmaps when a fault occurs, we
could trap any further device access, drop writes, return -1 for
reads.  This seems reasonable since we've already notified the VM that
the device had a fault.  The synchronization point seems like when the
guest tries to do a bus reset, we need to block that until we get the
resume notification from the host.  Perhaps if that doesn't occur after
some timeout, we would abort the guest directed bus reset altogether
and make the device disappear, perhaps even initiating an unplug of the
device to prevent it from further interacting with the VM.

> otherwise, we should wait for resume notifier coming to
> restart the cpu. about the problem of the reduplicated bus reset by host 
> and guest,
> I think qemu can according to the error is fatal or non-fatal to decide 
> whether need
> to do a bus reset on guest, I think it's not critical and could be 
> resolved later.

The vfio error interrupt doesn't signal non-fatal errors afaik.  I'm
also not sure we have a guarantee that the host has performed a bus
reset; we shouldn't necessarily design the API that strictly around the
current behavior of the Linux AER handler.  So I don't know that
there's any practical way to avoid duplicate bus resets between host
and guest recovery.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-06 16:39             ` Alex Williamson
@ 2016-05-11  3:11               ` Zhou Jie
  2016-05-11 20:20                 ` Alex Williamson
  0 siblings, 1 reply; 26+ messages in thread
From: Zhou Jie @ 2016-05-11  3:11 UTC (permalink / raw)
  To: Alex Williamson, Chen Fan; +Cc: Cao jin, izumi.taku, qemu-devel, mst

Hi, Alex
     What do you think about the following solution?
     1. Detect support for resume notification.
        If the host vfio driver does not have the resume notifier flags,
        directly fail to boot the VM when aer is enabled.
     2. Immediately notify the VM on error detected.
     3. Stall any access to the device until resume is signaled.
        Disable mmaps, drop writes, return -1 for reads.
     4. Delay the guest directed bus reset.
        Don't reset bus in vfio_pci_reset function.
     5. Wait for resume notification.
        If we don't get the resume notification from the host after
        some timeout, we would abort the guest directed bus reset
        altogether and make the device disappear,
        Initiating an unplug of the device to prevent it from further
        interacting with the VM.
     6. After getting the resume notification,
        reset the bus.
        This is the second bus reset, because the host did a bus reset already.
        But as you said, we shouldn't necessarily design the API that
        strictly around the current behavior of the Linux AER handler.

Sincerely,
Zhou Jie

On 2016/5/7 0:39, Alex Williamson wrote:
> On Fri, 6 May 2016 09:38:41 +0800
> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>
>> On 04/26/2016 10:48 PM, Alex Williamson wrote:
>>> On Tue, 26 Apr 2016 11:39:02 +0800
>>> Chen Fan<chen.fan.fnst@cn.fujitsu.com>  wrote:
>>>
>>>> On 04/14/2016 09:02 AM, Chen Fan wrote:
>>>>> On 04/12/2016 05:38 AM, Alex Williamson wrote:
>>>>>> On Tue, 5 Apr 2016 19:42:02 +0800
>>>>>> Cao jin<caoj.fnst@cn.fujitsu.com>  wrote:
>>>>>>
>>>>>>> From: Chen Fan<chen.fan.fnst@cn.fujitsu.com>
>>>>>>>
>>>>>>> for supporting aer recovery, host and guest would run the same aer
>>>>>>> recovery code, that would do the secondary bus reset if the error
>>>>>>> is fatal, the aer recovery process:
>>>>>>>     1. error_detected
>>>>>>>     2. reset_link (if fatal)
>>>>>>>     3. slot_reset/mmio_enabled
>>>>>>>     4. resume
>>>>>>>
>>>>>>> it indicates that host will do secondary bus reset to reset
>>>>>>> the physical devices under bus in step 2, that would cause
>>>>>>> devices in D3 status in a short time. but in qemu, we register
>>>>>>> an error detected handler, that would be invoked as host broadcasts
>>>>>>> the error-detected event in step 1, in order to avoid guest do
>>>>>>> reset_link when host do reset_link simultaneously. it may cause
>>>>>>> fatal error. we introduce a resmue notifier to assure host reset
>>>>>>> completely. then do guest aer injection.
>>>>>> Why is it safe to continue running the VM between the error detected
>>>>>> notification and the resume notification?  We're just pushing back the
>>>>>> point at which we inject the AER into the guest, potentially negating
>>>>>> any benefit by allowing the VM to consume bad data.  Shouldn't we
>>>>>> instead be immediately notifying the VM on error detected, but stalling
>>>>>> any access to the device until resume is signaled?  How do we know that
>>>>>> resume will ever be signaled?  We have both the problem that we may be
>>>>>> running on an older kernel that won't support a resume notification and
>>>>>> the problem that seeing a resume notification depends on the host being
>>>>>> able to successfully complete a link reset after fatal error. We can
>>>>>> detect support for resume notification, but we still need a strategy
>>>>>> for never receiving it.  Thanks,
>>>>> That's make sense, but I haven't came up with a good idea. do you have
>>>>> any idea, Alex?
>>> I don't know that there are any good solutions here.  We need to
>>> respond to the current error notifier interrupt and not regress from
>>> our support there.  I think that means that if we want to switch from a
>>> simple halt-on-error to a mechanism for the guest to handle recovery,
>>> we need to disable access to the device between being notified that the
>>> error occurred and being notified to resume.  We can do that by
>>> disabling mmaps to the device and preventing access via the slow path
>>> handlers.  I don't know what the best solution is for preventing access,
>>> do we block and pause the VM or do we drop writes and return -1 for
>>> reads, that's something that needs to be determined.  We also need to
>>> inject the AER into the VM at the point we're notified of an error
>>> because the VM needs to know as soon as possible to stop using the
>>> device or trusting any data from it.  The next coordination point would
>>> be something like the resume notifier that you've added and there are
>>> numerous questions around the interaction of that with the guest
>>> handling.  Clearly we can't do a guest directed bus reset until we get
>>> the resume notifier, so do we block that execution path in QEMU until
>>> the resume notification is received?  What happens if we don't get that
>>> notification?  Is there any way that we can rely on the host having
>>> done a bus reset to the point where we don't need to act on the guest
>>> directed reset?  These are all things that need to be figured out.
>>> Thanks,
>> Maybe we can simply pause the vcpu running and avoid the VM to
>> access the device. and add two flags in VFIO_DEVICE_GET_INFO to query
>> whether the vfio pci driver has a resume notifier,
>> if it does not have resume notifier flags, we can directly fail to boot
>> up VM
>> as with aer enabled.
>
> We can already tell if a resume interrupt is supported between the IRQ
> count in vfio_device_info and a probe with vfio_irq_info, what would
> additional flags in vfio_device_info tell us beyond a resume interrupt
> being supported?  Is pausing the VM acceptable from a service guarantee
> perspective to users?  A bus reset can take a full second and I imagine
> deeper PCI hierarchies can push that out depending on what level the
> error occurs.  A second of downtime may be enough to trigger failovers
> to other systems.  If we were to disable mmaps when a fault occurs, we
> could trap any further device access, drop writes, return -1 for
> reads.  This seems reasonable since we've already notified the VM that
> the device had a fault.  The synchronization point seems like when the
> guest tries to do a bus reset, we need to block that until we get the
> resume notification from the host.  Perhaps if that doesn't occur after
> some timeout, we would abort the guest directed bus reset altogether
> and make the device disappear, perhaps even initiating an unplug of the
> device to prevent it from further interacting with the VM.
>
>> otherwise, we should wait for resume notifier coming to
>> restart the cpu. about the problem of the reduplicated bus reset by host
>> and guest,
>> I think qemu can according to the error is fatal or non-fatal to decide
>> whether need
>> to do a bus reset on guest, I think it's not critical and could be
>> resolved later.
>
> The vfio error interrupt doesn't signal non-fatal errors afaik.  I'm
> also not sure we have an guarantee that the host has performed a bus
> reset, we shouldn't necessarily design the API that strictly around the
> current behavior of the Linux AER handler.  So I don't know that
> there's any practical way to avoid duplicate bus resets between host
> and guest recovery.  Thanks,
>
> Alex
>
>
> .
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-11  3:11               ` Zhou Jie
@ 2016-05-11 20:20                 ` Alex Williamson
  0 siblings, 0 replies; 26+ messages in thread
From: Alex Williamson @ 2016-05-11 20:20 UTC (permalink / raw)
  To: Zhou Jie; +Cc: Chen Fan, izumi.taku, Cao jin, qemu-devel, mst

On Wed, 11 May 2016 11:11:39 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi, Alex
>      What do you think about the following solution?
>      1. Detect support for resume notification.
>         If host vfio driver does not have resume notifier flags,
>         Directly fail to boot up VM as with aer enabled.

It's not a flag used to detect the resume notifier, but simply probing
IRQ_INFO for the index allocated for this notification.

>      2. Immediately notify the VM on error detected.
>      3. Stall any access to the device until resume is signaled.
>         Disable mmaps, drop writes, return -1 for reads.
>      4. Delay the guest directed bus reset.
>         Don't reset bus in vfio_pci_reset function.
>      5. Wait for resume notification.
>         If we don't get the resume notification from the host after
>         some timeout, we would abort the guest directed bus reset
>         altogether and make the device disappear,
>         Initiating an unplug of the device to prevent it from further
>         interacting with the VM.
>      6. After get the resume notification.
>         Reset bus.
>         It the second bus reset. Because the host did bus reset already.
>         But as you said we shouldn't necessarily design the API that
>         strictly around the current behavior of the Linux AER handler.

Otherwise it sounds like what I had proposed.  Thanks,

Alex

> On 2016/5/7 0:39, Alex Williamson wrote:
> > On Fri, 6 May 2016 09:38:41 +0800
> > Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
> >  
> >> On 04/26/2016 10:48 PM, Alex Williamson wrote:  
> >>> On Tue, 26 Apr 2016 11:39:02 +0800
> >>> Chen Fan<chen.fan.fnst@cn.fujitsu.com>  wrote:
> >>>  
> >>>> On 04/14/2016 09:02 AM, Chen Fan wrote:  
> >>>>> On 04/12/2016 05:38 AM, Alex Williamson wrote:  
> >>>>>> On Tue, 5 Apr 2016 19:42:02 +0800
> >>>>>> Cao jin<caoj.fnst@cn.fujitsu.com>  wrote:
> >>>>>>  
> >>>>>>> From: Chen Fan<chen.fan.fnst@cn.fujitsu.com>
> >>>>>>>
> >>>>>>> for supporting aer recovery, host and guest would run the same aer
> >>>>>>> recovery code, that would do the secondary bus reset if the error
> >>>>>>> is fatal, the aer recovery process:
> >>>>>>>     1. error_detected
> >>>>>>>     2. reset_link (if fatal)
> >>>>>>>     3. slot_reset/mmio_enabled
> >>>>>>>     4. resume
> >>>>>>>
> >>>>>>> it indicates that host will do secondary bus reset to reset
> >>>>>>> the physical devices under bus in step 2, that would cause
> >>>>>>> devices in D3 status in a short time. but in qemu, we register
> >>>>>>> an error detected handler, that would be invoked as host broadcasts
> >>>>>>> the error-detected event in step 1, in order to avoid guest do
> >>>>>>> reset_link when host do reset_link simultaneously. it may cause
> >>>>>>> fatal error. we introduce a resmue notifier to assure host reset
> >>>>>>> completely. then do guest aer injection.  
> >>>>>> Why is it safe to continue running the VM between the error detected
> >>>>>> notification and the resume notification?  We're just pushing back the
> >>>>>> point at which we inject the AER into the guest, potentially negating
> >>>>>> any benefit by allowing the VM to consume bad data.  Shouldn't we
> >>>>>> instead be immediately notifying the VM on error detected, but stalling
> >>>>>> any access to the device until resume is signaled?  How do we know that
> >>>>>> resume will ever be signaled?  We have both the problem that we may be
> >>>>>> running on an older kernel that won't support a resume notification and
> >>>>>> the problem that seeing a resume notification depends on the host being
> >>>>>> able to successfully complete a link reset after fatal error. We can
> >>>>>> detect support for resume notification, but we still need a strategy
> >>>>>> for never receiving it.  Thanks,  
> >>>>> That's make sense, but I haven't came up with a good idea. do you have
> >>>>> any idea, Alex?  
> >>> I don't know that there are any good solutions here.  We need to
> >>> respond to the current error notifier interrupt and not regress from
> >>> our support there.  I think that means that if we want to switch from a
> >>> simple halt-on-error to a mechanism for the guest to handle recovery,
> >>> we need to disable access to the device between being notified that the
> >>> error occurred and being notified to resume.  We can do that by
> >>> disabling mmaps to the device and preventing access via the slow path
> >>> handlers.  I don't know what the best solution is for preventing access,
> >>> do we block and pause the VM or do we drop writes and return -1 for
> >>> reads, that's something that needs to be determined.  We also need to
> >>> inject the AER into the VM at the point we're notified of an error
> >>> because the VM needs to know as soon as possible to stop using the
> >>> device or trusting any data from it.  The next coordination point would
> >>> be something like the resume notifier that you've added and there are
> >>> numerous questions around the interaction of that with the guest
> >>> handling.  Clearly we can't do a guest directed bus reset until we get
> >>> the resume notifier, so do we block that execution path in QEMU until
> >>> the resume notification is received?  What happens if we don't get that
> >>> notification?  Is there any way that we can rely on the host having
> >>> done a bus reset to the point where we don't need to act on the guest
> >>> directed reset?  These are all things that need to be figured out.
> >>> Thanks,  
> >> Maybe we can simply pause the vcpu running and avoid the VM to
> >> access the device. and add two flags in VFIO_DEVICE_GET_INFO to query
> >> whether the vfio pci driver has a resume notifier,
> >> if it does not have resume notifier flags, we can directly fail to boot
> >> up VM
> >> as with aer enabled.  
> >
> > We can already tell if a resume interrupt is supported between the IRQ
> > count in vfio_device_info and a probe with vfio_irq_info, what would
> > additional flags in vfio_device_info tell us beyond a resume interrupt
> > being supported?  Is pausing the VM acceptable from a service guarantee
> > perspective to users?  A bus reset can take a full second and I imagine
> > deeper PCI hierarchies can push that out depending on what level the
> > error occurs.  A second of downtime may be enough to trigger failovers
> > to other systems.  If we were to disable mmaps when a fault occurs, we
> > could trap any further device access, drop writes, return -1 for
> > reads.  This seems reasonable since we've already notified the VM that
> > the device had a fault.  The synchronization point seems like when the
> > guest tries to do a bus reset, we need to block that until we get the
> > resume notification from the host.  Perhaps if that doesn't occur after
> > some timeout, we would abort the guest directed bus reset altogether
> > and make the device disappear, perhaps even initiating an unplug of the
> > device to prevent it from further interacting with the VM.
> >  
> >> otherwise, we should wait for resume notifier coming to
> >> restart the cpu. about the problem of the reduplicated bus reset by host
> >> and guest,
> >> I think qemu can according to the error is fatal or non-fatal to decide
> >> whether need
> >> to do a bus reset on guest, I think it's not critical and could be
> >> resolved later.  
> >
> > The vfio error interrupt doesn't signal non-fatal errors afaik.  I'm
> > also not sure we have an guarantee that the host has performed a bus
> > reset, we shouldn't necessarily design the API that strictly around the
> > current behavior of the Linux AER handler.  So I don't know that
> > there's any practical way to avoid duplicate bus resets between host
> > and guest recovery.  Thanks,
> >
> > Alex
> >
> >
> > .
> >  
> 
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-04-26 14:48         ` Alex Williamson
  2016-05-06  1:38           ` Chen Fan
@ 2016-05-24 10:49           ` Michael S. Tsirkin
  2016-05-25  1:08             ` Zhou Jie
  2016-05-25  2:54             ` Alex Williamson
  1 sibling, 2 replies; 26+ messages in thread
From: Michael S. Tsirkin @ 2016-05-24 10:49 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, Cao jin, izumi.taku, qemu-devel

On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:
> I think that means that if we want to switch from a
> simple halt-on-error to a mechanism for the guest to handle recovery,
> we need to disable access to the device between being notified that the
> error occurred and being notified to resume.

But this isn't what happens on bare metal.
Errors are reported asynchronously and host might access the device
meanwhile.  These accesses might or might not trigger more errors, but
fundamentally this should not matter too much as device is going to be
reset.

-- 
MST

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-24 10:49           ` Michael S. Tsirkin
@ 2016-05-25  1:08             ` Zhou Jie
  2016-05-25  2:54             ` Alex Williamson
  1 sibling, 0 replies; 26+ messages in thread
From: Zhou Jie @ 2016-05-25  1:08 UTC (permalink / raw)
  To: Michael S. Tsirkin, Alex Williamson
  Cc: fan.chen, izumi.taku, Cao jin, qemu-devel

So I needn't disable access to the device between being notified that 
the error occurred and being notified to resume.
This will make code simpler.

Am I right?

On 2016/5/24 18:49, Michael S. Tsirkin wrote:
> On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:
>> I think that means that if we want to switch from a
>> simple halt-on-error to a mechanism for the guest to handle recovery,
>> we need to disable access to the device between being notified that the
>> error occurred and being notified to resume.
>
> But this isn't what happens on bare metal.
> Errors are reported asynchronously and host might access the device
> meanwhile.  These accesses might or might not trigger more errors, but
> fundamentally this should not matter too much as device is going to be
> reset.
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-24 10:49           ` Michael S. Tsirkin
  2016-05-25  1:08             ` Zhou Jie
@ 2016-05-25  2:54             ` Alex Williamson
  2016-05-25  8:45               ` Michael S. Tsirkin
  1 sibling, 1 reply; 26+ messages in thread
From: Alex Williamson @ 2016-05-25  2:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Chen Fan, Cao jin, izumi.taku, qemu-devel

On Tue, 24 May 2016 13:49:12 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:
> > I think that means that if we want to switch from a
> > simple halt-on-error to a mechanism for the guest to handle recovery,
> > we need to disable access to the device between being notified that the
> > error occurred and being notified to resume.  
> 
> But this isn't what happens on bare metal.
> Errors are reported asynchronously and host might access the device
> meanwhile.  These accesses might or might not trigger more errors, but
> fundamentally this should not matter too much as device is going to be
> reset.

Bare metal also doesn't have a hypervisor underneath performing a PCI
bus reset, there's only one OS trying to control the device at a time,
so we have some clear differences from bare metal that I don't know we
can avoid.  The thought here was that we need to notify the guest at the
earliest point we can, but let the host recovery run to completion
before allowing the user to interact with the device.  Perhaps there is
no need to block region access to the device (ie. config space & BAR
resources), but I think we do need to somehow synchronize the bus resets
or else we get situations like that observed previously where the bus is
still in reset while userspace tries to proceed with using it.

The next question then would be whether that's QEMU's job or something
that should be done in the host kernel.  It's been proposed to add yet
another eventfd for the kernel vfio-pci to signal QEMU when a resume
notification has occurred, but perhaps the better approach would be for
the hot reset ioctl (and base reset ioctl) to handle this situation more
transparently.  We could immediately return -EAGAIN and allow QEMU to
delay itself for any reset ioctl received after the AER error detected
event, but before the resume event.  We could also allow some sort of
timeout, that the ioctl might enter an interruptible sleep, woken on
the resume notification or timeout.  That sounds a bit better to me as
the specification of what's allowed between the error detected
notification and the resume notification is otherwise pretty poorly
defined.  Do you think we can run completely asynchronous, letting the
host and guest bus resets race?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-25  2:54             ` Alex Williamson
@ 2016-05-25  8:45               ` Michael S. Tsirkin
  2016-05-25 14:22                 ` Alex Williamson
  0 siblings, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2016-05-25  8:45 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, Cao jin, izumi.taku, qemu-devel

On Tue, May 24, 2016 at 08:54:06PM -0600, Alex Williamson wrote:
> On Tue, 24 May 2016 13:49:12 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:
> > > I think that means that if we want to switch from a
> > > simple halt-on-error to a mechanism for the guest to handle recovery,
> > > we need to disable access to the device between being notified that the
> > > error occurred and being notified to resume.  
> > 
> > But this isn't what happens on bare metal.
> > Errors are reported asynchronously and host might access the device
> > meanwhile.  These accesses might or might not trigger more errors, but
> > fundamentally this should not matter too much as device is going to be
> > reset.
> 
> Bare metal also doesn't have a hypervisor underneath performing a PCI
> bus reset,

This is where I get lost. I assumed we do reset when guest
requests it. Isn't that the case? Why not?

> there's only one OS trying to control the device at a time,
> so we have some clear differences from bare metal that I don't know we
> can avoid.  The thought here was that we need to notify the guest at the
> earliest point we can, but let the host recovery run to completion
> before allowing the user to interact with the device.  Perhaps there is
> no need to block region access to the device (ie. config space & BAR
> resources), but I think we do need to somehow synchronize the bus resets
> or else we get situations like that observed previously where the bus is
> still in reset while userspace trys to proceed with using it.
>

Why do we have to trigger reset upon an error?
Why not wait for guest to request reset?

> The next question then would be whether that's QEMU's job or something
> that should be done in the host kernel.  It's been proposed to add yet
> another eventfd for the kernel vfio-pci to signal QEMU when a resume
> notification has occured, but perhaps the better approach would be for
> the hot reset ioctl (and base reset ioctl) to handle this situation more
> transparently.  We could immediately return -EAGAIN and allow QEMU to
> delay itself for any reset ioctl received after the AER error detected
> event, but before the resume event.  We could also allow some sort of
> timeout, that the ioctl might enter an interruptible sleep, woken on
> the resume notification or timeout.  That sounds a bit better to me as
> the specification of what's allowed between the error detected
> notification and the resume notification is otherwise pretty poorly
> defined.

So if guest started reset, it might take a while for
device to come out of that state, and access during this
time might trigger errors. But that's already possible
for guest to trigger, right?  How is this different?


>  Do you think we can run completely asynchronous, letting the
> host and guest bus resets race?  Thanks,
> 
> Alex

I have a feeling we need to put some code out,
disabled by default, and see how it behaves in the field.
For example ability to trigger UR errors seems benign but
I think we are trying to prevent them now because of
something we saw in the field.

-- 
MST

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-25  8:45               ` Michael S. Tsirkin
@ 2016-05-25 14:22                 ` Alex Williamson
  0 siblings, 0 replies; 26+ messages in thread
From: Alex Williamson @ 2016-05-25 14:22 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Chen Fan, Cao jin, izumi.taku, qemu-devel

On Wed, 25 May 2016 11:45:11 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, May 24, 2016 at 08:54:06PM -0600, Alex Williamson wrote:
> > On Tue, 24 May 2016 13:49:12 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:  
> > > > I think that means that if we want to switch from a
> > > > simple halt-on-error to a mechanism for the guest to handle recovery,
> > > > we need to disable access to the device between being notified that the
> > > > error occurred and being notified to resume.    
> > > 
> > > But this isn't what happens on bare metal.
> > > Errors are reported asynchronously and host might access the device
> > > meanwhile.  These accesses might or might not trigger more errors, but
> > > fundamentally this should not matter too much as device is going to be
> > > reset.  
> > 
> > Bare metal also doesn't have a hypervisor underneath performing a PCI
> > bus reset,  
> 
> This is where I get lost. I assumed we do reset when guest
> requests it. Isn't that the case? Why not?

Unless we can somehow opt-out vfio-pci devices from AER handling on the
host, then the host is going to try to recover the device as part of
the core AER handling, which is not driver directed.  The host driver
can register various callbacks, which is where we get the
error_detected notification, but until we get the resume notification,
I think we assume the device is being operated on by the core.

At the same time we're using the error_detected to signal the AER event
to the guest, which begins a similar recovery process.  We don't
particularly want to rely on how the host has recovered the device and
the guest driver needs to be aware that the device state may need to be
recovered, so we let the recovery proceed in both the host and guest,
but it seems we need some synchronization point and guest accesses to
the device while we know the host is still in recovery may do more harm
than good.

> > there's only one OS trying to control the device at a time,
> > so we have some clear differences from bare metal that I don't know we
> > can avoid.  The thought here was that we need to notify the guest at the
> > earliest point we can, but let the host recovery run to completion
> > before allowing the user to interact with the device.  Perhaps there is
> > no need to block region access to the device (ie. config space & BAR
> > resources), but I think we do need to somehow synchronize the bus resets
> > or else we get situations like that observed previously where the bus is
> > still in reset while userspace tries to proceed with using it.
> >  
> 
> Why do we have to trigger reset upon an error?
> Why not wait for guest to request reset?

Is there a way to opt-out of host AER handling?  Do we want to create a
special case for vfio-pci to do this?  Does doing so allow the user to
exploit the host in any way, such as the user failing to recover the
device, continuing to signal error events on the host, and perhaps
leaving the device in a less trustworthy state when returned to the
host (not that we should ever be trusting the state of the device at
that point)?

> > The next question then would be whether that's QEMU's job or something
> > that should be done in the host kernel.  It's been proposed to add yet
> > another eventfd for the kernel vfio-pci to signal QEMU when a resume
> > notification has occurred, but perhaps the better approach would be for
> > the hot reset ioctl (and base reset ioctl) to handle this situation more
> > transparently.  We could immediately return -EAGAIN and allow QEMU to
> > delay itself for any reset ioctl received after the AER error detected
> > event, but before the resume event.  We could also allow some sort of
> > timeout, that the ioctl might enter an interruptible sleep, woken on
> > the resume notification or timeout.  That sounds a bit better to me as
> > the specification of what's allowed between the error detected
> > notification and the resume notification is otherwise pretty poorly
> > defined.  
> 
> So if guest started reset, it might take a while for
> device to come out of that state, and access during this
> time might trigger errors. But that's already possible
> for guest to trigger, right?  How is this different?

You can look back through the history of this series, there was an
arbitrary delay added after reset because the device was still in
reset (presumably host and guest reset racing).  Yes, we do not
currently block access to the device during a reset, the issue is
mostly that we don't expect device resets to be occurring in the host
except when directed by the guest.  In this case we expect a host
directed reset is occurring and the guest directed reset seems to be a
synchronization point.

> >  Do you think we can run completely asynchronous, letting the
> > host and guest bus resets race?  Thanks,
> > 
> > Alex  
> 
> I have a feeling we need to put some code out,
> disabled by default, and see how it behaves in the field.
> For example ability to trigger UR errors seems benign but
> I think we are trying to prevent them now because of
> something we saw in the field.

I don't really follow here, but I'm not in love with the idea of "let's
see how it behaves in the field" because I'm going to be stuck with the
support of that code whether it behaves appropriately or not.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2016-05-25 14:22 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-04-05 11:41 [Qemu-devel] [patch v6 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 03/12] vfio: add pcie extended capability support Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 04/12] vfio: add aer support for vfio device Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 05/12] vfio: refine function vfio_pci_host_match Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 06/12] vfio: add check host bus reset is support or not Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
2016-04-05 11:41 ` [Qemu-devel] [patch v6 08/12] vfio: add check aer functionality for hotplug device Cao jin
2016-04-05 11:42 ` [Qemu-devel] [patch v6 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
2016-04-05 11:42 ` [Qemu-devel] [patch v6 10/12] vfio-pci: pass the aer error to guest Cao jin
2016-04-05 11:42 ` [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume Cao jin
2016-04-11 21:38   ` Alex Williamson
2016-04-14  1:02     ` Chen Fan
2016-04-26  3:39       ` Chen Fan
2016-04-26 14:48         ` Alex Williamson
2016-05-06  1:38           ` Chen Fan
2016-05-06 16:39             ` Alex Williamson
2016-05-11  3:11               ` Zhou Jie
2016-05-11 20:20                 ` Alex Williamson
2016-05-24 10:49           ` Michael S. Tsirkin
2016-05-25  1:08             ` Zhou Jie
2016-05-25  2:54             ` Alex Williamson
2016-05-25  8:45               ` Michael S. Tsirkin
2016-05-25 14:22                 ` Alex Williamson
2016-04-05 11:42 ` [Qemu-devel] [patch v6 12/12] vfio: add 'aer' property to expose aercap Cao jin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).