From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from eggs.gnu.org ([2001:4830:134:3::10]:54407)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1Ynf77-0005Ad-0d
	for qemu-devel@nongnu.org; Wed, 29 Apr 2015 23:26:14 -0400
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1Ynf73-0000OL-Fw
	for qemu-devel@nongnu.org; Wed, 29 Apr 2015 23:26:12 -0400
Received: from mx1.redhat.com ([209.132.183.28]:38079)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <alex.williamson@redhat.com>) id 1Ynf73-0000NR-5g
	for qemu-devel@nongnu.org; Wed, 29 Apr 2015 23:26:09 -0400
Message-ID: <1430364066.4472.237.camel@redhat.com>
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 29 Apr 2015 21:21:06 -0600
In-Reply-To: <55419C6C.6020903@cn.fujitsu.com>
References: <cover.1430297161.git.chen.fan.fnst@cn.fujitsu.com>
	<b95cd227a99fdc382e0a9a64b1e9ded18c82ad3d.1430297161.git.chen.fan.fnst@cn.fujitsu.com>
	<1430328750.4472.215.camel@redhat.com>
	<55419C6C.6020903@cn.fujitsu.com>
Content-Type: text/plain; charset="UTF-8"
Mime-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: Re: [Qemu-devel] [PATCH RFC v6 11/11] vfio: add bus reset notifier
 for host bus reset
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Cc: izumi.taku@jp.fujitsu.com, qemu-devel@nongnu.org

On Thu, 2015-04-30 at 11:07 +0800, Chen Fan wrote:
> On 04/30/2015 01:32 AM, Alex Williamson wrote:
> > On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote:
> >> add host secondary bus reset for vfio when AER occurs, if reset failed,
> >> we should stop vm.
> >>
> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >> ---
> >>   hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
> >>   1 file changed, 138 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 060fb47..619daed 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice {
> >>       PCIHostDeviceAddress host;
> >>       EventNotifier err_notifier;
> >>       EventNotifier req_notifier;
> >> +
> >> +    Notifier sec_bus_reset_notifier;
> >>       uint32_t features;
> >>   #define VFIO_FEATURE_ENABLE_VGA_BIT 0
> >>   #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
> >> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
> >>       return pos;
> >>   }
> >>   
> >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> >> +                                PCIHostDeviceAddress *host2)
> >> +{
> >> +    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> >> +            host1->slot == host2->slot && host1->function == host2->function);
> >> +}
> >> +
> >>   static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
> >>   {
> >>       uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
> >> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
> >>       return 0;
> >>   }
> >>   
> >> +static int vfio_aer_validate_devices(DeviceState *dev,
> >> +                                     void *opaque)
> >> +{
> >> +    VFIOPCIDevice *vdev;
> >> +    int i;
> >> +    bool found = false;
> >> +    struct vfio_pci_hot_reset_info *info = opaque;
> >> +    struct vfio_pci_dependent_device *devices = &info->devices[0];
> >> +
> >> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev));
> >> +    for (i = 0; i < info->count; i++) {
> >> +        PCIHostDeviceAddress host;
> >> +
> >> +        host.domain = devices[i].segment;
> >> +        host.bus = devices[i].bus;
> >> +        host.slot = PCI_SLOT(devices[i].devfn);
> >> +        host.function = PCI_FUNC(devices[i].devfn);
> >> +
> >> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >> +            found = true;
> >> +            break;
> >> +        }
> >> +    }
> >> +
> >> +    if (!found) {
> >> +        error_report("vfio: Cannot reset parent bus with AER supported,"
> >> +                     "depends on device %s which is not contained.",
> >> +                      vdev->vbasedev.name);
> >> +        return -1;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev)
> >> +{
> >> +    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
> >> +                 "Please collect any data possible and then kill the guest",
> >> +                 __func__, vdev->host.domain, vdev->host.bus,
> >> +                 vdev->host.slot, vdev->host.function);
> >> +
> >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> >> +}
> >> +
> >> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque)
> >> +{
> >> +    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier);
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +    int ret, i;
> >> +    struct vfio_pci_hot_reset_info *info;
> >> +    struct vfio_pci_dependent_device *devices;
> >> +    VFIOGroup *group;
> >> +
> >> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> >> +        return;
> >> +    }
> >> +
> >> +    /*
> >> +     * Check the affected devices by virtual bus reset are contained in
> >> +     * the set of groups.
> >> +     */
> >> +    ret = vfio_get_hot_reset_info(vdev, &info);
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +    devices = &info->devices[0];
> >> +
> >> +    /* Verify that we have all the groups required */
> >> +    for (i = 0; i < info->count; i++) {
> >> +        PCIHostDeviceAddress host;
> >> +
> >> +        host.domain = devices[i].segment;
> >> +        host.bus = devices[i].bus;
> >> +        host.slot = PCI_SLOT(devices[i].devfn);
> >> +        host.function = PCI_FUNC(devices[i].devfn);
> >> +
> >> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> >> +            if (group->groupid == devices[i].group_id) {
> >> +                break;
> >> +            }
> >> +        }
> >> +
> >> +        if (!group) {
> >> +            if (!vdev->has_pm_reset) {
> > I'm not sure how has_pm_reset has anything to do with what we're testing
> > here.  Copy-paste error?
> >
> >> +                error_report("vfio: Cannot reset device %s with AER supported,"
> >> +                             "depends on group %d which is not owned.",
> >> +                             vdev->vbasedev.name, devices[i].group_id);
> >> +            }
> >> +            ret = -EPERM;
> >> +            goto stop_vm;
> >> +        }
> >> +    }
> >
> > The above verifies that all of the devices affected by the bus reset are
> > owned by the VM.
> >
> >> +
> >> +    /* Verify that we have all the affected devices under the bus */
> >> +    ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL,
> >> +                             vfio_aer_validate_devices,
> >> +                             NULL, info);
> > And here we make sure that any vfio-pci devices affected by the bus
> > reset are contained in the list of affected devices.  What we're still
> > missing is whether there are any affected devices that are exposed to
> > the VM that are not covered in this bus walk.  For instance, if I assign
> > a multi-function device and place function 0 and 1 under separate, peer
> > root ports, I believe the above tests will confirm that the VM owns the
> > necessary groups and that the only vfio-pci device affected by the bus
> > reset is in the list of affected devices, but it will not verify that
> > the other function is not affected by the guest bus reset, but is
> > affected by the host bus reset.
> you are right.
> 
> >
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +
> >> +    /* bus reset! */
> >> +    ret = vfio_pci_do_hot_reset(vdev, info);
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +    g_free(info);
> >> +    return;
> >> +
> >> +stop_vm:
> >> +    g_free(info);
> >> +    vfio_pci_vm_stop(vdev);
> >> +}
> >> +
> >>   static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>   {
> >>       PCIDevice *pdev = &vdev->pdev;
> >> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>                      pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4);
> >>       pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity);
> >>   
> >> +    vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset;
> >> +    pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier);
> >> +
> > Additionally, we're going to do a bus reset once for every vfio-pci
> > device subordinate to the bus being reset.  That's not very efficient.
> >
> > There are still a number of problems here.  We're allowing users to
> > configure their VMs however they wish and enable AER, then only when
> > they do a bus reset (which we have no way to infer is related to an AER
> > event) do we check and potentially halt the VM if guest mapping of the
> > host topology prevents a bus reset.  That seems sort of like making
> > backups of your data, but never testing that you can recover from the
> > backup until we need the data.  I can't imagine that many users are
> > going to be able to get this correct and they won't know it's incorrect
> > until an AER event occurs.
> >
> > Also, Do we need to worry about guest root complex devices?  The guest
> > won't be able to perform a bus reset on the guest bus in that case.  Is
> > AER even valid for RC devices since they don't have a parent downstream
> > port to signal the AER?  This seems like another case where it's more
> > likely that a user will create an invalid configuration than it is they
> > will create a valid one, but we won't know until an AER occurs with the
> > code structure here.
> do you refer to the root complex device is the root port of root complex?
> the root port can notify system itself, the PCI Express aer driver
> reset the upstream link between root port and upstream port of
> subordinate switch. so I think we don't need to care that.


No, I'm saying that we can place an assigned device directly on pcie.0,
the host bus, without using a root port.  I don't know if we have any
way to signal an AER event for root complex devices, but the guest
certainly doesn't have a way to perform a bus reset on the host bus.
Thanks,

Alex


> > Therefore I think that if the user specifies AER, we need to verify and
> > enforce at that point, ie. the initfn, that a host bus reset is
> > possible, that a guest reset is possible, and that a guest bus reset
> > fully contains the VM visible host devices affected by the reset.
> >
> > I'd also like to see the patches ordered such that we provide all the
> > infrastructure to validate and enforce the configuration restrictions to
> > support AER before we enable it to the guest.  The order here creates
> > bisect points where AER is exposed to the guest with unacceptable QEMU
> > handling.  Thanks,
> I'm sorry for that, due to this is RFC version I missed that,
> and  I will arrange the patches after this series.
> 
> Thanks,
> Chen
> 
> 
> >
> > Alex
> >
> >>       return 0;
> >>   }
> >>   
> >> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
> >>       vfio_enable_intx(vdev);
> >>   }
> >>   
> >> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> >> -                                PCIHostDeviceAddress *host2)
> >> -{
> >> -    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> >> -            host1->slot == host2->slot && host1->function == host2->function);
> >> -}
> >> -
> >>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
> >>   {
> >>       VFIOGroup *group;
> >> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque)
> >>        * terminate the guest to contain the error.
> >>        */
> >>   
> >> -    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
> >> -                 "Please collect any data possible and then kill the guest",
> >> -                 __func__, vdev->host.domain, vdev->host.bus,
> >> -                 vdev->host.slot, vdev->host.function);
> >> -
> >> -    vm_stop(RUN_STATE_INTERNAL_ERROR);
> >> +    vfio_pci_vm_stop(vdev);
> >>   }
> >>   
> >>   /*
> >
> >
> > .
> >
>