From: Alex Williamson <alex.williamson@redhat.com>
To: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Cc: izumi.taku@jp.fujitsu.com, qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [RFC v8.1 06/13] vfio: add check host bus reset is support or not
Date: Thu, 04 Jun 2015 09:59:20 -0600 [thread overview]
Message-ID: <1433433560.3510.175.camel@redhat.com> (raw)
In-Reply-To: <556E4FE3.4000607@cn.fujitsu.com>
On Wed, 2015-06-03 at 08:52 +0800, Chen Fan wrote:
> On 06/03/2015 12:47 AM, Alex Williamson wrote:
> > On Tue, 2015-06-02 at 15:54 +0800, Chen Fan wrote:
> >> On 05/28/2015 05:32 AM, Alex Williamson wrote:
> >>> On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
> >>>> we introduce a has_bus_reset capability to sign the vfio
> >>>> devices if support host bus reset.
> >>>>
> >>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>> ---
> >>>> hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>> 1 file changed, 123 insertions(+)
> >>>>
> >>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>> index f4e7855..5934fd7 100644
> >>>> --- a/hw/vfio/pci.c
> >>>> +++ b/hw/vfio/pci.c
> >>>> @@ -33,6 +33,7 @@
> >>>> #include "hw/pci/msix.h"
> >>>> #include "hw/pci/pci.h"
> >>>> #include "hw/pci/pci_bridge.h"
> >>>> +#include "hw/pci/pci_bus.h"
> >>>> #include "qemu-common.h"
> >>>> #include "qemu/error-report.h"
> >>>> #include "qemu/event_notifier.h"
> >>>> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
> >>>> bool req_enabled;
> >>>> bool has_flr;
> >>>> bool has_pm_reset;
> >>>> + bool has_bus_reset;
> >>> I still think that caching this is a bad idea, there's no point at which
> >>> we can blindly assume the capability is still present.
> >>>
> >>>> bool rom_read_failed;
> >>>> } VFIOPCIDevice;
> >>>>
> >>>> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> >>>> static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> >>>> uint32_t val, int len);
> >>>> static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> >>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
> >>>>
> >>>> /*
> >>>> * Disabling BAR mmaping can be slow, but toggling it around INTx can
> >>>> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>>> dev_iter = pci_bridge_get_device(dev_iter->bus);
> >>>> }
> >>>>
> >>>> + /*
> >>>> + * Don't check bus reset capability when device is enabled during
> >>>> + * qemu machine creation, which is done by machine init function.
> >>>> + */
> >>>> + if (DEVICE(vdev)->hotplugged) {
> >>>> + vfio_check_host_bus_reset(vdev);
> >>>> + if (!vdev->has_bus_reset) {
> >>>> + error_report("vfio: Cannot enable AER for device %s, "
> >>>> + "which is not support host bus reset.",
> >>> "which does not support host bus reset."
> >>>
> >>>> + vdev->vbasedev.name);
> >>>> + goto error;
> >>>> + }
> >>>> + }
> >>>> +
> >>>> errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
> >>>> /*
> >>>> * The ability to record multiple headers is depending on
> >>>> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
> >>>> }
> >>>> }
> >>>>
> >>>> +struct VfioDeviceFind {
> >>> We use VFIOFooBar for all other camel case definitions, much like PCIBus
> >>> and PCIDevice below.
> >>>
> >>>> + PCIBus *pbus;
> >>>> + PCIDevice *pdev;
> >>>> + bool found;
> >>>> +};
> >>>> +
> >>>> +static void find_devices(PCIBus *bus, void *opaque)
> >>>> +{
> >>>> + struct VfioDeviceFind *find = opaque;
> >>>> + int i;
> >>>> +
> >>>> + if (find->found == true) {
> >>> if (find->found) {...
> >>>
> >>>> + return;
> >>>> + }
> >>>> +
> >>>> + for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
> >>>> + if (!bus->devices[i]) {
> >>>> + continue;
> >>>> + }
> >>>> +
> >>>> + if (bus->devices[i] == find->pdev) {
> >>>> + find->pbus = bus;
> >>>> + find->found = true;
> >>>> + break;
> >>>> + }
> >>>> + }
> >>>> +}
> >>>> +
> >>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> >>>> +{
> >>>> + PCIBus *bus = vdev->pdev.bus;
> >>>> + struct vfio_pci_hot_reset_info *info = NULL;
> >>>> + struct vfio_pci_dependent_device *devices;
> >>>> + VFIOGroup *group;
> >>>> + int ret, i;
> >>>> + bool has_bus_reset = false;
> >>>> +
> >>>> + ret = vfio_get_hot_reset_info(vdev, &info);
> >>>> + if (ret < 0) {
> >>> if (ret) {...
> >>>
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + /* List all affected devices by bus reset */
> >>>> + devices = &info->devices[0];
> >>>> +
> >>>> + /* Verify that we have all the groups required */
> >>>> + for (i = 0; i < info->count; i++) {
> >>>> + PCIHostDeviceAddress host;
> >>>> + VFIOPCIDevice *tmp;
> >>>> + VFIODevice *vbasedev_iter;
> >>>> +
> >>>> + host.domain = devices[i].segment;
> >>>> + host.bus = devices[i].bus;
> >>>> + host.slot = PCI_SLOT(devices[i].devfn);
> >>>> + host.function = PCI_FUNC(devices[i].devfn);
> >>>> +
> >>>> + /* Skip the current device */
> >>>> + if (vfio_pci_host_match(&host, &vdev->host)) {
> >>>> + continue;
> >>>> + }
> >>>> +
> >>>> + /* Ensure we own the group of the affected device */
> >>>> + QLIST_FOREACH(group, &vfio_group_list, next) {
> >>>> + if (group->groupid == devices[i].group_id) {
> >>>> + break;
> >>>> + }
> >>>> + }
> >>>> +
> >>>> + if (!group) {
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + /* Ensure affected devices for reset under the same bus */
> >>>> + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> >>>> + if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> >>>> + continue;
> >>>> + }
> >>>> + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> >>>> + if (vfio_pci_host_match(&host, &tmp->host)) {
> >>>> + struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
> >>>> +
> >>>> + pci_for_each_bus(bus, find_devices, &find);
> >>>> + if (!find.found) {
> >>>> + goto out;
> >>>> + }
> >>>> + /*
> >>>> + * When the check device is hotplugged to a higher bus again,
> >>>> + * which would influence the affected device which enable aer
> >>>> + * below the bus.
> >>>> + */
> >>>> + if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
> >>>> + find.pbus != bus) {
> >>>> + goto out;
> >>>> + }
> >>> I think what you're trying to do here is assume that if a reset of A
> >>> affects B, then a reset of B affects A, and if B is on a subordinate bus
> >>> from A, then our configuration is broken, right? Can we really
> >>> guarantee that assumption? If we had a physical topology that mirrored
> >>> this virtual topology, that wouldn't necessarily be true. For instance,
> >>> if A was a function of a multi-function device where another function
> >>> was a PCIe upstream switch, B could be subordinate to that switch, so a
> >>> reset of A affects B, but a reset of B doesn't affect A.
> >>>
> >>>> + break;
> >>>> + }
> >>>> + }
> >>>> + }
> >>>> +
> >>>> + has_bus_reset = true;
> >>>> +
> >>>> +out:
> >>>> + vdev->has_bus_reset = has_bus_reset;
> >>> I don't see any value is storing this, it can't be trusted at any point
> >>> in the future. I think that any time we add a device or think about
> >>> forwarding an AER message to the guest, we need to do a validation pass,
> >>> testing the entire topology.
> >>>
> >>> To do that we'd iterate through every device in every group for PCI
> >>> devices that support AER. For each one, get the hot reset info for the
> >>> affected device list. For each affected device, if it's attached to the
> >>> VM, it needs to be on or below the bus of the target device.
> >>> Additionally, we should test all of the non-AER supporting vfio-pci
> >>> devices on or below the target bus to verify they have a reset
> >>> mechanism. Run that function for each vfio-pci device as it's added,
> >>> regardless of whether it's hotplug. If the test fails, fail the initfn
> >>> for the current device. Also run the test prior to AER injection, if it
> >>> fails demote the AER injection to a machine halt as we have now.
> >> I'm worry about is the case that the affected devices belonged to
> >> another groups but when initialize this device the another group
> >> has not been added. it would cause fail the initfn, but the group
> >> maybe added later. so I used the machine done event notifier to
> >> check this case. if we don't do like that. how can we check the
> >> case when all vfio-pci devices initfn ?
> > That's why the initfn test needs to test the entire topology, not just
> > the device being added. If we have the case you describe where we have
> > two devices in separate groups, we add the first device, test the
> > topology, see that the second device is affected but not yet added and
> > allow AER to be enabled. When the second device is added, we again test
> > the topology, we see the potential conflict and fail the initfn for the
> > second device if the bus requirements are not met.
> In case that the second device may not be added. in this case the
> first device enable the aer, and pass the validate. how do we know
> the second device if be added ?
Yeah, I see what you're getting at here. If we have a dual port NIC
with isolation between functions such that each is a separate IOMMU
group, when we add the first device with AER enabled it will fail
because a bus reset affects both groups and we don't yet own the second
group.
My proposal wouldn't provide a way to ever enable AER for these devices.
However, the proposal of using a machine-init-done hook only allows
cold-plug of such devices with AER, hotplug would get the same issue. I
don't think that sort of asymmetry is supportable either.
We almost need some sort of vfio-group stub driver in qemu that we can
claim ownership of a group without adding any of the devices in the
group to the VM. Another option might be to make AER a "soft"
requirement, demote it to a VM stop unless the topology allows it. That
also creates a confusing user scenario that probably looks nearly random
from an outside perspective. The only other idea I can see would be to
allow some manipulation of iommu groups at the kernel level, perhaps
creating a meta-group composed of one or more iommu groups. Or maybe a
kernel option that could ignore isolation for specified devices to
broaden the group. Are we really sure exposing AER to guests is a good
idea? Requiring guest bus resets to map to host bus rests is really a
mess. Thanks,
Alex
next prev parent reply other threads:[~2015-06-04 15:59 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-05-27 2:46 [Qemu-devel] [RFC v8.1 00/13] vfio-pci: pass the aer error to guest Chen Fan
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 01/13] vfio: extract vfio_get_hot_reset_info as a single function Chen Fan
2015-05-27 21:31 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 03/13] vfio: add pcie extanded capability support Chen Fan
2015-05-27 21:31 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 04/13] aer: impove pcie_aer_init to support vfio device Chen Fan
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 05/13] vfio: add aer support for " Chen Fan
2015-05-27 21:32 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 06/13] vfio: add check host bus reset is support or not Chen Fan
2015-05-27 21:32 ` Alex Williamson
2015-06-02 7:54 ` Chen Fan
2015-06-02 16:47 ` Alex Williamson
2015-06-03 0:52 ` Chen Fan
2015-06-04 15:59 ` Alex Williamson [this message]
2015-06-09 3:43 ` Chen Fan
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 07/13] vfio: add check for vfio devices which enable aer should support bus reset Chen Fan
2015-05-27 21:32 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 08/13] pci: add bus reset_notifiers callbacks for host " Chen Fan
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 09/13] vfio: add sec_bus_reset notifier to notify physical bus reset is needed Chen Fan
2015-05-27 21:32 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 10/13] vfio: do hot bus reset when do virtual secondary bus reset Chen Fan
2015-05-27 21:33 ` Alex Williamson
[not found] ` <557020F1.7070705@cn.fujitsu.com>
2015-06-04 16:06 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 11/13] pcie_aer: expose pcie_aer_msg() interface Chen Fan
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 12/13] vfio-pci: pass the aer error to guest Chen Fan
2015-05-27 21:33 ` Alex Williamson
2015-05-27 2:46 ` [Qemu-devel] [RFC v8.1 13/13] vfio: add 'aer' property to expose aercap Chen Fan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1433433560.3510.175.camel@redhat.com \
--to=alex.williamson@redhat.com \
--cc=chen.fan.fnst@cn.fujitsu.com \
--cc=izumi.taku@jp.fujitsu.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).