* [PATCH 0/3] VFIO change for EEH support @ 2013-03-15 7:26 Gavin Shan 2013-03-15 7:26 ` [PATCH 1/3] VFIO: Architecture dependent VFIO device operations Gavin Shan ` (2 more replies) 0 siblings, 3 replies; 20+ messages in thread
From: Gavin Shan @ 2013-03-15 7:26 UTC (permalink / raw)
To: kvm, linuxppc-dev; +Cc: aik, alex.williamson, Gavin Shan

EEH (Enhanced Error Handling) is one of the RAS features on IBM Power machines. In order to support EEH, VFIO needs the modifications this patchset introduces.

Firstly, the address (domain:bus:slot:function) of a passed-through PCI device looks quite different from the host and guest perspectives, so we have to maintain the address mapping in the host so that EEH can direct EEH errors from the guest to the proper PCI device. Unfortunately, the current VFIO implementation doesn't include such a mechanism yet. On the other hand, this is entirely the business of individual platforms, so I introduced some weak functions in the VFIO driver that individual platforms can override to figure out whatever extra information they need.

Apart from that, the last patch [3/3] changes the current behavior of accessing config space that is not covered by any capability on specific PCI devices.

The patchset is expected to be applied after Alexey's patchset (supporting VFIO on the PowerNV platform). Besides, there are patches based on it queued in my personal tree for the EEH core to support PowerKVM guests. With all of them (Alexey's patchset, this patchset, the EEH core patchset), I can successfully pass a PCI device to a guest and recover it from EEH errors.

 drivers/vfio/pci/vfio_pci.c        | 42 ++++++++++++++++++++++++++++++------
 drivers/vfio/pci/vfio_pci_config.c | 31 +++++++++++++++++---------
 include/linux/vfio.h               |  7 +++++-
 include/uapi/linux/vfio.h          | 16 +++++++++++++
 4 files changed, 77 insertions(+), 19 deletions(-)

Thanks,
Gavin

^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 1/3] VFIO: Architecture dependent VFIO device operations 2013-03-15 7:26 [PATCH 0/3] VFIO change for EEH support Gavin Shan @ 2013-03-15 7:26 ` Gavin Shan 2013-03-15 7:26 ` [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command Gavin Shan 2013-03-15 7:26 ` [PATCH 3/3] VFIO: Direct access config reg without capability Gavin Shan 2 siblings, 0 replies; 20+ messages in thread From: Gavin Shan @ 2013-03-15 7:26 UTC (permalink / raw) To: kvm, linuxppc-dev; +Cc: aik, alex.williamson, Gavin Shan Some architectures like PPC, especially the PowerNV platform, need to do additional operations while adding VFIO devices to, or removing them from, the VFIO bus. The patch adds weak functions that are called when a specific VFIO device is opened, released, or receives an ioctl. Those functions can be overridden by individual architectures if necessary. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> --- drivers/vfio/pci/vfio_pci.c | 42 +++++++++++++++++++++++++++++++++++------- include/linux/vfio.h | 7 ++++++- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8189cb6..1a53e77 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -143,32 +143,51 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) pci_restore_state(pdev); } +void __weak vfio_pci_arch_release(struct pci_dev *pdev) +{ + return; +} + static void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data; - if (atomic_dec_and_test(&vdev->refcnt)) + if (atomic_dec_and_test(&vdev->refcnt)) { + vfio_pci_arch_release(vdev->pdev); + vfio_pci_disable(vdev); + } module_put(THIS_MODULE); } +int __weak vfio_pci_arch_open(struct pci_dev *pdev) +{ + return 0; +} + static int vfio_pci_open(void *device_data) { struct vfio_pci_device *vdev = device_data; + int ret; if (!try_module_get(THIS_MODULE)) return -ENODEV; if (atomic_inc_return(&vdev->refcnt) == 1) { - int ret = vfio_pci_enable(vdev); - if (ret) { - module_put(THIS_MODULE); - return ret; - } + ret = vfio_pci_arch_open(vdev->pdev); + if (ret) + goto fail; + + ret = vfio_pci_enable(vdev); + if (ret) + goto fail; } return 0; +fail: + module_put(THIS_MODULE); + return ret; } static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) @@ -206,6 +225,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return 0; } +long __weak vfio_pci_arch_ioctl(struct pci_dev *pdev, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} + static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -374,9 +399,12 @@ static long vfio_pci_ioctl(void *device_data, return ret; - } else if (cmd == VFIO_DEVICE_RESET) + } else if (cmd == VFIO_DEVICE_RESET) { return vdev->reset_works ? 
pci_reset_function(vdev->pdev) : -EINVAL; + } else { + return vfio_pci_arch_ioctl(vdev->pdev, cmd, arg); + } return -ENOTTY; } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ab9e862..a991c39 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -11,9 +11,9 @@ #ifndef VFIO_H #define VFIO_H - #include <linux/iommu.h> #include <linux/mm.h> +#include <linux/pci.h> #include <uapi/linux/vfio.h> /** @@ -40,6 +40,11 @@ struct vfio_device_ops { int (*mmap)(void *device_data, struct vm_area_struct *vma); }; +extern int vfio_pci_arch_open(struct pci_dev *pdev); +extern long vfio_pci_arch_ioctl(struct pci_dev *pdev, + unsigned int cmd, + unsigned long arg); +extern void vfio_pci_arch_release(struct pci_dev *pdev); extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); -- 1.7.5.4 ^ permalink raw reply related [flat|nested] 20+ messages in thread
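For orientation, a platform override of the weak hooks above might look roughly like the sketch below. The eeh_dev lookup helper follows the powerpc pci_dev_to_eeh_dev() pattern, but the "passed" flag and the overall flow are illustrative assumptions, not part of this patchset:

	/* Hypothetical PowerNV-side override (sketch only). */
	int vfio_pci_arch_open(struct pci_dev *pdev)
	{
		struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);

		if (!edev)
			return -ENODEV;

		/*
		 * Mark the device as passed through so the host EEH core
		 * defers recovery to the guest (illustrative field).
		 */
		edev->passed = true;
		return 0;
	}

	void vfio_pci_arch_release(struct pci_dev *pdev)
	{
		struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);

		if (edev)
			edev->passed = false;	/* hand recovery back to the host */
	}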
* [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-15 7:26 [PATCH 0/3] VFIO change for EEH support Gavin Shan 2013-03-15 7:26 ` [PATCH 1/3] VFIO: Architecture dependent VFIO device operations Gavin Shan @ 2013-03-15 7:26 ` Gavin Shan 2013-03-15 19:29 ` Alex Williamson 2013-03-15 7:26 ` [PATCH 3/3] VFIO: Direct access config reg without capability Gavin Shan 2 siblings, 1 reply; 20+ messages in thread From: Gavin Shan @ 2013-03-15 7:26 UTC (permalink / raw) To: kvm, linuxppc-dev; +Cc: aik, alex.williamson, Gavin Shan The address (domain/bus/slot/function) of the passed PCI device looks quite different from the perspective of the host and the guest. Some architectures like PPC need to set up the mapping in the host. The patch introduces an additional VFIO device IOCTL command to address that. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> --- include/uapi/linux/vfio.h | 16 ++++++++++++++++ 1 files changed, 16 insertions(+), 0 deletions(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6e58d9b..ecc4f38 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -289,6 +289,22 @@ struct vfio_irq_set { */ #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) +/** + * VFIO_DEVICE_SET_ADDR_MAPPING - _IO(VFIO_TYPE, VFIO_BASE + 12) + * + * The address, which comprised of domain/bus/slot/function looks + * different between host and guest. We need to setup the mapping + * in host for some architectures like PPC so that the passed PCI + * devices could support RTAS smoothly. + */ +struct vfio_addr_mapping { + __u64 buid; + __u8 bus; + __u8 slot; + __u8 func; +}; +#define VFIO_DEVICE_SET_ADDR_MAPPING _IO(VFIO_TYPE, VFIO_BASE + 12) + /* * The VFIO-PCI bus driver makes use of the following fixed region and * IRQ index mapping. Unimplemented regions return a size of zero. -- 1.7.5.4 ^ permalink raw reply related [flat|nested] 20+ messages in thread
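For context, a userspace consumer such as QEMU would presumably issue the new ioctl once on the VFIO device file descriptor, right after opening the device. A minimal sketch, assuming the uapi addition above and following the usual VFIO convention of passing the struct by pointer; the guest PHB BUID and device address values here are made up:

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int set_guest_addr_mapping(int device_fd)
	{
		struct vfio_addr_mapping map = {
			.buid = 0x800000020000001ULL,	/* guest PHB "Bus Unit ID" (example value) */
			.bus  = 0,			/* guest-visible bus */
			.slot = 1,			/* guest-visible slot */
			.func = 0,			/* guest-visible function */
		};

		/* Tell the host how the guest will address this device in RTAS calls. */
		return ioctl(device_fd, VFIO_DEVICE_SET_ADDR_MAPPING, &map);
	}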
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-15 7:26 ` [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command Gavin Shan @ 2013-03-15 19:29 ` Alex Williamson 2013-03-16 1:34 ` Gavin Shan 0 siblings, 1 reply; 20+ messages in thread From: Alex Williamson @ 2013-03-15 19:29 UTC (permalink / raw) To: Gavin Shan; +Cc: aik, linuxppc-dev, kvm On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote: > The address (domain/bus/slot/function) of the passed PCI device > looks quite different from perspective of host and guest. Some > architectures like PPC need to setup the mapping in host. The patch > introduces additional VFIO device IOCTL command to address that. Could you explain further how this will be used? How the device is exposed to a guest is entirely a userspace construct, so why does vfio need to know or care about this? I had assumed for AER that QEMU would do the translation from host to guest address space. > Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> > --- > include/uapi/linux/vfio.h | 16 ++++++++++++++++ > 1 files changed, 16 insertions(+), 0 deletions(-) > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 6e58d9b..ecc4f38 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -289,6 +289,22 @@ struct vfio_irq_set { > */ > #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) > > +/** > + * VFIO_DEVICE_SET_ADDR_MAPPING - _IO(VFIO_TYPE, VFIO_BASE + 12) > + * > + * The address, which comprised of domain/bus/slot/function looks > + * different between host and guest. We need to setup the mapping > + * in host for some architectures like PPC so that the passed PCI > + * devices could support RTAS smoothly. > + */ > +struct vfio_addr_mapping { > + __u64 buid; What's a buid? Thanks, Alex > + __u8 bus; > + __u8 slot; > + __u8 func; > +}; > +#define VFIO_DEVICE_SET_ADDR_MAPPING _IO(VFIO_TYPE, VFIO_BASE + 12) > + > /* > * The VFIO-PCI bus driver makes use of the following fixed region and > * IRQ index mapping. Unimplemented regions return a size of zero. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-15 19:29 ` Alex Williamson @ 2013-03-16 1:34 ` Gavin Shan 2013-03-16 5:37 ` Benjamin Herrenschmidt 0 siblings, 1 reply; 20+ messages in thread From: Gavin Shan @ 2013-03-16 1:34 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Fri, Mar 15, 2013 at 01:29:00PM -0600, Alex Williamson wrote: >On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote: >> The address (domain/bus/slot/function) of the passed PCI device >> looks quite different from perspective of host and guest. Some >> architectures like PPC need to setup the mapping in host. The patch >> introduces additional VFIO device IOCTL command to address that. > >Could you explain further how this will be used? How the device is >exposed to a guest is entirely a userspace construct, so why does vfio >need to know or care about this? I had assumed for AER that QEMU would >do the translation from host to guest address space. > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by the previous patch. The PowerNV platform is going to override it to figure out the information for the EEH core to use. On the other hand, QEMU will run into the IOCTL command while opening (creating) a VFIO device. I'm not very familiar with AER, though. AER is quite different from EEH: the EEH functionality is implemented in the PHB instead of in the PCI device core, so EEH doesn't deal with AER stuff directly :-) >> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> >> --- >> include/uapi/linux/vfio.h | 16 ++++++++++++++++ >> 1 files changed, 16 insertions(+), 0 deletions(-) >> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h >> index 6e58d9b..ecc4f38 100644 >> --- a/include/uapi/linux/vfio.h >> +++ b/include/uapi/linux/vfio.h >> @@ -289,6 +289,22 @@ struct vfio_irq_set { >> */ >> #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) >> >> +/** >> + * VFIO_DEVICE_SET_ADDR_MAPPING - _IO(VFIO_TYPE, VFIO_BASE + 12) >> + * >> + * The address, which comprised of domain/bus/slot/function looks >> + * different between host and guest. We need to setup the mapping >> + * in host for some architectures like PPC so that the passed PCI >> + * devices could support RTAS smoothly. >> + */ >> +struct vfio_addr_mapping { >> + __u64 buid; > >What's a buid? Thanks, > BUID means "Bus Unit Identifier"; it is the identifier for a specific PHB. Firmware figures it out and exposes it to the OS through the device tree. For the VFIO case, QEMU figures it out and eventually exposes it to the guest. It's notable that the PHB (including the buid) figured out by QEMU is virtual, something like a container :-) >> + __u8 bus; >> + __u8 slot; >> + __u8 func; >> +}; >> +#define VFIO_DEVICE_SET_ADDR_MAPPING _IO(VFIO_TYPE, VFIO_BASE + 12) >> + >> /* >> * The VFIO-PCI bus driver makes use of the following fixed region and >> * IRQ index mapping. Unimplemented regions return a size of zero. > Thanks, Gavin ^ permalink raw reply [flat|nested] 20+ messages in thread
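To see why the host needs the guest's buid/bus/slot/func, it helps to look at how the guest-side RTAS calls address a device. The sketch below roughly follows the pSeries rtas_read_config() pattern (the BUID is split into high and low words, and the config address encodes bus/devfn/register); treat the exact argument layout as illustrative and note that extended config space handling is omitted:

	/* config_addr encodes guest bus/slot/func plus the register offset */
	u32 config_addr = (bus << 16) | (slot << 11) | (func << 8) | (reg & 0xff);

	ret = rtas_call(ibm_read_pci_config, 4, 2, &val,
			config_addr,
			BUID_HI(phb_buid),	/* upper 32 bits of the guest PHB "Bus Unit ID" */
			BUID_LO(phb_buid),	/* lower 32 bits */
			size);

The host-side EEH code only knows the physical domain/bus/devfn, so something has to translate the (buid, config_addr) tuple the guest uses back to the physical device; that translation table is what the proposed ioctl populates.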
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-16 1:34 ` Gavin Shan @ 2013-03-16 5:37 ` Benjamin Herrenschmidt 2013-03-18 21:01 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Benjamin Herrenschmidt @ 2013-03-16 5:37 UTC (permalink / raw) To: Gavin Shan; +Cc: aik, Alex Williamson, linuxppc-dev, kvm On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote: > >Could you explain further how this will be used? How the device is > >exposed to a guest is entirely a userspace construct, so why does vfio > >need to know or care about this? I had assumed for AER that QEMU would > >do the translation from host to guest address space. > > > > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous > patch. The PowerNV platform is going to override it to figure out the > information for EEH core to use. On the other hand, QEMU will runs into > the IOCTL command while opening (creating) one VFIO device. > > Though I'm not familiar with AER very much. AER is quite different from > EEH. The EEH functionality implemented in PHB instead of in PCI device > core. So we don't care AER stuff in EEH directly :-) To give Alex a bit more background... EEH is our IBM specific error handling facility which is a superset of AER. IE. In addition to AER's error detection and logging, it adds a layer of error detection at the host bridge level (such as iommu violations etc...) and a mechanism for handling and recovering from errors. This is tied to our iommu domain stuff (our PE's) and our device "freezing" capability among others. With VFIO + KVM, we want to implement most of the EEH support for guests in the host kernel. The reason is multipart and we can discuss this separately as some of it might well be debatable (mostly it's more convenient that way because we hook into the underlying HW/FW EEH which isn't directly userspace accessible so we don't have to add a new layer of kernel -> user API in addition to the VFIO stuff), but there's at least one aspect of it that drives this requirement more strongly which is performance: When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do a firmware call to query the EEH state of the device and check whether it has been frozen. On some devices, that can be a performance issue, and going all the way to qemu for that would be horribly expensive. So we want at least a way to handle that call in the kernel and for that we need at least some way of mapping things there. Cheers, Ben. ^ permalink raw reply [flat|nested] 20+ messages in thread
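To make the performance point concrete: on powerpc, an all-ones MMIO read is treated as a possible symptom of a frozen PE, and each suspect read can turn into a firmware query of the PE state. A rough sketch of that fast-path check, modelled on the host-side eeh_dev_check_failure() pattern (the recovery hook here is illustrative, and the guest-side names may differ):

	val = readl(mmio_base + STATUS_REG);
	if (val == ~0U) {
		/*
		 * All-ones may simply be a valid register value, so ask the
		 * firmware for the device's EEH/PE state to find out whether
		 * the PE has actually been frozen.
		 */
		if (eeh_dev_check_failure(edev))
			return handle_frozen_pe(edev);	/* illustrative recovery hook */
	}

If that state query had to bounce out to QEMU for every suspicious read, MMIO-heavy workloads would suffer, which is why an in-kernel path is wanted.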
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-16 5:37 ` Benjamin Herrenschmidt @ 2013-03-18 21:01 ` Alex Williamson 2013-03-19 3:24 ` Gavin Shan 0 siblings, 1 reply; 20+ messages in thread From: Alex Williamson @ 2013-03-18 21:01 UTC (permalink / raw) To: Benjamin Herrenschmidt; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Sat, 2013-03-16 at 06:37 +0100, Benjamin Herrenschmidt wrote: > On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote: > > >Could you explain further how this will be used? How the device is > > >exposed to a guest is entirely a userspace construct, so why does vfio > > >need to know or care about this? I had assumed for AER that QEMU would > > >do the translation from host to guest address space. > > > > > > > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous > > patch. The PowerNV platform is going to override it to figure out the > > information for EEH core to use. On the other hand, QEMU will runs into > > the IOCTL command while opening (creating) one VFIO device. > > > > Though I'm not familiar with AER very much. AER is quite different from > > EEH. The EEH functionality implemented in PHB instead of in PCI device > > core. So we don't care AER stuff in EEH directly :-) > > To give Alex a bit more background... > > EEH is our IBM specific error handling facility which is a superset of AER. > > IE. In addition to AER's error detection and logging, it adds a layer of > error detection at the host bridge level (such as iommu violations etc...) > and a mechanism for handling and recovering from errors. This is tied to > our iommu domain stuff (our PE's) and our device "freezing" capability > among others. > > With VFIO + KVM, we want to implement most of the EEH support for guests in > the host kernel. The reason is multipart and we can discuss this separately > as some of it might well be debatable (mostly it's more convenient that way > because we hook into the underlying HW/FW EEH which isn't directly userspace > accessible so we don't have to add a new layer of kernel -> user API in > addition to the VFIO stuff), but there's at least one aspect of it that drives > this requirement more strongly which is performance: > > When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do > a firmware call to query the EEH state of the device and check whether it > has been frozen. On some devices, that can be a performance issue, and > going all the way to qemu for that would be horribly expensive. > > So we want at least a way to handle that call in the kernel and for that we > need at least some way of mapping things there. There's no notification mechanism when a PHB is frozen? I suppose notification would be asynchronous so you risk data for every read that happens in the interim. So the choices are a) tell the host kernel the mapping, b) tell the guest kernel the mapping, c) identity mapping, or d) qemu intercept? Presumably your firmware call to query the EEH is not going through VFIO, so is VFIO the appropriate place to setup this mapping? As you say, this seems like just a convenient place to put it even though it really has nothing to do with the VFIO kernel component. QEMU has this information and could register it with the host kernel through other means if available. Maybe the mapping should be registered with KVM if that's how the EEH data is accessed. I'm not yet sold on why this mapping is registered here. Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-18 21:01 ` Alex Williamson @ 2013-03-19 3:24 ` Gavin Shan 2013-03-19 4:18 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Gavin Shan @ 2013-03-19 3:24 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Mon, Mar 18, 2013 at 03:01:14PM -0600, Alex Williamson wrote: >On Sat, 2013-03-16 at 06:37 +0100, Benjamin Herrenschmidt wrote: >> On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote: >> > >Could you explain further how this will be used? How the device is >> > >exposed to a guest is entirely a userspace construct, so why does vfio >> > >need to know or care about this? I had assumed for AER that QEMU would >> > >do the translation from host to guest address space. >> > > >> > >> > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous >> > patch. The PowerNV platform is going to override it to figure out the >> > information for EEH core to use. On the other hand, QEMU will runs into >> > the IOCTL command while opening (creating) one VFIO device. >> > >> > Though I'm not familiar with AER very much. AER is quite different from >> > EEH. The EEH functionality implemented in PHB instead of in PCI device >> > core. So we don't care AER stuff in EEH directly :-) >> >> To give Alex a bit more background... >> >> EEH is our IBM specific error handling facility which is a superset of AER. >> >> IE. In addition to AER's error detection and logging, it adds a layer of >> error detection at the host bridge level (such as iommu violations etc...) >> and a mechanism for handling and recovering from errors. This is tied to >> our iommu domain stuff (our PE's) and our device "freezing" capability >> among others. >> >> With VFIO + KVM, we want to implement most of the EEH support for guests in >> the host kernel. The reason is multipart and we can discuss this separately >> as some of it might well be debatable (mostly it's more convenient that way >> because we hook into the underlying HW/FW EEH which isn't directly userspace >> accessible so we don't have to add a new layer of kernel -> user API in >> addition to the VFIO stuff), but there's at least one aspect of it that drives >> this requirement more strongly which is performance: >> >> When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do >> a firmware call to query the EEH state of the device and check whether it >> has been frozen. On some devices, that can be a performance issue, and >> going all the way to qemu for that would be horribly expensive. >> >> So we want at least a way to handle that call in the kernel and for that we >> need at least some way of mapping things there. > >There's no notification mechanism when a PHB is frozen? I suppose >notification would be asynchronous so you risk data for every read that >happens in the interim. So the choices are a) tell the host kernel the >mapping, b) tell the guest kernel the mapping, c) identity mapping, or >d) qemu intercept? > We do have dedicated interrupts on detecting frozen PHB on host side. However, the guest has to poll/check the frozen state (frozen PE) during access to config or MMIO space. For the recommended methods, (a) is what we want to do with the patchset. (b) seems infeasible since the guest shouldn't be aware of hypervisor (e.g. KVM or PowerVM) it's running on top of, it's hard to polish the guest to do it. (d) sounds applicable since the QEMU should know the address (BDF) of host and guest devices. 
However, we still need to let the host EEH core know which PCI device has been passed to the guest, and the best place to do that would be when opening the corresponding VFIO PCI device. In turn, it will still need a weak function for the ppc platform to override. Why not directly take (a) and finish everything in one VFIO IOCTL command? Sorry, Alex. I didn't understand (c) well :-) >Presumably your firmware call to query the EEH is not going through >VFIO, so is VFIO the appropriate place to setup this mapping? As you >say, this seems like just a convenient place to put it even though it >really has nothing to do with the VFIO kernel component. QEMU has this >information and could register it with the host kernel through other >means if available. Maybe the mapping should be registered with KVM if >that's how the EEH data is accessed. I'm not yet sold on why this >mapping is registered here. Thanks, > Yes, the EEH firmware calls needn't go through VFIO. However, EEH has a very close relationship with PCI, and so does VFIO-PCI. Eventually, EEH has a close relationship with VFIO-PCI :-) Thanks, Gavin ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-19 3:24 ` Gavin Shan @ 2013-03-19 4:18 ` Alex Williamson 2013-03-19 4:45 ` Benjamin Herrenschmidt 0 siblings, 1 reply; 20+ messages in thread From: Alex Williamson @ 2013-03-19 4:18 UTC (permalink / raw) To: Gavin Shan; +Cc: aik, linuxppc-dev, kvm On Tue, 2013-03-19 at 11:24 +0800, Gavin Shan wrote: > On Mon, Mar 18, 2013 at 03:01:14PM -0600, Alex Williamson wrote: > >On Sat, 2013-03-16 at 06:37 +0100, Benjamin Herrenschmidt wrote: > >> On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote: > >> > >Could you explain further how this will be used? How the device is > >> > >exposed to a guest is entirely a userspace construct, so why does vfio > >> > >need to know or care about this? I had assumed for AER that QEMU would > >> > >do the translation from host to guest address space. > >> > > > >> > > >> > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous > >> > patch. The PowerNV platform is going to override it to figure out the > >> > information for EEH core to use. On the other hand, QEMU will runs into > >> > the IOCTL command while opening (creating) one VFIO device. > >> > > >> > Though I'm not familiar with AER very much. AER is quite different from > >> > EEH. The EEH functionality implemented in PHB instead of in PCI device > >> > core. So we don't care AER stuff in EEH directly :-) > >> > >> To give Alex a bit more background... > >> > >> EEH is our IBM specific error handling facility which is a superset of AER. > >> > >> IE. In addition to AER's error detection and logging, it adds a layer of > >> error detection at the host bridge level (such as iommu violations etc...) > >> and a mechanism for handling and recovering from errors. This is tied to > >> our iommu domain stuff (our PE's) and our device "freezing" capability > >> among others. > >> > >> With VFIO + KVM, we want to implement most of the EEH support for guests in > >> the host kernel. The reason is multipart and we can discuss this separately > >> as some of it might well be debatable (mostly it's more convenient that way > >> because we hook into the underlying HW/FW EEH which isn't directly userspace > >> accessible so we don't have to add a new layer of kernel -> user API in > >> addition to the VFIO stuff), but there's at least one aspect of it that drives > >> this requirement more strongly which is performance: > >> > >> When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do > >> a firmware call to query the EEH state of the device and check whether it > >> has been frozen. On some devices, that can be a performance issue, and > >> going all the way to qemu for that would be horribly expensive. > >> > >> So we want at least a way to handle that call in the kernel and for that we > >> need at least some way of mapping things there. > > > >There's no notification mechanism when a PHB is frozen? I suppose > >notification would be asynchronous so you risk data for every read that > >happens in the interim. So the choices are a) tell the host kernel the > >mapping, b) tell the guest kernel the mapping, c) identity mapping, or > >d) qemu intercept? > > > > We do have dedicated interrupts on detecting frozen PHB on host side. > However, the guest has to poll/check the frozen state (frozen PE) during > access to config or MMIO space. 
Can you make use of something like this to notify the guest: https://github.com/awilliam/linux-vfio/commit/dad9f8972e04cd081a028d8fb1249d746d97fc03 As a first step this only notifies QEMU, but the plan is to forward that on to the guest. If we can leverage similar interfaces between AER and EEH, I'd obviously like to do that. > For the recommended methods, (a) is what > we want to do with the patchset. (b) seems infeasible since the guest > shouldn't be aware of hypervisor (e.g. KVM or PowerVM) it's running on > top of, it's hard to polish the guest to do it. (d) sounds applicable > since the QEMU should know the address (BDF) of host and guest devices. > However, we still need let the host EEH core know that which PCI device > has been passed to guest and the best place to do that would be when opening > the corresponding VFIO PCI device. In turn, it will still need weak function > for ppc platform to override it. Why we not directly take (a) to finish > everything in one VFIO IOCTL command? Because it seems like VFIO is just being used as a relay and has no purpose knowing this information on it's own. It's just a convenient place to host the ioctl, but that alone is not a good enough reason to put it there. > Sorry, Alex. I didn't understand (c) well :-) (c) is if the buid/bus/slot/func exposed to the guest matches the same for the host, then there's no need for mapping translation. > >Presumably your firmware call to query the EEH is not going through > >VFIO, so is VFIO the appropriate place to setup this mapping? As you > >say, this seems like just a convenient place to put it even though it > >really has nothing to do with the VFIO kernel component. QEMU has this > >information and could register it with the host kernel through other > >means if available. Maybe the mapping should be registered with KVM if > >that's how the EEH data is accessed. I'm not yet sold on why this > >mapping is registered here. Thanks, > > > > Yes, EEH firmware call needn't going through VFIO. However, EEH has > very close relationship with PCI and so VFIO-PCI does. Eventually, EEH > has close relationship with VFIO-PCI :-) Is there some plan to do more with EEH through VFIO in the future or are we just talking about some kind of weird associative property to sell this ioctl? Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-19 4:18 ` Alex Williamson @ 2013-03-19 4:45 ` Benjamin Herrenschmidt 2013-03-20 18:48 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Benjamin Herrenschmidt @ 2013-03-19 4:45 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Mon, 2013-03-18 at 22:18 -0600, Alex Williamson wrote: > > Yes, EEH firmware call needn't going through VFIO. However, EEH has > > very close relationship with PCI and so VFIO-PCI does. Eventually, EEH > > has close relationship with VFIO-PCI :-) > > Is there some plan to do more with EEH through VFIO in the future or are > we just talking about some kind of weird associative property to sell > this ioctl? Thanks, Well, I'm not sure how 'weird' that is but it makes sense to me... VFIO is the mechanism that virtualizes access to a PCI device and provides interfaces to qemu & kvm to access it &| map it. Or rather VFIO-PCI is. At a basic level it provides ... the basic PCI interfaces, ie, config space access (with or without filtering), interrupts, etc... In our environment, EEH is just another functionality of PCI really. The firmware calls used by the guest to do that fall into more or less the same category as the ones used for PCI config space accesses, manipulation of DMA windows, etc... Similar to host (though guest and host use a different FW interface for various reasons). So it's very natural to "transport" these via VFIO-PCI like everything else, I don't see a more natural place to put the ioctl's we need for qemu to be able to access the EEH state, trigger EEH (un)isolation, resets, etc... Fundamentally, the design should be for VFIO-PCI to provide some specific ioctls for EEH that userspace programs such as qemu can use, and then re-expose those APIs to the guest. In addition, to do some of it in the kernel for performance reason, we want to establish that mapping, but I see that as a VFIO "accelerator". IE. Whatever is going to respond to the EEH calls from the guest in-kernel will have to share state with the rest of the EEH stuff provided to qemu by vfio-pci. Ben. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-19 4:45 ` Benjamin Herrenschmidt @ 2013-03-20 18:48 ` Alex Williamson 2013-03-20 19:31 ` Benjamin Herrenschmidt 0 siblings, 1 reply; 20+ messages in thread From: Alex Williamson @ 2013-03-20 18:48 UTC (permalink / raw) To: Benjamin Herrenschmidt; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Tue, 2013-03-19 at 05:45 +0100, Benjamin Herrenschmidt wrote: > On Mon, 2013-03-18 at 22:18 -0600, Alex Williamson wrote: > > > Yes, EEH firmware call needn't going through VFIO. However, EEH has > > > very close relationship with PCI and so VFIO-PCI does. Eventually, EEH > > > has close relationship with VFIO-PCI :-) > > > > Is there some plan to do more with EEH through VFIO in the future or are > > we just talking about some kind of weird associative property to sell > > this ioctl? Thanks, > > Well, I'm not sure how 'weird' that is but it makes sense to me... VFIO > is the mechanism that virtualizes access to a PCI device and provides > interfaces to qemu & kvm to access it &| map it. > > Or rather VFIO-PCI is. > > At a basic level it provides ... the basic PCI interfaces, ie, config > space access (with or without filtering), interrupts, etc... > > In our environment, EEH is just another functionality of PCI really. > The firmware calls used by the guest to do that fall into more or less > the same category as the ones used for PCI config space accesses, > manipulation of DMA windows, etc... Similar to host (though guest > and host use a different FW interface for various reasons). > > So it's very natural to "transport" these via VFIO-PCI like everything > else, I don't see a more natural place to put the ioctl's we need for > qemu to be able to access the EEH state, trigger EEH (un)isolation, > resets, etc... > > Fundamentally, the design should be for VFIO-PCI to provide some specific > ioctls for EEH that userspace programs such as qemu can use, and then > re-expose those APIs to the guest. > > In addition, to do some of it in the kernel for performance reason, we > want to establish that mapping, but I see that as a VFIO "accelerator". > > IE. Whatever is going to respond to the EEH calls from the guest in-kernel > will have to share state with the rest of the EEH stuff provided to qemu > by vfio-pci. Perhaps my problem is that I don't have a clear picture of where you're going with this like I do for AER. For AER we're starting with notification of an error, from that we build into how to retrieve the error information, and finally how to perform corrective action. Each of these will be done through vifo-pci. Here we're starting by registering a mapping that's really only useful to the vfio "accelerator" path, but we don't even have a hint of what the non-accelerated path is and how vfio is involved with it. Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-20 18:48 ` Alex Williamson @ 2013-03-20 19:31 ` Benjamin Herrenschmidt 2013-03-20 19:46 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Benjamin Herrenschmidt @ 2013-03-20 19:31 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote: > Perhaps my problem is that I don't have a clear picture of where > you're > going with this like I do for AER. For AER we're starting with > notification of an error, from that we build into how to retrieve the > error information, and finally how to perform corrective action. Each > of these will be done through vifo-pci. > > Here we're starting by registering a mapping that's really only useful > to the vfio "accelerator" path, but we don't even have a hint of what > the non-accelerated path is and how vfio is involved with it. Thanks, I'm surprised that you are building so much policy around AER ... can't you just pass the raw stuff down to the guest and let the guest do it's own corrective actions ? As for EEH, I will let Gavin describe in more details what he is doing, though I wouldn't be surprised if so far he doesn't have a non-accelerated path :-) Which indeed makes things oddball, granted ... at least for now. I *think* what Gavin's doing right now is a pass-through to the host EEH directly in the kernel, so without a slow path... Gavin, it really boils down to that. In-kernel EEH for guests is a KVMism that ends up not involving VFIO in any other way than establishing the mapping, then arguably it could be done via a VM ioctl. If there's more going through VFIO and shared state, then it should probably go through VFIO-PCI. Note (FYI) that EEH somewhat encompass AER... the EEH logic triggers on AER errors as well and the error reports generated by the firmware contain the AER register dump in addition to the bridge internal stuff. IE. EEH encompass pretty much all sort of errors, correctable or not, that happens on PCI. It adds a mechanism of "isolation" of domains on first error (involving blocking MMIOs, DMAs and MSIs) which helps with preventing propagation of bad data, and various recovery schemes. Cheers, Ben. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-20 19:31 ` Benjamin Herrenschmidt @ 2013-03-20 19:46 ` Alex Williamson 2013-03-21 2:09 ` Gavin Shan 0 siblings, 1 reply; 20+ messages in thread From: Alex Williamson @ 2013-03-20 19:46 UTC (permalink / raw) To: Benjamin Herrenschmidt; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote: > On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote: > > Perhaps my problem is that I don't have a clear picture of where > > you're > > going with this like I do for AER. For AER we're starting with > > notification of an error, from that we build into how to retrieve the > > error information, and finally how to perform corrective action. Each > > of these will be done through vifo-pci. > > > > Here we're starting by registering a mapping that's really only useful > > to the vfio "accelerator" path, but we don't even have a hint of what > > the non-accelerated path is and how vfio is involved with it. Thanks, > > I'm surprised that you are building so much policy around AER ... can't > you just pass the raw stuff down to the guest and let the guest do it's > own corrective actions ? How does the guest get the raw stuff? We need to get the AER interrupt out to the guest so it can be injected into the virtual PCIe port, then we need to be able to retrieve the physical device log and pass it to the qemu to mangle to match the guest topology. We don't have existing firmware interfaces for the guest to do that, so it's all being routed through vfio-pci. > As for EEH, I will let Gavin describe in more details what he is doing, > though I wouldn't be surprised if so far he doesn't have a > non-accelerated path :-) Which indeed makes things oddball, granted ... > at least for now. I *think* what Gavin's doing right now is a > pass-through to the host EEH directly in the kernel, so without a slow > path... > > Gavin, it really boils down to that. In-kernel EEH for guests is a > KVMism that ends up not involving VFIO in any other way than > establishing the mapping, then arguably it could be done via a VM ioctl. > > If there's more going through VFIO and shared state, then it should > probably go through VFIO-PCI. Exactly my thinking. Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command 2013-03-20 19:46 ` Alex Williamson @ 2013-03-21 2:09 ` Gavin Shan 0 siblings, 0 replies; 20+ messages in thread From: Gavin Shan @ 2013-03-21 2:09 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Wed, Mar 20, 2013 at 01:46:22PM -0600, Alex Williamson wrote: >On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote: >> On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote: .../... >> As for EEH, I will let Gavin describe in more details what he is doing, >> though I wouldn't be surprised if so far he doesn't have a >> non-accelerated path :-) Which indeed makes things oddball, granted ... >> at least for now. I *think* what Gavin's doing right now is a >> pass-through to the host EEH directly in the kernel, so without a slow >> path... >> Yes, I don't have a non-accelerated path. I'm trying to describe what I'm doing: On the host side, an interrupt is triggered on detecting a frozen PE that has been passed to the guest. We don't send an EEH event to the EEH core on the host side; instead we wait for the guest to get involved and handle the EEH error. In the guest, any access to config space or MMIO of the frozen PE triggers an EEH event, and in turn the guest uses the existing RTAS calls (exactly the same as the pSeries-on-PHYP case) to recover from the error. Those RTAS calls are emulated in the host kernel. Part of the RTAS call arguments is the PCI domain/bus/slot/function viewed from the guest's perspective, which differs from the address of the same physical PCI device on the host side. So I used VFIO-PCI to set up the mapping and maintain the information in the host kernel. >> Gavin, it really boils down to that. In-kernel EEH for guests is a >> KVMism that ends up not involving VFIO in any other way than >> establishing the mapping, then arguably it could be done via a VM ioctl. >> >> If there's more going through VFIO and shared state, then it should >> probably go through VFIO-PCI. > Ben, you're right. I use VFIO for nothing other than the address mapping, so I will do the address mapping with a VM ioctl instead of in VFIO-PCI. Thanks, Gavin ^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-15 7:26 [PATCH 0/3] VFIO change for EEH support Gavin Shan 2013-03-15 7:26 ` [PATCH 1/3] VFIO: Architecture dependent VFIO device operations Gavin Shan 2013-03-15 7:26 ` [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command Gavin Shan @ 2013-03-15 7:26 ` Gavin Shan 2013-03-15 19:41 ` Alex Williamson 2013-03-21 0:58 ` Alex Williamson 2 siblings, 2 replies; 20+ messages in thread From: Gavin Shan @ 2013-03-15 7:26 UTC (permalink / raw) To: kvm, linuxppc-dev; +Cc: aik, alex.williamson, Gavin Shan The config registers in [0, 0x40] are supported by VFIO. Apart from that, the other config registers should be covered by a PCI or PCIe capability. However, there are some PCI devices (e.g. be2net) that have config registers (0x7c) outside [0, 0x40] which are not covered by any PCI or PCIe capability. VFIO returns 0x0 when those registers are read, and writes to them are dropped. That causes the be2net driver to fail to load, because 0x0 is returned from its config register 0x7c. The patch changes the behaviour so that config registers outside [0, 0x40] that are not covered by any PCI or PCIe capability are accessed directly. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> --- drivers/vfio/pci/vfio_pci_config.c | 31 ++++++++++++++++++++----------- 1 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 964ff22..5ea3afb 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, cap_id = vdev->pci_config_map[*ppos / 4]; + /* + * Some PCI device config registers might not be coverred by + * capability and useful. We will enable direct access to + * those registers. + */ if (cap_id == PCI_CAP_ID_INVALID) { - if (iswrite) - return ret; /* drop */ - - /* - * Per PCI spec 3.0, section 6.1, reads from reserved and - * unimplemented registers return 0 - */ - if (copy_to_user(buf, &val, count)) - return -EFAULT; - - return ret; + if (iswrite) { + if (copy_from_user(&val, buf, count)) + return -EFAULT; + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos), + val, count); + return ret ? ret : count; + } else { + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos), + &val, count); + if (ret) + return ret; + if (copy_to_user(buf, &val, count)) + return -EFAULT; + return count; + } } /* -- 1.7.5.4 ^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-15 7:26 ` [PATCH 3/3] VFIO: Direct access config reg without capability Gavin Shan @ 2013-03-15 19:41 ` Alex Williamson 2013-03-16 3:34 ` Gavin Shan 2013-03-16 5:30 ` Benjamin Herrenschmidt 2013-03-21 0:58 ` Alex Williamson 1 sibling, 2 replies; 20+ messages in thread From: Alex Williamson @ 2013-03-15 19:41 UTC (permalink / raw) To: Gavin Shan; +Cc: aik, linuxppc-dev, kvm On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote: > The config registers in [0, 0x40] is being supported by VFIO. Apart > from that, the other config registers should be coverred by PCI or > PCIe capability. However, there might have some PCI devices (be2net) > who has config registers (0x7c) out of [0, 0x40], and don't have > corresponding PCI or PCIe capability. VFIO will return 0x0 on reading > those registers and writing is dropped. It caused the be2net driver > fails to be loaded because 0x0 returned from its config register 0x7c. > > The patch changes the behaviour so that those config registers out > of [0, 0x40] and don't have corresponding PCI or PCIe capability > will be accessed directly. This basically gives userspace free access to any regions that aren't covered by known capabilities. We have no idea what this might expose on some devices. I'd like to support be2net, but what's the minimal access that it needs? Can we provide 2 or 4 bytes of read-only access at offset 0x7c for just that device? Is it always 0x7c? Let's split this patch from the series since it's clearly dealing with something independent. Thanks, Alex > Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> > --- > drivers/vfio/pci/vfio_pci_config.c | 31 ++++++++++++++++++++----------- > 1 files changed, 20 insertions(+), 11 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > index 964ff22..5ea3afb 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > > cap_id = vdev->pci_config_map[*ppos / 4]; > > + /* > + * Some PCI device config registers might not be coverred by > + * capability and useful. We will enable direct access to > + * those registers. > + */ > if (cap_id == PCI_CAP_ID_INVALID) { > - if (iswrite) > - return ret; /* drop */ > - > - /* > - * Per PCI spec 3.0, section 6.1, reads from reserved and > - * unimplemented registers return 0 > - */ > - if (copy_to_user(buf, &val, count)) > - return -EFAULT; > - > - return ret; > + if (iswrite) { > + if (copy_from_user(&val, buf, count)) > + return -EFAULT; > + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos), > + val, count); > + return ret ? ret : count; > + } else { > + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos), > + &val, count); > + if (ret) > + return ret; > + if (copy_to_user(buf, &val, count)) > + return -EFAULT; > + return count; > + } > } > > /* ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-15 19:41 ` Alex Williamson @ 2013-03-16 3:34 ` Gavin Shan 2013-03-16 5:30 ` Benjamin Herrenschmidt 1 sibling, 0 replies; 20+ messages in thread From: Gavin Shan @ 2013-03-16 3:34 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Fri, Mar 15, 2013 at 01:41:08PM -0600, Alex Williamson wrote: >On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote: >> The config registers in [0, 0x40] is being supported by VFIO. Apart >> from that, the other config registers should be coverred by PCI or >> PCIe capability. However, there might have some PCI devices (be2net) >> who has config registers (0x7c) out of [0, 0x40], and don't have >> corresponding PCI or PCIe capability. VFIO will return 0x0 on reading >> those registers and writing is dropped. It caused the be2net driver >> fails to be loaded because 0x0 returned from its config register 0x7c. >> >> The patch changes the behaviour so that those config registers out >> of [0, 0x40] and don't have corresponding PCI or PCIe capability >> will be accessed directly. > >This basically gives userspace free access to any regions that aren't >covered by known capabilities. We have no idea what this might expose >on some devices. I'd like to support be2net, but what's the minimal >access that it needs? Can we provide 2 or 4 bytes of read-only access >at offset 0x7c for just that device? Is it always 0x7c? Let's split >this patch from the series since it's clearly dealing with something >independent. Thanks, > 0x7c is just one example. Actually, benet driver also need access other uncoverred config registers like 0x58/0xf0/0xfc (by capabilities) in orde to make the device work well. All of those uncoverred config registers are really business of specific device itself. I think we might not bother their accessing attributes. So exporting those uncoverred registers to user space might be the reasonable choice. If we really want to control the accessing attributes for those uncoverred registers, we might introduce some mechanism to check the vendor/device ID and read/write to the uncoverred registers according the specified bits. All of that requires fully understanding the usage of those uncoverred registers. Yes, I will split this one from the patchset. Thanks, Gavin >> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> >> --- >> drivers/vfio/pci/vfio_pci_config.c | 31 ++++++++++++++++++++----------- >> 1 files changed, 20 insertions(+), 11 deletions(-) >> >> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c >> index 964ff22..5ea3afb 100644 >> --- a/drivers/vfio/pci/vfio_pci_config.c >> +++ b/drivers/vfio/pci/vfio_pci_config.c >> @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, >> >> cap_id = vdev->pci_config_map[*ppos / 4]; >> >> + /* >> + * Some PCI device config registers might not be coverred by >> + * capability and useful. We will enable direct access to >> + * those registers. >> + */ >> if (cap_id == PCI_CAP_ID_INVALID) { >> - if (iswrite) >> - return ret; /* drop */ >> - >> - /* >> - * Per PCI spec 3.0, section 6.1, reads from reserved and >> - * unimplemented registers return 0 >> - */ >> - if (copy_to_user(buf, &val, count)) >> - return -EFAULT; >> - >> - return ret; >> + if (iswrite) { >> + if (copy_from_user(&val, buf, count)) >> + return -EFAULT; >> + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos), >> + val, count); >> + return ret ? 
ret : count; >> + } else { >> + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos), >> + &val, count); >> + if (ret) >> + return ret; >> + if (copy_to_user(buf, &val, count)) >> + return -EFAULT; >> + return count; >> + } >> } >> >> /* > > > ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-15 19:41 ` Alex Williamson 2013-03-16 3:34 ` Gavin Shan @ 2013-03-16 5:30 ` Benjamin Herrenschmidt 2013-03-18 21:15 ` Alex Williamson 1 sibling, 1 reply; 20+ messages in thread From: Benjamin Herrenschmidt @ 2013-03-16 5:30 UTC (permalink / raw) To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Fri, 2013-03-15 at 13:41 -0600, Alex Williamson wrote: > > This basically gives userspace free access to any regions that aren't > covered by known capabilities. And ? I mean seriously :-) We already had that discussion ... trying to "protect" config space is just plain doomed. There is no point. It makes sense to do things like emulate BARs etc... for things to function properly under some circumstances/setups where you can't just expose the original BAR values to the guest and have the HW take care of it but you *must* be prepared to deal with anything in config space being changed without you knowing about it. Devices *will* have backdoors via MMIO. Period. You cannot rely on those not existing, whether they are documented or not. If you can't cope with the config space accesses then you aren't properly isolated. It can be deemed acceptable (depends what you use your VMs for) but that I mean is that any config space filtering/emulation for the sake of "security" is ... pointless. Doing it for functionality to work at all (ie BAR emulation) is fine, but that's about it. IE. As a mean of security it's pointless. > We have no idea what this might expose on some devices. No more than we have any idea what MMIO mapping of the device register space exposes :-) > I'd like to support be2net, but what's the minimal > access that it needs? Can we provide 2 or 4 bytes of read-only access > at offset 0x7c for just that device? Is it always 0x7c? Let's split > this patch from the series since it's clearly dealing with something > independent. Thanks, Ben. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-16 5:30 ` Benjamin Herrenschmidt @ 2013-03-18 21:15 ` Alex Williamson 0 siblings, 0 replies; 20+ messages in thread From: Alex Williamson @ 2013-03-18 21:15 UTC (permalink / raw) To: Benjamin Herrenschmidt; +Cc: aik, linuxppc-dev, Gavin Shan, kvm On Sat, 2013-03-16 at 06:30 +0100, Benjamin Herrenschmidt wrote: > On Fri, 2013-03-15 at 13:41 -0600, Alex Williamson wrote: > > > > This basically gives userspace free access to any regions that aren't > > covered by known capabilities. > > And ? > > I mean seriously :-) We already had that discussion ... trying to > "protect" config space is just plain doomed. There is no point. > > It makes sense to do things like emulate BARs etc... for things to > function properly under some circumstances/setups where you can't just > expose the original BAR values to the guest and have the HW take care of > it but you *must* be prepared to deal with anything in config space > being changed without you knowing about it. > > Devices *will* have backdoors via MMIO. Period. You cannot rely on those > not existing, whether they are documented or not. > > If you can't cope with the config space accesses then you aren't > properly isolated. It can be deemed acceptable (depends what you use > your VMs for) but that I mean is that any config space > filtering/emulation for the sake of "security" is ... pointless. > > Doing it for functionality to work at all (ie BAR emulation) is fine, > but that's about it. IE. As a mean of security it's pointless. > > > > We have no idea what this might expose on some devices. > > No more than we have any idea what MMIO mapping of the device register > space exposes :-) Yeah, yeah. Ok, I can't come up with a reasonable argument otherwise, it'll give us better device support, and I believe pci-assign has always done this. I'll take another look at the patch. Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/3] VFIO: Direct access config reg without capability 2013-03-15 7:26 ` [PATCH 3/3] VFIO: Direct access config reg without capability Gavin Shan 2013-03-15 19:41 ` Alex Williamson @ 2013-03-21 0:58 ` Alex Williamson 1 sibling, 0 replies; 20+ messages in thread From: Alex Williamson @ 2013-03-21 0:58 UTC (permalink / raw) To: Gavin Shan; +Cc: aik, linuxppc-dev, kvm On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote: > The config registers in [0, 0x40] is being supported by VFIO. Apart > from that, the other config registers should be coverred by PCI or > PCIe capability. However, there might have some PCI devices (be2net) > who has config registers (0x7c) out of [0, 0x40], and don't have > corresponding PCI or PCIe capability. VFIO will return 0x0 on reading > those registers and writing is dropped. It caused the be2net driver > fails to be loaded because 0x0 returned from its config register 0x7c. > > The patch changes the behaviour so that those config registers out > of [0, 0x40] and don't have corresponding PCI or PCIe capability > will be accessed directly. > > Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> > --- Hi Gavin, I'm onboard with making this change now, but this patch isn't sufficient. The config space map uses a byte per dword to index the capability since both standard and extended capabilities are dword aligned. We currently have a bug that this patch exposes that we round the length down, ex. a 14 byte MSI capability becomes 12 bytes leaving the message data now exposed and writable with this patch. That bug can be fixed by aligning the length so the capability fills the dword, but notice that 0x7c on the be2net is filling one of these gaps. So fixing that bug attaches that gap to the previous capability instead of allowing direct access. So, before we can make this change we need to fix the config map to have byte granularity. Thanks, Alex > drivers/vfio/pci/vfio_pci_config.c | 31 ++++++++++++++++++++----------- > 1 files changed, 20 insertions(+), 11 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > index 964ff22..5ea3afb 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > > cap_id = vdev->pci_config_map[*ppos / 4]; > > + /* > + * Some PCI device config registers might not be coverred by > + * capability and useful. We will enable direct access to > + * those registers. > + */ > if (cap_id == PCI_CAP_ID_INVALID) { > - if (iswrite) > - return ret; /* drop */ > - > - /* > - * Per PCI spec 3.0, section 6.1, reads from reserved and > - * unimplemented registers return 0 > - */ > - if (copy_to_user(buf, &val, count)) > - return -EFAULT; > - > - return ret; > + if (iswrite) { > + if (copy_from_user(&val, buf, count)) > + return -EFAULT; > + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos), > + val, count); > + return ret ? ret : count; > + } else { > + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos), > + &val, count); > + if (ret) > + return ret; > + if (copy_to_user(buf, &val, count)) > + return -EFAULT; > + return count; > + } > } > > /* ^ permalink raw reply [flat|nested] 20+ messages in thread
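To illustrate the granularity problem Alex describes: the config map stores one capability ID per dword of config space, so a capability whose length is not a multiple of 4 either leaks its tail bytes or swallows the following gap, depending on how the length is rounded. A rough sketch of the current indexing, with the MSI case spelled out (the 0x50 offset is illustrative):

	/* One byte of map per dword of config space */
	cap_id = vdev->pci_config_map[*ppos / 4];

	/*
	 * Example: a 14-byte (64-bit) MSI capability at 0x50 covers
	 * 0x50-0x5d.  Rounding its length down to 12 maps only 0x50-0x5b
	 * to PCI_CAP_ID_MSI and leaves the dword holding the message data
	 * (0x5c-0x5d) looking like "no capability", which patch 3/3 would
	 * then expose for direct read/write.  Rounding up to 16 instead
	 * attaches the 0x5e-0x5f gap to MSI, hiding registers like the
	 * be2net 0x7c case when they fall in such a gap.  A byte-granular
	 * map, indexed as pci_config_map[*ppos], avoids both problems.
	 */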