[Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
@ 2016-06-20 22:04 Alex Williamson
  2016-06-20 22:23 ` Eric Blake
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Alex Williamson @ 2016-06-20 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, zhoujie2011, alex.williamson, lersek

The kernel currently exposes the SR-IOV capability as read-only
through vfio-pci.  This is sufficient to protect the host kernel, but
has the potential to confuse guests without further virtualization.
In particular, OVMF tries to size the VF BARs and comes up with absurd
results, ending with an assert.  There's not much point in adding
virtualization to a read-only capability, so we simply hide it for
now.  If the kernel ever enables SR-IOV virtualization, we should
easily be able to test it through VF BAR sizing or explicit flags.

Testing whether we should parse extended capabilities is also pulled
into the function to keep these assumptions in one place.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

This depends on Chen Fan's patch "vfio: add pcie extended capability
support", which I'll pull from Zhou Jie's latest series unless there
are comments to the contrary.  Otherwise based on Stefan's tracing
pull request so as not to conflict.

 hw/vfio/pci.c        |   49 +++++++++++++++++++++++++++++++++++++++----------
 hw/vfio/trace-events |    1 +
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a171056b..36d5e00 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
     uint8_t cap_ver;
     uint8_t *config;
 
+    /* Only add extended caps if we have them and the guest can see them */
+    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
+        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+        return 0;
+    }
+
     /*
      * pcie_add_capability always inserts the new capability at the tail
      * of the chain.  Therefore to end up with a chain that matches the
@@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
      */
     config = g_memdup(pdev->config, vdev->config_size);
 
+    /*
+     * Extended capabilities are chained with each pointing to the next, so we
+     * can drop anything other than the head of the chain simply by modifying
+     * the previous next pointer.  For the head of the chain, we can modify the
+     * capability ID to something that cannot match a valid capability.  ID
+     * 0 is reserved for this since absence of capabilities is indicated by
+     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
+     * uses ID 0 as reserved for list management and will incorrectly match and
+     * assert if we attempt to pre-load the head of the chain with with this
+     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
+     * part for identifying abscense of capabilities in a root complex register
+     * block.  If the ID still exists after adding capabilities, switch back to
+     * zero.  We'll mark this entire first dword as emulated for this purpose.
+     */
+    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
+                 PCI_EXT_CAP(0xFFFF, 0, 0));
+    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
+    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
+
     for (next = PCI_CONFIG_SPACE_SIZE; next;
          next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
         header = pci_get_long(config + next);
@@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
          */
         size = vfio_ext_cap_max_size(config, next);
 
-        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
-        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
-
         /* Use emulated next pointer to allow dropping extended caps */
         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                    PCI_EXT_CAP_NEXT_MASK);
+
+        switch (cap_id) {
+        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuses OVMF */
+            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
+            break;
+        default:
+            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+        }
+
+    }
+
+    /* Cleanup chain head ID if necessary */
+    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
+        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
     }
 
     g_free(config);
@@ -1821,13 +1857,6 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev)
         return ret;
     }
 
-    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
-    if (!pci_is_express(pdev) ||
-        !pci_bus_is_express(pdev->bus) ||
-        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
-        return 0;
-    }
-
     return vfio_add_ext_cap(vdev);
 }
 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 9da0ff9..a768fb5 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
 vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
 vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
 vfio_initfn(const char *name, int group_id) " (%s) group %d"
+vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x"
 vfio_pci_reset(const char *name) " (%s)"
 vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
 vfio_pci_reset_pm(const char *name) "%s PCI PM Reset"

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-20 22:04 [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability Alex Williamson
@ 2016-06-20 22:23 ` Eric Blake
  2016-06-20 22:31   ` Alex Williamson
  2016-06-21  0:15 ` Laszlo Ersek
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 7+ messages in thread
From: Eric Blake @ 2016-06-20 22:23 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: chen.fan.fnst, lersek, zhoujie2011

[-- Attachment #1: Type: text/plain, Size: 1783 bytes --]

On 06/20/2016 04:04 PM, Alex Williamson wrote:
> The kernel currently exposes the SR-IOV capability as read-only
> through vfio-pci.  This is sufficient to protect the host kernel, but
> has the potential to confuse guests without further virtualization.
> In particular, OVMF tries to size the VF BARs and comes up with absurd
> results, ending with an assert.  There's not much point in adding
> virtualization to a read-only capability, so we simply hide it for
> now.  If the kernel ever enables SR-IOV virtualization, we should
> easily be able to test it through VF BAR sizing or explicit flags.
> 
> Testing whether we should parse extended capabilities is also pulled
> into the function to keep these assumptions in one place.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---

> +     * Extended capabilities are chained with each pointing to the next, so we
> +     * can drop anything other than the head of the chain simply by modifying
> +     * the previous next pointer.  For the head of the chain, we can modify the
> +     * capability ID to something that cannot match a valid capability.  ID
> +     * 0 is reserved for this since absence of capabilities is indicated by
> +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> +     * uses ID 0 as reserved for list management and will incorrectly match and
> +     * assert if we attempt to pre-load the head of the chain with with this
> +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> +     * part for identifying abscense of capabilities in a root complex register

s/abscense/absence/

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 604 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-20 22:23 ` Eric Blake
@ 2016-06-20 22:31   ` Alex Williamson
  0 siblings, 0 replies; 7+ messages in thread
From: Alex Williamson @ 2016-06-20 22:31 UTC (permalink / raw)
  To: Eric Blake; +Cc: qemu-devel, chen.fan.fnst, lersek, zhoujie2011

On Mon, 20 Jun 2016 16:23:07 -0600
Eric Blake <eblake@redhat.com> wrote:

> On 06/20/2016 04:04 PM, Alex Williamson wrote:
> > The kernel currently exposes the SR-IOV capability as read-only
> > through vfio-pci.  This is sufficient to protect the host kernel, but
> > has the potential to confuse guests without further virtualization.
> > In particular, OVMF tries to size the VF BARs and comes up with absurd
> > results, ending with an assert.  There's not much point in adding
> > virtualization to a read-only capability, so we simply hide it for
> > now.  If the kernel ever enables SR-IOV virtualization, we should
> > easily be able to test it through VF BAR sizing or explicit flags.
> > 
> > Testing whether we should parse extended capabilities is also pulled
> > into the function to keep these assumptions in one place.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---  
> 
> > +     * Extended capabilities are chained with each pointing to the next, so we
> > +     * can drop anything other than the head of the chain simply by modifying
> > +     * the previous next pointer.  For the head of the chain, we can modify the
> > +     * capability ID to something that cannot match a valid capability.  ID
> > +     * 0 is reserved for this since absence of capabilities is indicated by
> > +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> > +     * uses ID 0 as reserved for list management and will incorrectly match and
> > +     * assert if we attempt to pre-load the head of the chain with with this
> > +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> > +     * part for identifying abscense of capabilities in a root complex register  
> 
> s/abscense/absence/
> 

Thanks, updated:

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 36d5e00..2418b93 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1796,7 +1796,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
      * uses ID 0 as reserved for list management and will incorrectly match and
      * assert if we attempt to pre-load the head of the chain with with this
      * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
-     * part for identifying abscense of capabilities in a root complex register
+     * part for identifying absence of capabilities in a root complex register
      * block.  If the ID still exists after adding capabilities, switch back to
      * zero.  We'll mark this entire first dword as emulated for this purpose.
      */

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-20 22:04 [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability Alex Williamson
  2016-06-20 22:23 ` Eric Blake
@ 2016-06-21  0:15 ` Laszlo Ersek
  2016-06-21  3:54   ` Alex Williamson
  2016-06-28 13:10 ` Laszlo Ersek
  2016-06-28 20:05 ` Laszlo Ersek
  3 siblings, 1 reply; 7+ messages in thread
From: Laszlo Ersek @ 2016-06-21  0:15 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: chen.fan.fnst, zhoujie2011

On 06/21/16 00:04, Alex Williamson wrote:
> The kernel currently exposes the SR-IOV capability as read-only
> through vfio-pci.  This is sufficient to protect the host kernel, but
> has the potential to confuse guests without further virtualization.
> In particular, OVMF tries to size the VF BARs and comes up with absurd
> results, ending with an assert.  There's not much point in adding
> virtualization to a read-only capability, so we simply hide it for
> now.  If the kernel ever enables SR-IOV virtualization, we should
> easily be able to test it through VF BAR sizing or explicit flags.
> 
> Testing whether we should parse extended capabilities is also pulled
> into the function to keep these assumptions in one place.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> This depends on Chen Fan's patch "vfio: add pcie extended capability
> support", which I'll pull from Zhou Jie's latest series unless there
> are comments to the contrary.  Otherwise based on Stefan's tracing
> pull request so as not to conflict.
> 
>  hw/vfio/pci.c        |   49 +++++++++++++++++++++++++++++++++++++++----------
>  hw/vfio/trace-events |    1 +
>  2 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a171056b..36d5e00 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>      uint8_t cap_ver;
>      uint8_t *config;
>  
> +    /* Only add extended caps if we have them and the guest can see them */
> +    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
> +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> +        return 0;
> +    }
> +
>      /*
>       * pcie_add_capability always inserts the new capability at the tail
>       * of the chain.  Therefore to end up with a chain that matches the
> @@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>       */
>      config = g_memdup(pdev->config, vdev->config_size);
>  
> +    /*
> +     * Extended capabilities are chained with each pointing to the next, so we
> +     * can drop anything other than the head of the chain simply by modifying
> +     * the previous next pointer.  For the head of the chain, we can modify the
> +     * capability ID to something that cannot match a valid capability.  ID
> +     * 0 is reserved for this since absence of capabilities is indicated by
> +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> +     * uses ID 0 as reserved for list management and will incorrectly match and
> +     * assert if we attempt to pre-load the head of the chain with with this
> +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> +     * part for identifying abscense of capabilities in a root complex register
> +     * block.  If the ID still exists after adding capabilities, switch back to
> +     * zero.  We'll mark this entire first dword as emulated for this purpose.
> +     */
> +    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
> +                 PCI_EXT_CAP(0xFFFF, 0, 0));
> +    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
> +    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
> +
>      for (next = PCI_CONFIG_SPACE_SIZE; next;
>           next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
>          header = pci_get_long(config + next);
> @@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>           */
>          size = vfio_ext_cap_max_size(config, next);
>  
> -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> -        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> -
>          /* Use emulated next pointer to allow dropping extended caps */
>          pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>                                     PCI_EXT_CAP_NEXT_MASK);
> +
> +        switch (cap_id) {
> +        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuses OVMF */

I think s/confuses/confuse/.

Other than that, this is mostly black magic to me, so I can't even ACK
it with a straight face :)

I would like to test it, and report back, but then again, I don't have a
NIC with virtual functions. :/

Thank you!
Laszlo

> +            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
> +            break;
> +        default:
> +            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> +        }
> +
> +    }
> +
> +    /* Cleanup chain head ID if necessary */
> +    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> +        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>      }
>  
>      g_free(config);
> @@ -1821,13 +1857,6 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>          return ret;
>      }
>  
> -    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
> -    if (!pci_is_express(pdev) ||
> -        !pci_bus_is_express(pdev->bus) ||
> -        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> -        return 0;
> -    }
> -
>      return vfio_add_ext_cap(vdev);
>  }
>  
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 9da0ff9..a768fb5 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
>  vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
>  vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
>  vfio_initfn(const char *name, int group_id) " (%s) group %d"
> +vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x"
>  vfio_pci_reset(const char *name) " (%s)"
>  vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
>  vfio_pci_reset_pm(const char *name) "%s PCI PM Reset"
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-21  0:15 ` Laszlo Ersek
@ 2016-06-21  3:54   ` Alex Williamson
  0 siblings, 0 replies; 7+ messages in thread
From: Alex Williamson @ 2016-06-21  3:54 UTC (permalink / raw)
  To: Laszlo Ersek; +Cc: qemu-devel, chen.fan.fnst, zhoujie2011

On Tue, 21 Jun 2016 02:15:23 +0200
Laszlo Ersek <lersek@redhat.com> wrote:

> On 06/21/16 00:04, Alex Williamson wrote:
> > The kernel currently exposes the SR-IOV capability as read-only
> > through vfio-pci.  This is sufficient to protect the host kernel, but
> > has the potential to confuse guests without further virtualization.
> > In particular, OVMF tries to size the VF BARs and comes up with absurd
> > results, ending with an assert.  There's not much point in adding
> > virtualization to a read-only capability, so we simply hide it for
> > now.  If the kernel ever enables SR-IOV virtualization, we should
> > easily be able to test it through VF BAR sizing or explicit flags.
> > 
> > Testing whether we should parse extended capabilities is also pulled
> > into the function to keep these assumptions in one place.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> > This depends on Chen Fan's patch "vfio: add pcie extended capability
> > support", which I'll pull from Zhou Jie's latest series unless there
> > are comments to the contrary.  Otherwise based on Stefan's tracing
> > pull request so as not to conflict.
> > 
> >  hw/vfio/pci.c        |   49 +++++++++++++++++++++++++++++++++++++++----------
> >  hw/vfio/trace-events |    1 +
> >  2 files changed, 40 insertions(+), 10 deletions(-)
> > 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index a171056b..36d5e00 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
> >      uint8_t cap_ver;
> >      uint8_t *config;
> >  
> > +    /* Only add extended caps if we have them and the guest can see them */
> > +    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
> > +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> > +        return 0;
> > +    }
> > +
> >      /*
> >       * pcie_add_capability always inserts the new capability at the tail
> >       * of the chain.  Therefore to end up with a chain that matches the
> > @@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
> >       */
> >      config = g_memdup(pdev->config, vdev->config_size);
> >  
> > +    /*
> > +     * Extended capabilities are chained with each pointing to the next, so we
> > +     * can drop anything other than the head of the chain simply by modifying
> > +     * the previous next pointer.  For the head of the chain, we can modify the
> > +     * capability ID to something that cannot match a valid capability.  ID
> > +     * 0 is reserved for this since absence of capabilities is indicated by
> > +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> > +     * uses ID 0 as reserved for list management and will incorrectly match and
> > +     * assert if we attempt to pre-load the head of the chain with with this
> > +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> > +     * part for identifying abscense of capabilities in a root complex register
> > +     * block.  If the ID still exists after adding capabilities, switch back to
> > +     * zero.  We'll mark this entire first dword as emulated for this purpose.
> > +     */
> > +    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
> > +                 PCI_EXT_CAP(0xFFFF, 0, 0));
> > +    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
> > +    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
> > +
> >      for (next = PCI_CONFIG_SPACE_SIZE; next;
> >           next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
> >          header = pci_get_long(config + next);
> > @@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
> >           */
> >          size = vfio_ext_cap_max_size(config, next);
> >  
> > -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> > -        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> > -
> >          /* Use emulated next pointer to allow dropping extended caps */
> >          pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
> >                                     PCI_EXT_CAP_NEXT_MASK);
> > +
> > +        switch (cap_id) {
> > +        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuses OVMF */  
> 
> I think s/confuses/confuse/.

Thanks, fixed.

> Other than that, this is mostly black magic to me, so I can't even ACK
> it with a straight face :)
> 
> I would like to test it, and report back, but then again, I don't have a
> NIC with virtual functions. :/

Time to justify one for OVMF testing? ;)  Thanks,

Alex

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-20 22:04 [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability Alex Williamson
  2016-06-20 22:23 ` Eric Blake
  2016-06-21  0:15 ` Laszlo Ersek
@ 2016-06-28 13:10 ` Laszlo Ersek
  2016-06-28 20:05 ` Laszlo Ersek
  3 siblings, 0 replies; 7+ messages in thread
From: Laszlo Ersek @ 2016-06-28 13:10 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: chen.fan.fnst, zhoujie2011

On 06/21/16 00:04, Alex Williamson wrote:
> The kernel currently exposes the SR-IOV capability as read-only
> through vfio-pci.  This is sufficient to protect the host kernel, but
> has the potential to confuse guests without further virtualization.
> In particular, OVMF tries to size the VF BARs and comes up with absurd
> results, ending with an assert.  There's not much point in adding
> virtualization to a read-only capability, so we simply hide it for
> now.  If the kernel ever enables SR-IOV virtualization, we should
> easily be able to test it through VF BAR sizing or explicit flags.
> 
> Testing whether we should parse extended capabilities is also pulled
> into the function to keep these assumptions in one place.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> This depends on Chen Fan's patch "vfio: add pcie extended capability
> support", which I'll pull from Zhou Jie's latest series unless there
> are comments to the contrary.  Otherwise based on Stefan's tracing
> pull request so as not to conflict.
> 
>  hw/vfio/pci.c        |   49 +++++++++++++++++++++++++++++++++++++++----------
>  hw/vfio/trace-events |    1 +
>  2 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a171056b..36d5e00 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>      uint8_t cap_ver;
>      uint8_t *config;
>  
> +    /* Only add extended caps if we have them and the guest can see them */
> +    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
> +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> +        return 0;
> +    }
> +
>      /*
>       * pcie_add_capability always inserts the new capability at the tail
>       * of the chain.  Therefore to end up with a chain that matches the
> @@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>       */
>      config = g_memdup(pdev->config, vdev->config_size);
>  
> +    /*
> +     * Extended capabilities are chained with each pointing to the next, so we
> +     * can drop anything other than the head of the chain simply by modifying
> +     * the previous next pointer.  For the head of the chain, we can modify the
> +     * capability ID to something that cannot match a valid capability.  ID
> +     * 0 is reserved for this since absence of capabilities is indicated by
> +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> +     * uses ID 0 as reserved for list management and will incorrectly match and
> +     * assert if we attempt to pre-load the head of the chain with with this
> +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> +     * part for identifying abscense of capabilities in a root complex register
> +     * block.  If the ID still exists after adding capabilities, switch back to
> +     * zero.  We'll mark this entire first dword as emulated for this purpose.
> +     */
> +    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
> +                 PCI_EXT_CAP(0xFFFF, 0, 0));
> +    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
> +    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
> +
>      for (next = PCI_CONFIG_SPACE_SIZE; next;
>           next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
>          header = pci_get_long(config + next);
> @@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>           */
>          size = vfio_ext_cap_max_size(config, next);
>  
> -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> -        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> -
>          /* Use emulated next pointer to allow dropping extended caps */
>          pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>                                     PCI_EXT_CAP_NEXT_MASK);
> +
> +        switch (cap_id) {
> +        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuses OVMF */
> +            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
> +            break;
> +        default:
> +            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> +        }
> +
> +    }
> +
> +    /* Cleanup chain head ID if necessary */
> +    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> +        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>      }
>  
>      g_free(config);
> @@ -1821,13 +1857,6 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>          return ret;
>      }
>  
> -    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
> -    if (!pci_is_express(pdev) ||
> -        !pci_bus_is_express(pdev->bus) ||
> -        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> -        return 0;
> -    }
> -
>      return vfio_add_ext_cap(vdev);
>  }
>  
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 9da0ff9..a768fb5 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
>  vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
>  vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
>  vfio_initfn(const char *name, int group_id) " (%s) group %d"
> +vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x"
>  vfio_pci_reset(const char *name) " (%s)"
>  vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
>  vfio_pci_reset_pm(const char *name) "%s PCI PM Reset"
> 
> 

Please hold off sending a PULL for this patch for a little longer, I'll
do my best to follow up with test results today.

Thanks!
Laszlo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability
  2016-06-20 22:04 [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability Alex Williamson
                   ` (2 preceding siblings ...)
  2016-06-28 13:10 ` Laszlo Ersek
@ 2016-06-28 20:05 ` Laszlo Ersek
  3 siblings, 0 replies; 7+ messages in thread
From: Laszlo Ersek @ 2016-06-28 20:05 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: chen.fan.fnst, zhoujie2011

On 06/21/16 00:04, Alex Williamson wrote:
> The kernel currently exposes the SR-IOV capability as read-only
> through vfio-pci.  This is sufficient to protect the host kernel, but
> has the potential to confuse guests without further virtualization.
> In particular, OVMF tries to size the VF BARs and comes up with absurd
> results, ending with an assert.  There's not much point in adding
> virtualization to a read-only capability, so we simply hide it for
> now.  If the kernel ever enables SR-IOV virtualization, we should
> easily be able to test it through VF BAR sizing or explicit flags.
> 
> Testing whether we should parse extended capabilities is also pulled
> into the function to keep these assumptions in one place.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> This depends on Chen Fan's patch "vfio: add pcie extended capability
> support", which I'll pull from Zhou Jie's latest series unless there
> are comments to the contrary.  Otherwise based on Stefan's tracing
> pull request so as not to conflict.
> 
>  hw/vfio/pci.c        |   49 +++++++++++++++++++++++++++++++++++++++----------
>  hw/vfio/trace-events |    1 +
>  2 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a171056b..36d5e00 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>      uint8_t cap_ver;
>      uint8_t *config;
>  
> +    /* Only add extended caps if we have them and the guest can see them */
> +    if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
> +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> +        return 0;
> +    }
> +
>      /*
>       * pcie_add_capability always inserts the new capability at the tail
>       * of the chain.  Therefore to end up with a chain that matches the
> @@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>       */
>      config = g_memdup(pdev->config, vdev->config_size);
>  
> +    /*
> +     * Extended capabilities are chained with each pointing to the next, so we
> +     * can drop anything other than the head of the chain simply by modifying
> +     * the previous next pointer.  For the head of the chain, we can modify the
> +     * capability ID to something that cannot match a valid capability.  ID
> +     * 0 is reserved for this since absence of capabilities is indicated by
> +     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
> +     * uses ID 0 as reserved for list management and will incorrectly match and
> +     * assert if we attempt to pre-load the head of the chain with with this
> +     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
> +     * part for identifying abscense of capabilities in a root complex register
> +     * block.  If the ID still exists after adding capabilities, switch back to
> +     * zero.  We'll mark this entire first dword as emulated for this purpose.
> +     */
> +    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
> +                 PCI_EXT_CAP(0xFFFF, 0, 0));
> +    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
> +    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
> +
>      for (next = PCI_CONFIG_SPACE_SIZE; next;
>           next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
>          header = pci_get_long(config + next);
> @@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>           */
>          size = vfio_ext_cap_max_size(config, next);
>  
> -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> -        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> -
>          /* Use emulated next pointer to allow dropping extended caps */
>          pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>                                     PCI_EXT_CAP_NEXT_MASK);
> +
> +        switch (cap_id) {
> +        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuses OVMF */
> +            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
> +            break;
> +        default:
> +            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> +        }
> +
> +    }
> +
> +    /* Cleanup chain head ID if necessary */
> +    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> +        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>      }
>  
>      g_free(config);
> @@ -1821,13 +1857,6 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>          return ret;
>      }
>  
> -    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
> -    if (!pci_is_express(pdev) ||
> -        !pci_bus_is_express(pdev->bus) ||
> -        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
> -        return 0;
> -    }
> -
>      return vfio_add_ext_cap(vdev);
>  }
>  
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 9da0ff9..a768fb5 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %
>  vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
>  vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m"
>  vfio_initfn(const char *name, int group_id) " (%s) group %d"
> +vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x"
>  vfio_pci_reset(const char *name) " (%s)"
>  vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET"
>  vfio_pci_reset_pm(const char *name) "%s PCI PM Reset"
> 
> 

Tested-by: Laszlo Ersek <lersek@redhat.com>

(with
<http://thread.gmane.org/gmane.comp.emulators.qemu/412174/focus=412178>
applied first)

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-06-28 20:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-06-20 22:04 [Qemu-devel] [PATCH] vfio/pci: Hide SR-IOV capability Alex Williamson
2016-06-20 22:23 ` Eric Blake
2016-06-20 22:31   ` Alex Williamson
2016-06-21  0:15 ` Laszlo Ersek
2016-06-21  3:54   ` Alex Williamson
2016-06-28 13:10 ` Laszlo Ersek
2016-06-28 20:05 ` Laszlo Ersek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).