[PATCH v1 1/3] vpci: Hide capability when it fails to initialize

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
@ 2025-03-27  7:32 Jiqian Chen
  2025-03-27  7:32 ` [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails Jiqian Chen
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Jiqian Chen @ 2025-03-27  7:32 UTC (permalink / raw)
  To: xen-devel; +Cc: Roger Pau Monné, Huang Rui, Jiqian Chen

When vpci fails to initialize a capability of a device, it just
returns an error instead of handling the failure. That makes the
entire device unusable.

So, refactor REGISTER_VPCI_INIT to contain more capability specific
information, and try to hide capability when initialization fails
in vpci_assign_device().

Additionally, stop registering init_header() as an init hook: rename it
to vpci_init_header() and call it directly, since it is not a capability
and it is needed for every device's PCI config space.

Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
---
Hi all,

This patch aims to hide a capability when its initialization fails.
As a result, we can no longer rely on vpci_deassign_device() to clean up
the assigned resources, so the following two patches clean up resources
in the failure path of the init functions.

Best regards,
Jiqian Chen.
---
 xen/drivers/vpci/header.c |  3 +-
 xen/drivers/vpci/msi.c    |  2 +-
 xen/drivers/vpci/msix.c   |  2 +-
 xen/drivers/vpci/rebar.c  |  2 +-
 xen/drivers/vpci/vpci.c   | 65 +++++++++++++++++++++++++++++++++------
 xen/include/xen/vpci.h    | 27 ++++++++++++----
 6 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index ef6c965c081c..8c8e4ac5698a 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -745,7 +745,7 @@ static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
     return !bar->mem ? -ENOMEM : 0;
 }
 
-static int cf_check init_header(struct pci_dev *pdev)
+int vpci_init_header(struct pci_dev *pdev)
 {
     uint16_t cmd;
     uint64_t addr, size;
@@ -1007,7 +1007,6 @@ static int cf_check init_header(struct pci_dev *pdev)
     pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
     return rc;
 }
-REGISTER_VPCI_INIT(init_header, VPCI_PRIORITY_MIDDLE);
 
 /*
  * Local variables:
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 66e5a8a116be..9d7a9fd8dba1 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -270,7 +270,7 @@ static int cf_check init_msi(struct pci_dev *pdev)
 
     return 0;
 }
-REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
+REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSI, init_msi, VPCI_PRIORITY_LOW);
 
 void vpci_dump_msi(void)
 {
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
index 6bd8c55bb48e..50e5f38c1e09 100644
--- a/xen/drivers/vpci/msix.c
+++ b/xen/drivers/vpci/msix.c
@@ -753,7 +753,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
 
     return 0;
 }
-REGISTER_VPCI_INIT(init_msix, VPCI_PRIORITY_HIGH);
+REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSIX, init_msix, VPCI_PRIORITY_HIGH);
 
 /*
  * Local variables:
diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
index 793937449af7..7c53ee031887 100644
--- a/xen/drivers/vpci/rebar.c
+++ b/xen/drivers/vpci/rebar.c
@@ -118,7 +118,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
 
     return 0;
 }
-REGISTER_VPCI_INIT(init_rebar, VPCI_PRIORITY_LOW);
+REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
 
 /*
  * Local variables:
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 1e6aa5d799b9..a8362e46e097 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -36,8 +36,8 @@ struct vpci_register {
 };
 
 #ifdef __XEN__
-extern vpci_register_init_t *const __start_vpci_array[];
-extern vpci_register_init_t *const __end_vpci_array[];
+extern vpci_capability_t *const __start_vpci_array[];
+extern vpci_capability_t *const __end_vpci_array[];
 #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
 
 #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
@@ -83,6 +83,47 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
 
 #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
 
+static int vpci_init_cap_with_priority(struct pci_dev *pdev,
+                                       const char *priority)
+{
+    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
+    {
+        const vpci_capability_t *capability = __start_vpci_array[i];
+        const unsigned int cap_id = capability->id;
+        unsigned int pos;
+        int rc;
+
+        if ( *(capability->priority) != *priority )
+            continue;
+
+        if ( !capability->is_ext )
+            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
+        else
+            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
+
+        if ( !pos )
+            continue;
+
+        rc = capability->init(pdev);
+
+        if ( rc )
+        {
+            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
+                   pdev->domain, &pdev->sbdf, rc);
+            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
+                                   pos, capability->is_ext ? 4 : 1, NULL);
+            if ( rc )
+            {
+                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
+                       pdev->domain, &pdev->sbdf, rc);
+                return rc;
+            }
+        }
+    }
+
+    return 0;
+}
+
 void vpci_deassign_device(struct pci_dev *pdev)
 {
     unsigned int i;
@@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
 
 int vpci_assign_device(struct pci_dev *pdev)
 {
-    unsigned int i;
     const unsigned long *ro_map;
     int rc = 0;
 
@@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
         goto out;
 #endif
 
-    for ( i = 0; i < NUM_VPCI_INIT; i++ )
-    {
-        rc = __start_vpci_array[i](pdev);
-        if ( rc )
-            break;
-    }
+    /*
+     * Capabilities with high priority like MSI-X need to
+     * be initialized before header
+     */
+    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
+    if ( rc )
+        goto out;
+
+    rc = vpci_init_header(pdev);
+    if ( rc )
+        goto out;
+
+    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_LOW);
 
  out: __maybe_unused;
     if ( rc )
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 807401b2eaa2..fa13397ae409 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -13,12 +13,16 @@ typedef uint32_t vpci_read_t(const struct pci_dev *pdev, unsigned int reg,
 typedef void vpci_write_t(const struct pci_dev *pdev, unsigned int reg,
                           uint32_t val, void *data);
 
-typedef int vpci_register_init_t(struct pci_dev *dev);
-
 #define VPCI_PRIORITY_HIGH      "1"
-#define VPCI_PRIORITY_MIDDLE    "5"
 #define VPCI_PRIORITY_LOW       "9"
 
+typedef struct {
+    unsigned int id;
+    const char *priority;
+    bool is_ext;
+    int (*init)(struct pci_dev *pdev);
+} vpci_capability_t;
+
 #define VPCI_ECAM_BDF(addr)     (((addr) & 0x0ffff000) >> 12)
 
 /*
@@ -29,9 +33,20 @@ typedef int vpci_register_init_t(struct pci_dev *dev);
  */
 #define VPCI_MAX_VIRT_DEV       (PCI_SLOT(~0) + 1)
 
-#define REGISTER_VPCI_INIT(x, p)                \
-  static vpci_register_init_t *const x##_entry  \
-               __used_section(".data.vpci." p) = (x)
+#define REGISTER_VPCI_CAP(cap, x, p, ext) \
+  static vpci_capability_t x##_t = { \
+        .id = (cap), \
+        .init = (x), \
+        .priority = (p), \
+        .is_ext = (ext), \
+  }; \
+  static vpci_capability_t *const x##_entry  \
+               __used_section(".data.vpci." p) = &(x##_t)
+
+#define REGISTER_VPCI_LEGACY_CAP(cap, x, p) REGISTER_VPCI_CAP(cap, x, p, false)
+#define REGISTER_VPCI_EXTEND_CAP(cap, x, p) REGISTER_VPCI_CAP(cap, x, p, true)
+
+int __must_check vpci_init_header(struct pci_dev *pdev);
 
 /* Assign vPCI to device by adding handlers. */
 int __must_check vpci_assign_device(struct pci_dev *pdev);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails
  2025-03-27  7:32 [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Jiqian Chen
@ 2025-03-27  7:32 ` Jiqian Chen
  2025-03-27 12:38   ` Roger Pau Monné
  2025-03-27  7:32 ` [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails Jiqian Chen
  2025-03-27  9:25 ` [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Roger Pau Monné
  2 siblings, 1 reply; 16+ messages in thread
From: Jiqian Chen @ 2025-03-27  7:32 UTC (permalink / raw)
  To: xen-devel; +Cc: Roger Pau Monné, Huang Rui, Jiqian Chen

When init_rebar() fails, the new code will try to hide the Rebar
capability, so it can't rely on vpci_deassign_device() to remove
all Rebar related registers anymore; those registers must be
cleaned up in the failure path of init_rebar().

To do that, use a vpci_register array to record all Rebar registers
and call vpci_remove_register() to remove registers.

Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
---
 xen/drivers/vpci/rebar.c | 33 ++++++++++++++++++++-------------
 xen/drivers/vpci/vpci.c  | 14 --------------
 xen/include/xen/vpci.h   | 15 +++++++++++++++
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
index 7c53ee031887..5f2f9978feb9 100644
--- a/xen/drivers/vpci/rebar.c
+++ b/xen/drivers/vpci/rebar.c
@@ -51,8 +51,11 @@ static void cf_check rebar_ctrl_write(const struct pci_dev *pdev,
 
 static int cf_check init_rebar(struct pci_dev *pdev)
 {
+    int rc = 0;
     uint32_t ctrl;
     unsigned int nbars;
+    unsigned int reg_index = 0;
+    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];
     unsigned int rebar_offset = pci_find_ext_capability(pdev->sbdf,
                                                         PCI_EXT_CAP_ID_REBAR);
 
@@ -70,17 +73,17 @@ static int cf_check init_rebar(struct pci_dev *pdev)
     nbars = MASK_EXTR(ctrl, PCI_REBAR_CTRL_NBAR_MASK);
     for ( unsigned int i = 0; i < nbars; i++ )
     {
-        int rc;
+        const unsigned int offset = rebar_offset + PCI_REBAR_CTRL(i);
         struct vpci_bar *bar;
         unsigned int index;
 
-        ctrl = pci_conf_read32(pdev->sbdf, rebar_offset + PCI_REBAR_CTRL(i));
+        ctrl = pci_conf_read32(pdev->sbdf, offset);
         index = ctrl & PCI_REBAR_CTRL_BAR_IDX;
         if ( index >= PCI_HEADER_NORMAL_NR_BARS )
         {
             printk(XENLOG_ERR "%pd %pp: too big BAR number %u in REBAR_CTRL\n",
                    pdev->domain, &pdev->sbdf, index);
-            continue;
+            goto fail;
         }
 
         bar = &pdev->vpci->header.bars[index];
@@ -88,24 +91,19 @@ static int cf_check init_rebar(struct pci_dev *pdev)
         {
             printk(XENLOG_ERR "%pd %pp: BAR%u is not in memory space\n",
                    pdev->domain, &pdev->sbdf, index);
-            continue;
+            goto fail;
         }
 
         rc = vpci_add_register(pdev->vpci, vpci_hw_read32, rebar_ctrl_write,
-                               rebar_offset + PCI_REBAR_CTRL(i), 4, bar);
+                               offset, 4, bar);
         if ( rc )
         {
             printk(XENLOG_ERR "%pd %pp: BAR%u fail to add reg of REBAR_CTRL rc=%d\n",
                    pdev->domain, &pdev->sbdf, index, rc);
-            /*
-             * Ideally we would hide the ReBar capability on error, but code
-             * for doing so still needs to be written. Use continue instead
-             * to keep any already setup register hooks, as returning an
-             * error will cause the hardware domain to get unmediated access
-             * to all device registers.
-             */
-            continue;
+            goto fail;
         }
+        registers[reg_index].offset = offset;
+        registers[reg_index++].size = 4;
 
         bar->resizable_sizes =
             MASK_EXTR(pci_conf_read32(pdev->sbdf,
@@ -117,6 +115,15 @@ static int cf_check init_rebar(struct pci_dev *pdev)
     }
 
     return 0;
+
+ fail:
+    for ( unsigned int i = 0; i < reg_index; i++ )
+        if ( vpci_remove_register(pdev->vpci,
+                                  registers[i].offset,
+                                  registers[i].size) )
+            continue;
+
+    return rc;
 }
 REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
 
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index a8362e46e097..ea81d8cc9604 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -21,20 +21,6 @@
 #include <xen/vpci.h>
 #include <xen/vmap.h>
 
-/* Internal struct to store the emulated PCI registers. */
-struct vpci_register {
-    vpci_read_t *read;
-    vpci_write_t *write;
-    unsigned int size;
-    unsigned int offset;
-    void *private;
-    struct list_head node;
-    uint32_t ro_mask;
-    uint32_t rw1c_mask;
-    uint32_t rsvdp_mask;
-    uint32_t rsvdz_mask;
-};
-
 #ifdef __XEN__
 extern vpci_capability_t *const __start_vpci_array[];
 extern vpci_capability_t *const __end_vpci_array[];
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index fa13397ae409..19a036c22165 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -214,6 +214,21 @@ struct vpci_vcpu {
     bool rom_only : 1;
 };
 
+#define VPCI_CAP_MAX_REGISTER 10
+
+struct vpci_register {
+    vpci_read_t *read;
+    vpci_write_t *write;
+    unsigned int size;
+    unsigned int offset;
+    void *private;
+    struct list_head node;
+    uint32_t ro_mask;
+    uint32_t rw1c_mask;
+    uint32_t rsvdp_mask;
+    uint32_t rsvdz_mask;
+};
+
 #ifdef __XEN__
 void vpci_dump_msi(void);
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails
  2025-03-27  7:32 ` [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails Jiqian Chen
@ 2025-03-27 12:38   ` Roger Pau Monné
  0 siblings, 0 replies; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-27 12:38 UTC (permalink / raw)
  To: Jiqian Chen; +Cc: xen-devel, Huang Rui

On Thu, Mar 27, 2025 at 03:32:13PM +0800, Jiqian Chen wrote:
> When init_rebar() fails, the new codes will try to hide Rebar
> capability, so it can't rely on vpci_deassign_device() to remove
> all Rebar related registers anymore, those registers must be
> cleaned up in failure path of init_rebar.
> 
> To do that, use a vpci_register array to record all Rebar registers
> and call vpci_remove_register() to remove registers.
> 
> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> ---
>  xen/drivers/vpci/rebar.c | 33 ++++++++++++++++++++-------------
>  xen/drivers/vpci/vpci.c  | 14 --------------
>  xen/include/xen/vpci.h   | 15 +++++++++++++++
>  3 files changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
> index 7c53ee031887..5f2f9978feb9 100644
> --- a/xen/drivers/vpci/rebar.c
> +++ b/xen/drivers/vpci/rebar.c
> @@ -51,8 +51,11 @@ static void cf_check rebar_ctrl_write(const struct pci_dev *pdev,
>  
>  static int cf_check init_rebar(struct pci_dev *pdev)
>  {
> +    int rc = 0;
>      uint32_t ctrl;
>      unsigned int nbars;
> +    unsigned int reg_index = 0;
> +    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];

I'm not sure I like this approach much, as it seems to be quite
cumbersome.  If we really want to go that route I would recommend
that you use a much lighter structure here; struct vpci_register has a
bunch of fields that are not used at all for the purposes here.  You
just want a struct with an offset and a size field.

>      unsigned int rebar_offset = pci_find_ext_capability(pdev->sbdf,
>                                                          PCI_EXT_CAP_ID_REBAR);
>  
> @@ -70,17 +73,17 @@ static int cf_check init_rebar(struct pci_dev *pdev)
>      nbars = MASK_EXTR(ctrl, PCI_REBAR_CTRL_NBAR_MASK);
>      for ( unsigned int i = 0; i < nbars; i++ )
>      {
> -        int rc;
> +        const unsigned int offset = rebar_offset + PCI_REBAR_CTRL(i);
>          struct vpci_bar *bar;
>          unsigned int index;
>  
> -        ctrl = pci_conf_read32(pdev->sbdf, rebar_offset + PCI_REBAR_CTRL(i));
> +        ctrl = pci_conf_read32(pdev->sbdf, offset);
>          index = ctrl & PCI_REBAR_CTRL_BAR_IDX;
>          if ( index >= PCI_HEADER_NORMAL_NR_BARS )
>          {
>              printk(XENLOG_ERR "%pd %pp: too big BAR number %u in REBAR_CTRL\n",
>                     pdev->domain, &pdev->sbdf, index);
> -            continue;
> +            goto fail;
>          }
>  
>          bar = &pdev->vpci->header.bars[index];
> @@ -88,24 +91,19 @@ static int cf_check init_rebar(struct pci_dev *pdev)
>          {
>              printk(XENLOG_ERR "%pd %pp: BAR%u is not in memory space\n",
>                     pdev->domain, &pdev->sbdf, index);
> -            continue;
> +            goto fail;
>          }
>  
>          rc = vpci_add_register(pdev->vpci, vpci_hw_read32, rebar_ctrl_write,
> -                               rebar_offset + PCI_REBAR_CTRL(i), 4, bar);
> +                               offset, 4, bar);
>          if ( rc )
>          {
>              printk(XENLOG_ERR "%pd %pp: BAR%u fail to add reg of REBAR_CTRL rc=%d\n",
>                     pdev->domain, &pdev->sbdf, index, rc);
> -            /*
> -             * Ideally we would hide the ReBar capability on error, but code
> -             * for doing so still needs to be written. Use continue instead
> -             * to keep any already setup register hooks, as returning an
> -             * error will cause the hardware domain to get unmediated access
> -             * to all device registers.
> -             */
> -            continue;
> +            goto fail;
>          }
> +        registers[reg_index].offset = offset;
> +        registers[reg_index++].size = 4;
>  
>          bar->resizable_sizes =
>              MASK_EXTR(pci_conf_read32(pdev->sbdf,
> @@ -117,6 +115,15 @@ static int cf_check init_rebar(struct pci_dev *pdev)
>      }
>  
>      return 0;
> +
> + fail:
> +    for ( unsigned int i = 0; i < reg_index; i++ )
> +        if ( vpci_remove_register(pdev->vpci,
> +                                  registers[i].offset,
> +                                  registers[i].size) )
> +            continue;

Keep in mind it's fine to remove registers that are not there, iow you
could possibly do:

for ( unsigned int i = 0; i < nbars; i++ )
    if ( vpci_remove_register(pdev->vpci, rebar_offset + PCI_REBAR_CTRL(i),
                              4) )
        continue;

And it would be fine IMO, without the need to store exactly which
registers have been added.  It's not like there's much that can be
done from vpci_remove_register() failing in this context.

In fact you can remove the __must_check from vpci_remove_register(), I
don't think it's helpful at all.

> +
> +    return rc;
>  }
>  REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
>  
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index a8362e46e097..ea81d8cc9604 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -21,20 +21,6 @@
>  #include <xen/vpci.h>
>  #include <xen/vmap.h>
>  
> -/* Internal struct to store the emulated PCI registers. */
> -struct vpci_register {
> -    vpci_read_t *read;
> -    vpci_write_t *write;
> -    unsigned int size;
> -    unsigned int offset;
> -    void *private;
> -    struct list_head node;
> -    uint32_t ro_mask;
> -    uint32_t rw1c_mask;
> -    uint32_t rsvdp_mask;
> -    uint32_t rsvdz_mask;
> -};
> -
>  #ifdef __XEN__
>  extern vpci_capability_t *const __start_vpci_array[];
>  extern vpci_capability_t *const __end_vpci_array[];
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index fa13397ae409..19a036c22165 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -214,6 +214,21 @@ struct vpci_vcpu {
>      bool rom_only : 1;
>  };
>  
> +#define VPCI_CAP_MAX_REGISTER 10

That 10 is kind of arbitrary...

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-27  7:32 [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Jiqian Chen
  2025-03-27  7:32 ` [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails Jiqian Chen
@ 2025-03-27  7:32 ` Jiqian Chen
  2025-03-27 12:44   ` Roger Pau Monné
  2025-03-27  9:25 ` [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Roger Pau Monné
  2 siblings, 1 reply; 16+ messages in thread
From: Jiqian Chen @ 2025-03-27  7:32 UTC (permalink / raw)
  To: xen-devel; +Cc: Roger Pau Monné, Huang Rui, Jiqian Chen

When init_msi() fails, the new code will try to hide the MSI
capability, so it can't rely on vpci_deassign_device() to
remove all MSI related registers anymore; those registers
must be cleaned up in the failure path of init_msi().

To do that, use a vpci_register array to record all MSI
registers and call vpci_remove_register() to remove registers.

Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
---
 xen/drivers/vpci/msi.c | 57 +++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 9d7a9fd8dba1..30ef97efb7b0 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -195,6 +195,9 @@ static void cf_check mask_write(
 
 static int cf_check init_msi(struct pci_dev *pdev)
 {
+    unsigned int offset;
+    unsigned int reg_index = 0;
+    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];
     unsigned int pos = pdev->msi_pos;
     uint16_t control;
     int ret;
@@ -206,15 +209,13 @@ static int cf_check init_msi(struct pci_dev *pdev)
     if ( !pdev->vpci->msi )
         return -ENOMEM;
 
+    offset = msi_control_reg(pos);
     ret = vpci_add_register(pdev->vpci, control_read, control_write,
-                            msi_control_reg(pos), 2, pdev->vpci->msi);
+                            offset, 2, pdev->vpci->msi);
     if ( ret )
-        /*
-         * NB: there's no need to free the msi struct or remove the register
-         * handlers form the config space, the caller will take care of the
-         * cleanup.
-         */
-        return ret;
+        goto fail;
+    registers[reg_index].offset = offset;
+    registers[reg_index++].size = 2;
 
     /* Get the maximum number of vectors the device supports. */
     control = pci_conf_read16(pdev->sbdf, msi_control_reg(pos));
@@ -234,33 +235,42 @@ static int cf_check init_msi(struct pci_dev *pdev)
     pdev->vpci->msi->address64 = is_64bit_address(control);
     pdev->vpci->msi->masking = is_mask_bit_support(control);
 
+    offset = msi_lower_address_reg(pos);
     ret = vpci_add_register(pdev->vpci, address_read, address_write,
-                            msi_lower_address_reg(pos), 4, pdev->vpci->msi);
+                            offset, 4, pdev->vpci->msi);
     if ( ret )
-        return ret;
+        goto fail;
+    registers[reg_index].offset = offset;
+    registers[reg_index++].size = 4;
 
+    offset = msi_data_reg(pos, pdev->vpci->msi->address64);
     ret = vpci_add_register(pdev->vpci, data_read, data_write,
-                            msi_data_reg(pos, pdev->vpci->msi->address64), 2,
-                            pdev->vpci->msi);
+                            offset, 2, pdev->vpci->msi);
     if ( ret )
-        return ret;
+        goto fail;
+    registers[reg_index].offset = offset;
+    registers[reg_index++].size = 2;
 
     if ( pdev->vpci->msi->address64 )
     {
+        offset = msi_upper_address_reg(pos);
         ret = vpci_add_register(pdev->vpci, address_hi_read, address_hi_write,
-                                msi_upper_address_reg(pos), 4, pdev->vpci->msi);
+                                offset, 4, pdev->vpci->msi);
         if ( ret )
-            return ret;
+            goto fail;
+        registers[reg_index].offset = offset;
+        registers[reg_index++].size = 4;
     }
 
     if ( pdev->vpci->msi->masking )
     {
+        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
         ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
-                                msi_mask_bits_reg(pos,
-                                                  pdev->vpci->msi->address64),
-                                4, pdev->vpci->msi);
+                                offset, 4, pdev->vpci->msi);
         if ( ret )
-            return ret;
+            goto fail;
+        registers[reg_index].offset = offset;
+        registers[reg_index++].size = 4;
         /*
          * FIXME: do not add any handler for the pending bits for the hardware
          * domain, which means direct access. This will be revisited when
@@ -269,6 +279,17 @@ static int cf_check init_msi(struct pci_dev *pdev)
     }
 
     return 0;
+
+ fail:
+    for ( unsigned int i = 0; i < reg_index; i++ )
+        if ( vpci_remove_register(pdev->vpci,
+                                  registers[i].offset,
+                                  registers[i].size) )
+            continue;
+    xfree(pdev->vpci->msi);
+    pdev->vpci->msi = NULL;
+
+    return ret;
 }
 REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSI, init_msi, VPCI_PRIORITY_LOW);
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-27  7:32 ` [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails Jiqian Chen
@ 2025-03-27 12:44   ` Roger Pau Monné
  2025-03-31  8:13     ` Chen, Jiqian
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-27 12:44 UTC (permalink / raw)
  To: Jiqian Chen; +Cc: xen-devel, Huang Rui

On Thu, Mar 27, 2025 at 03:32:14PM +0800, Jiqian Chen wrote:
> When init_msi() fails, the new codes will try to hide MSI
> capability, so it can't rely on vpci_deassign_device() to
> remove all MSI related registers anymore, those registers
> must be cleaned up in failure path of init_msi.
> 
> To do that, use a vpci_register array to record all MSI
> registers and call vpci_remove_register() to remove registers.

As I'm just seeing 3 patches on the series, isn't there one missing
for MSI-X at least?

> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> ---
>  xen/drivers/vpci/msi.c | 57 +++++++++++++++++++++++++++++-------------
>  1 file changed, 39 insertions(+), 18 deletions(-)
> 
> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> index 9d7a9fd8dba1..30ef97efb7b0 100644
> --- a/xen/drivers/vpci/msi.c
> +++ b/xen/drivers/vpci/msi.c
> @@ -195,6 +195,9 @@ static void cf_check mask_write(
>  
>  static int cf_check init_msi(struct pci_dev *pdev)
>  {
> +    unsigned int offset;
> +    unsigned int reg_index = 0;
> +    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];
>      unsigned int pos = pdev->msi_pos;
>      uint16_t control;
>      int ret;
> @@ -206,15 +209,13 @@ static int cf_check init_msi(struct pci_dev *pdev)
>      if ( !pdev->vpci->msi )
>          return -ENOMEM;
>  
> +    offset = msi_control_reg(pos);
>      ret = vpci_add_register(pdev->vpci, control_read, control_write,
> -                            msi_control_reg(pos), 2, pdev->vpci->msi);
> +                            offset, 2, pdev->vpci->msi);
>      if ( ret )
> -        /*
> -         * NB: there's no need to free the msi struct or remove the register
> -         * handlers form the config space, the caller will take care of the
> -         * cleanup.
> -         */
> -        return ret;
> +        goto fail;
> +    registers[reg_index].offset = offset;
> +    registers[reg_index++].size = 2;
>  
>      /* Get the maximum number of vectors the device supports. */
>      control = pci_conf_read16(pdev->sbdf, msi_control_reg(pos));
> @@ -234,33 +235,42 @@ static int cf_check init_msi(struct pci_dev *pdev)
>      pdev->vpci->msi->address64 = is_64bit_address(control);
>      pdev->vpci->msi->masking = is_mask_bit_support(control);
>  
> +    offset = msi_lower_address_reg(pos);
>      ret = vpci_add_register(pdev->vpci, address_read, address_write,
> -                            msi_lower_address_reg(pos), 4, pdev->vpci->msi);
> +                            offset, 4, pdev->vpci->msi);
>      if ( ret )
> -        return ret;
> +        goto fail;
> +    registers[reg_index].offset = offset;
> +    registers[reg_index++].size = 4;
>  
> +    offset = msi_data_reg(pos, pdev->vpci->msi->address64);
>      ret = vpci_add_register(pdev->vpci, data_read, data_write,
> -                            msi_data_reg(pos, pdev->vpci->msi->address64), 2,
> -                            pdev->vpci->msi);
> +                            offset, 2, pdev->vpci->msi);
>      if ( ret )
> -        return ret;
> +        goto fail;
> +    registers[reg_index].offset = offset;
> +    registers[reg_index++].size = 2;
>  
>      if ( pdev->vpci->msi->address64 )
>      {
> +        offset = msi_upper_address_reg(pos);
>          ret = vpci_add_register(pdev->vpci, address_hi_read, address_hi_write,
> -                                msi_upper_address_reg(pos), 4, pdev->vpci->msi);
> +                                offset, 4, pdev->vpci->msi);
>          if ( ret )
> -            return ret;
> +            goto fail;
> +        registers[reg_index].offset = offset;
> +        registers[reg_index++].size = 4;
>      }
>  
>      if ( pdev->vpci->msi->masking )
>      {
> +        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
>          ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
> -                                msi_mask_bits_reg(pos,
> -                                                  pdev->vpci->msi->address64),
> -                                4, pdev->vpci->msi);
> +                                offset, 4, pdev->vpci->msi);
>          if ( ret )
> -            return ret;
> +            goto fail;
> +        registers[reg_index].offset = offset;
> +        registers[reg_index++].size = 4;

As commented on the previous patch, I don't much like the usage of
this registers array to store which handlers have been added.  It
would be best if you just removed every possible handler that could be
added, without keeping track of them.

Thinking about it, do we maybe need a helper vpci function that wipes
all handlers from a specific range?  I think it could be helpful here,
as the size of the capabilities is well-known, so on error it would be
easier to just call such a generic handler to wipe all hooks added to
the region covered by the failing capability.

Let me know what you think of it.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-27 12:44   ` Roger Pau Monné
@ 2025-03-31  8:13     ` Chen, Jiqian
  2025-03-31  8:53       ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Chen, Jiqian @ 2025-03-31  8:13 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Chen, Jiqian

On 2025/3/27 20:44, Roger Pau Monné wrote:
> On Thu, Mar 27, 2025 at 03:32:14PM +0800, Jiqian Chen wrote:
>> When init_msi() fails, the new codes will try to hide MSI
>> capability, so it can't rely on vpci_deassign_device() to
>> remove all MSI related registers anymore, those registers
>> must be cleaned up in failure path of init_msi.
>>
>> To do that, use a vpci_register array to record all MSI
>> registers and call vpci_remove_register() to remove registers.
> 
> As I'm just seeing 3 patches on the series, isn't there one missing
> for MSI-X at least?
No, because init_msix() only calls vpci_add_register() once, so there is no need to remove registers when it fails.

> 
>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
>> ---
>>  xen/drivers/vpci/msi.c | 57 +++++++++++++++++++++++++++++-------------
>>  1 file changed, 39 insertions(+), 18 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
>> index 9d7a9fd8dba1..30ef97efb7b0 100644
>> --- a/xen/drivers/vpci/msi.c
>> +++ b/xen/drivers/vpci/msi.c
>> @@ -195,6 +195,9 @@ static void cf_check mask_write(
>>  
>>  static int cf_check init_msi(struct pci_dev *pdev)
>>  {
>> +    unsigned int offset;
>> +    unsigned int reg_index = 0;
>> +    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];
>>      unsigned int pos = pdev->msi_pos;
>>      uint16_t control;
>>      int ret;
>> @@ -206,15 +209,13 @@ static int cf_check init_msi(struct pci_dev *pdev)
>>      if ( !pdev->vpci->msi )
>>          return -ENOMEM;
>>  
>> +    offset = msi_control_reg(pos);
>>      ret = vpci_add_register(pdev->vpci, control_read, control_write,
>> -                            msi_control_reg(pos), 2, pdev->vpci->msi);
>> +                            offset, 2, pdev->vpci->msi);
>>      if ( ret )
>> -        /*
>> -         * NB: there's no need to free the msi struct or remove the register
>> -         * handlers form the config space, the caller will take care of the
>> -         * cleanup.
>> -         */
>> -        return ret;
>> +        goto fail;
>> +    registers[reg_index].offset = offset;
>> +    registers[reg_index++].size = 2;
>>  
>>      /* Get the maximum number of vectors the device supports. */
>>      control = pci_conf_read16(pdev->sbdf, msi_control_reg(pos));
>> @@ -234,33 +235,42 @@ static int cf_check init_msi(struct pci_dev *pdev)
>>      pdev->vpci->msi->address64 = is_64bit_address(control);
>>      pdev->vpci->msi->masking = is_mask_bit_support(control);
>>  
>> +    offset = msi_lower_address_reg(pos);
>>      ret = vpci_add_register(pdev->vpci, address_read, address_write,
>> -                            msi_lower_address_reg(pos), 4, pdev->vpci->msi);
>> +                            offset, 4, pdev->vpci->msi);
>>      if ( ret )
>> -        return ret;
>> +        goto fail;
>> +    registers[reg_index].offset = offset;
>> +    registers[reg_index++].size = 4;
>>  
>> +    offset = msi_data_reg(pos, pdev->vpci->msi->address64);
>>      ret = vpci_add_register(pdev->vpci, data_read, data_write,
>> -                            msi_data_reg(pos, pdev->vpci->msi->address64), 2,
>> -                            pdev->vpci->msi);
>> +                            offset, 2, pdev->vpci->msi);
>>      if ( ret )
>> -        return ret;
>> +        goto fail;
>> +    registers[reg_index].offset = offset;
>> +    registers[reg_index++].size = 2;
>>  
>>      if ( pdev->vpci->msi->address64 )
>>      {
>> +        offset = msi_upper_address_reg(pos);
>>          ret = vpci_add_register(pdev->vpci, address_hi_read, address_hi_write,
>> -                                msi_upper_address_reg(pos), 4, pdev->vpci->msi);
>> +                                offset, 4, pdev->vpci->msi);
>>          if ( ret )
>> -            return ret;
>> +            goto fail;
>> +        registers[reg_index].offset = offset;
>> +        registers[reg_index++].size = 4;
>>      }
>>  
>>      if ( pdev->vpci->msi->masking )
>>      {
>> +        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
>>          ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
>> -                                msi_mask_bits_reg(pos,
>> -                                                  pdev->vpci->msi->address64),
>> -                                4, pdev->vpci->msi);
>> +                                offset, 4, pdev->vpci->msi);
>>          if ( ret )
>> -            return ret;
>> +            goto fail;
>> +        registers[reg_index].offset = offset;
>> +        registers[reg_index++].size = 4;
> 
> As commented on the previous patch, I don't like much the usage of
> this registers array to store which handlers have been added.  It
> would be best if you just removed every possible handler that could be
> added, without keeping track of them.
Makes sense, it will indeed be simpler if it is fine to remove all possible registers.

> 
> Thinking about it, do we maybe need a helper vcpi function that wipes
> all handlers from a specific range?  I think it could be helpful here,
> as the size of the capabilities is well-known, so on error it would be
> easier to just call such a generic handler to wipe all hooks added to
> the region covered by the failing capability.
But I am not sure whether that helper function is suitable for all capabilities.
Take Rebar, for example: its structure can range from 12 bytes long (for a single BAR) to 52 bytes long (for all six BARs).
If a device supports Rebar and only has a single resizable BAR, does the hardware still reserve the range from byte 13 to byte 52 for that device?
I mean, if I remove the registers in the range 13 to 52, is it possible that I delete registers belonging to other capabilities?

> 
> Let me know what you think of it.
> 
> Thanks, Roger.

-- 
Best regards,
Jiqian Chen.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-31  8:13     ` Chen, Jiqian
@ 2025-03-31  8:53       ` Roger Pau Monné
  2025-03-31  9:43         ` Chen, Jiqian
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-31  8:53 UTC (permalink / raw)
  To: Chen, Jiqian; +Cc: xen-devel@lists.xenproject.org, Huang, Ray

On Mon, Mar 31, 2025 at 08:13:50AM +0000, Chen, Jiqian wrote:
> On 2025/3/27 20:44, Roger Pau Monné wrote:
> > On Thu, Mar 27, 2025 at 03:32:14PM +0800, Jiqian Chen wrote:
> >> When init_msi() fails, the new codes will try to hide MSI
> >> capability, so it can't rely on vpci_deassign_device() to
> >> remove all MSI related registers anymore, those registers
> >> must be cleaned up in failure path of init_msi.
> >>
> >> To do that, use a vpci_register array to record all MSI
> >> registers and call vpci_remove_register() to remove registers.
> > 
> > As I'm just seeing 3 patches on the series, isn't there one missing
> > for MSI-X at least?
> No, because init_msix only call vpci_add_register once, there is no need to remove registers when it fails.

Seems a bit fragile, what about if there's further code added to
init_msix() that could return an error after the vpci_add_register()
call?  It would be safer to have a cleanup function that removes the
config space handlers, plus the MMIO one (see the call to
register_mmio_handler()), and the addition to the
d->arch.hvm.msix_tables list.

> > 
> >> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> >> ---
> >>  xen/drivers/vpci/msi.c | 57 +++++++++++++++++++++++++++++-------------
> >>  1 file changed, 39 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> >> index 9d7a9fd8dba1..30ef97efb7b0 100644
> >> --- a/xen/drivers/vpci/msi.c
> >> +++ b/xen/drivers/vpci/msi.c
> >> @@ -195,6 +195,9 @@ static void cf_check mask_write(
> >>  
> >>  static int cf_check init_msi(struct pci_dev *pdev)
> >>  {
> >> +    unsigned int offset;
> >> +    unsigned int reg_index = 0;
> >> +    struct vpci_register registers[VPCI_CAP_MAX_REGISTER];
> >>      unsigned int pos = pdev->msi_pos;
> >>      uint16_t control;
> >>      int ret;
> >> @@ -206,15 +209,13 @@ static int cf_check init_msi(struct pci_dev *pdev)
> >>      if ( !pdev->vpci->msi )
> >>          return -ENOMEM;
> >>  
> >> +    offset = msi_control_reg(pos);
> >>      ret = vpci_add_register(pdev->vpci, control_read, control_write,
> >> -                            msi_control_reg(pos), 2, pdev->vpci->msi);
> >> +                            offset, 2, pdev->vpci->msi);
> >>      if ( ret )
> >> -        /*
> >> -         * NB: there's no need to free the msi struct or remove the register
> >> -         * handlers form the config space, the caller will take care of the
> >> -         * cleanup.
> >> -         */
> >> -        return ret;
> >> +        goto fail;
> >> +    registers[reg_index].offset = offset;
> >> +    registers[reg_index++].size = 2;
> >>  
> >>      /* Get the maximum number of vectors the device supports. */
> >>      control = pci_conf_read16(pdev->sbdf, msi_control_reg(pos));
> >> @@ -234,33 +235,42 @@ static int cf_check init_msi(struct pci_dev *pdev)
> >>      pdev->vpci->msi->address64 = is_64bit_address(control);
> >>      pdev->vpci->msi->masking = is_mask_bit_support(control);
> >>  
> >> +    offset = msi_lower_address_reg(pos);
> >>      ret = vpci_add_register(pdev->vpci, address_read, address_write,
> >> -                            msi_lower_address_reg(pos), 4, pdev->vpci->msi);
> >> +                            offset, 4, pdev->vpci->msi);
> >>      if ( ret )
> >> -        return ret;
> >> +        goto fail;
> >> +    registers[reg_index].offset = offset;
> >> +    registers[reg_index++].size = 4;
> >>  
> >> +    offset = msi_data_reg(pos, pdev->vpci->msi->address64);
> >>      ret = vpci_add_register(pdev->vpci, data_read, data_write,
> >> -                            msi_data_reg(pos, pdev->vpci->msi->address64), 2,
> >> -                            pdev->vpci->msi);
> >> +                            offset, 2, pdev->vpci->msi);
> >>      if ( ret )
> >> -        return ret;
> >> +        goto fail;
> >> +    registers[reg_index].offset = offset;
> >> +    registers[reg_index++].size = 2;
> >>  
> >>      if ( pdev->vpci->msi->address64 )
> >>      {
> >> +        offset = msi_upper_address_reg(pos);
> >>          ret = vpci_add_register(pdev->vpci, address_hi_read, address_hi_write,
> >> -                                msi_upper_address_reg(pos), 4, pdev->vpci->msi);
> >> +                                offset, 4, pdev->vpci->msi);
> >>          if ( ret )
> >> -            return ret;
> >> +            goto fail;
> >> +        registers[reg_index].offset = offset;
> >> +        registers[reg_index++].size = 4;
> >>      }
> >>  
> >>      if ( pdev->vpci->msi->masking )
> >>      {
> >> +        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
> >>          ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
> >> -                                msi_mask_bits_reg(pos,
> >> -                                                  pdev->vpci->msi->address64),
> >> -                                4, pdev->vpci->msi);
> >> +                                offset, 4, pdev->vpci->msi);
> >>          if ( ret )
> >> -            return ret;
> >> +            goto fail;
> >> +        registers[reg_index].offset = offset;
> >> +        registers[reg_index++].size = 4;
> > 
> > As commented on the previous patch, I don't like much the usage of
> > this registers array to store which handlers have been added.  It
> > would be best if you just removed every possible handler that could be
> > added, without keeping track of them.
> Make sense, it will indeed be simpler if it is fine to remove all possible registers.
> 
> > 
> > Thinking about it, do we maybe need a helper vcpi function that wipes
> > all handlers from a specific range?  I think it could be helpful here,
> > as the size of the capabilities is well-known, so on error it would be
> > easier to just call such a generic handler to wipe all hooks added to
> > the region covered by the failing capability.
> But I am not sure if that helper function is suitable for all capabilities.
> Like Rebar, its structure can range from 12 bytes long(for a single BAR) to 52 bytes long(for all six BARs).
> If a device supports Rebar and only has a single resizable BAR, does hardware still reserved the range from 13 bytes to 52 bytes for that device?

No, we would need to fetch the size of the capability in the cleanup
function, or figure it out otherwise.  Note the same applies to the MSI
capability, which has a variable size depending on whether 64bit
addresses and masking are supported.

> I mean if I remove the registers(with range 13 to 52), is it possible that I deleted registers belonging to other abilities?

It is, indeed.  You need to know or calculate the size of the
capability to be removed, but that's likely easier and more robust
than keeping an array with the list of added registers?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-31  8:53       ` Roger Pau Monné
@ 2025-03-31  9:43         ` Chen, Jiqian
  2025-03-31 11:12           ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Chen, Jiqian @ 2025-03-31  9:43 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Chen, Jiqian

On 2025/3/31 16:53, Roger Pau Monné wrote:
> On Mon, Mar 31, 2025 at 08:13:50AM +0000, Chen, Jiqian wrote:
>> On 2025/3/27 20:44, Roger Pau Monné wrote:
>>> On Thu, Mar 27, 2025 at 03:32:14PM +0800, Jiqian Chen wrote:
>>>> When init_msi() fails, the new codes will try to hide MSI
>>>> capability, so it can't rely on vpci_deassign_device() to
>>>> remove all MSI related registers anymore, those registers
>>>> must be cleaned up in failure path of init_msi.
>>>>
>>>> To do that, use a vpci_register array to record all MSI
>>>> registers and call vpci_remove_register() to remove registers.
>>>
>>> As I'm just seeing 3 patches on the series, isn't there one missing
>>> for MSI-X at least?
>> No, because init_msix only call vpci_add_register once, there is no need to remove registers when it fails.
> 
> Seems a bit fragile, what about if there's further code added to
> init_msix() that could return an error after the vpci_add_register()
> call?  It would be safer to have a cleanup function that removes the
> config space handlers, plus the MMIO one (see the call to
> register_mmio_handler()), and the addition to the
> d->arch.hvm.msix_tables list.
I am only talking about the current implementation of init_msix(), which does not need a cleanup function.
But if you want such a function, even if it is not needed now, I will add it in the next version.

> 
>>>
>>>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
>>>>  
>>>>      if ( pdev->vpci->msi->masking )
>>>>      {
>>>> +        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
>>>>          ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
>>>> -                                msi_mask_bits_reg(pos,
>>>> -                                                  pdev->vpci->msi->address64),
>>>> -                                4, pdev->vpci->msi);
>>>> +                                offset, 4, pdev->vpci->msi);
>>>>          if ( ret )
>>>> -            return ret;
>>>> +            goto fail;
>>>> +        registers[reg_index].offset = offset;
>>>> +        registers[reg_index++].size = 4;
>>>
>>> As commented on the previous patch, I don't like much the usage of
>>> this registers array to store which handlers have been added.  It
>>> would be best if you just removed every possible handler that could be
>>> added, without keeping track of them.
>> Make sense, it will indeed be simpler if it is fine to remove all possible registers.
>>
>>>
>>> Thinking about it, do we maybe need a helper vcpi function that wipes
>>> all handlers from a specific range?  I think it could be helpful here,
>>> as the size of the capabilities is well-known, so on error it would be
>>> easier to just call such a generic handler to wipe all hooks added to
>>> the region covered by the failing capability.
>> But I am not sure if that helper function is suitable for all capabilities.
>> Like Rebar, its structure can range from 12 bytes long(for a single BAR) to 52 bytes long(for all six BARs).
>> If a device supports Rebar and only has a single resizable BAR, does hardware still reserved the range from 13 bytes to 52 bytes for that device?
> 
> No, we would need to fetch the size of the capability in the cleanup
> function, or figure it otherwise.  Note the same applies to MSI
> capability, which has a variable size depending on whether 64bit
> addresses and masking is supported.
Got it, I originally thought you wanted a general helper function.
But it seems that each capability would have its own separate cleanup function instead.

> 
>> I mean if I remove the registers(with range 13 to 52), is it possible that I deleted registers belonging to other abilities?
> 
> It is, indeed.  You need to know or calculate the size of the
> capability to be removed, but that's likely easier and more robust
> that keeping an array with the list of added registers?
Right.

> 
> Thanks, Roger.

-- 
Best regards,
Jiqian Chen.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails
  2025-03-31  9:43         ` Chen, Jiqian
@ 2025-03-31 11:12           ` Roger Pau Monné
  0 siblings, 0 replies; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-31 11:12 UTC (permalink / raw)
  To: Chen, Jiqian; +Cc: xen-devel@lists.xenproject.org, Huang, Ray

On Mon, Mar 31, 2025 at 09:43:11AM +0000, Chen, Jiqian wrote:
> On 2025/3/31 16:53, Roger Pau Monné wrote:
> > On Mon, Mar 31, 2025 at 08:13:50AM +0000, Chen, Jiqian wrote:
> >> On 2025/3/27 20:44, Roger Pau Monné wrote:
> >>> On Thu, Mar 27, 2025 at 03:32:14PM +0800, Jiqian Chen wrote:
> >>>> When init_msi() fails, the new codes will try to hide MSI
> >>>> capability, so it can't rely on vpci_deassign_device() to
> >>>> remove all MSI related registers anymore, those registers
> >>>> must be cleaned up in failure path of init_msi.
> >>>>
> >>>> To do that, use a vpci_register array to record all MSI
> >>>> registers and call vpci_remove_register() to remove registers.
> >>>
> >>> As I'm just seeing 3 patches on the series, isn't there one missing
> >>> for MSI-X at least?
> >> No, because init_msix only call vpci_add_register once, there is no need to remove registers when it fails.
> > 
> > Seems a bit fragile, what about if there's further code added to
> > init_msix() that could return an error after the vpci_add_register()
> > call?  It would be safer to have a cleanup function that removes the
> > config space handlers, plus the MMIO one (see the call to
> > register_mmio_handler()), and the addition to the
> > d->arch.hvm.msix_tables list.
> I am only talking about the current implementation of init_msix(), which does not need a cleanup function.
> But if you want such a function, even if it is not needed now, I will add it in the next version.

I think it would be cleaner, so that we could remove the MSI-X
specific logic from vpci_deassign_device().

> > 
> >>>
> >>>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> >>>>  
> >>>>      if ( pdev->vpci->msi->masking )
> >>>>      {
> >>>> +        offset = msi_mask_bits_reg(pos, pdev->vpci->msi->address64);
> >>>>          ret = vpci_add_register(pdev->vpci, mask_read, mask_write,
> >>>> -                                msi_mask_bits_reg(pos,
> >>>> -                                                  pdev->vpci->msi->address64),
> >>>> -                                4, pdev->vpci->msi);
> >>>> +                                offset, 4, pdev->vpci->msi);
> >>>>          if ( ret )
> >>>> -            return ret;
> >>>> +            goto fail;
> >>>> +        registers[reg_index].offset = offset;
> >>>> +        registers[reg_index++].size = 4;
> >>>
> >>> As commented on the previous patch, I don't like much the usage of
> >>> this registers array to store which handlers have been added.  It
> >>> would be best if you just removed every possible handler that could be
> >>> added, without keeping track of them.
> >> Make sense, it will indeed be simpler if it is fine to remove all possible registers.
> >>
> >>>
> >>> Thinking about it, do we maybe need a helper vcpi function that wipes
> >>> all handlers from a specific range?  I think it could be helpful here,
> >>> as the size of the capabilities is well-known, so on error it would be
> >>> easier to just call such a generic handler to wipe all hooks added to
> >>> the region covered by the failing capability.
> >> But I am not sure if that helper function is suitable for all capabilities.
> >> Like Rebar, its structure can range from 12 bytes long(for a single BAR) to 52 bytes long(for all six BARs).
> >> If a device supports Rebar and only has a single resizable BAR, does hardware still reserved the range from 13 bytes to 52 bytes for that device?
> > 
> > No, we would need to fetch the size of the capability in the cleanup
> > function, or figure it otherwise.  Note the same applies to MSI
> > capability, which has a variable size depending on whether 64bit
> > addresses and masking is supported.
> Got it, I originally thought you wanted a general helper function.
> But it seems the case is each capability would have its own separate cleanup function instead.

Sorry, maybe that wasn't clear.  The generic function would be a
helper to zap all handlers from a given PCI config space range, ie:

vpci_remove_registers(struct vpci *vpci, unsigned int offset,
                      unsigned int size);

Maybe it's even worth just converting vpci_remove_register() into
vpci_remove_registers() and allowing it to zap multiple registers at
once?  As vpci_remove_register() is just used for the test harness.

That function would be used by each capability cleanup routine to
clean its PCI config space.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-27  7:32 [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Jiqian Chen
  2025-03-27  7:32 ` [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails Jiqian Chen
  2025-03-27  7:32 ` [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails Jiqian Chen
@ 2025-03-27  9:25 ` Roger Pau Monné
  2025-03-31  7:26   ` Chen, Jiqian
  2 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-27  9:25 UTC (permalink / raw)
  To: Jiqian Chen; +Cc: xen-devel, Huang Rui

On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote:
> When vpci fails to initialize a capability of a device, it just
> return error instead of catching and processing exception. That
> makes the entire device unusable.
> 
> So, refactor REGISTER_VPCI_INIT to contain more capability specific
> information, and try to hide capability when initialization fails
> in vpci_assign_device().
> 
> What's more, change the definition of init_header() since it is
> not a capability and it is needed for all devices' PCI config space.
> 
> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> ---
> Hi all,
> 
> This patch aims to hide a capability when its initialization fails.
> That causes we can't rely on vpci_deassign_device() to clean up assigned
> resources, so, following two patches clean up resources in the failure
> path of init function.
> 
> Best regards,
> Jiqian Chen.
> ---
>  xen/drivers/vpci/header.c |  3 +-
>  xen/drivers/vpci/msi.c    |  2 +-
>  xen/drivers/vpci/msix.c   |  2 +-
>  xen/drivers/vpci/rebar.c  |  2 +-
>  xen/drivers/vpci/vpci.c   | 65 +++++++++++++++++++++++++++++++++------
>  xen/include/xen/vpci.h    | 27 ++++++++++++----
>  6 files changed, 81 insertions(+), 20 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index ef6c965c081c..8c8e4ac5698a 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -745,7 +745,7 @@ static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
>      return !bar->mem ? -ENOMEM : 0;
>  }
>  
> -static int cf_check init_header(struct pci_dev *pdev)
> +int vpci_init_header(struct pci_dev *pdev)
>  {
>      uint16_t cmd;
>      uint64_t addr, size;
> @@ -1007,7 +1007,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>      pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
>      return rc;
>  }
> -REGISTER_VPCI_INIT(init_header, VPCI_PRIORITY_MIDDLE);
>  
>  /*
>   * Local variables:
> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> index 66e5a8a116be..9d7a9fd8dba1 100644
> --- a/xen/drivers/vpci/msi.c
> +++ b/xen/drivers/vpci/msi.c
> @@ -270,7 +270,7 @@ static int cf_check init_msi(struct pci_dev *pdev)
>  
>      return 0;
>  }
> -REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSI, init_msi, VPCI_PRIORITY_LOW);
>  
>  void vpci_dump_msi(void)
>  {
> diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
> index 6bd8c55bb48e..50e5f38c1e09 100644
> --- a/xen/drivers/vpci/msix.c
> +++ b/xen/drivers/vpci/msix.c
> @@ -753,7 +753,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
>  
>      return 0;
>  }
> -REGISTER_VPCI_INIT(init_msix, VPCI_PRIORITY_HIGH);
> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSIX, init_msix, VPCI_PRIORITY_HIGH);
>  
>  /*
>   * Local variables:
> diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
> index 793937449af7..7c53ee031887 100644
> --- a/xen/drivers/vpci/rebar.c
> +++ b/xen/drivers/vpci/rebar.c
> @@ -118,7 +118,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
>  
>      return 0;
>  }
> -REGISTER_VPCI_INIT(init_rebar, VPCI_PRIORITY_LOW);
> +REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
>  
>  /*
>   * Local variables:
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 1e6aa5d799b9..a8362e46e097 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -36,8 +36,8 @@ struct vpci_register {
>  };
>  
>  #ifdef __XEN__
> -extern vpci_register_init_t *const __start_vpci_array[];
> -extern vpci_register_init_t *const __end_vpci_array[];
> +extern vpci_capability_t *const __start_vpci_array[];
> +extern vpci_capability_t *const __end_vpci_array[];
>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>  
>  #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
> @@ -83,6 +83,47 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>  
>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>  
> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
> +                                       const char *priority)
> +{
> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
> +    {
> +        const vpci_capability_t *capability = __start_vpci_array[i];
> +        const unsigned int cap_id = capability->id;
> +        unsigned int pos;
> +        int rc;
> +
> +        if ( *(capability->priority) != *priority )
> +            continue;
> +
> +        if ( !capability->is_ext )
> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
> +        else
> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
> +
> +        if ( !pos )
> +            continue;
> +
> +        rc = capability->init(pdev);
> +
> +        if ( rc )
> +        {
> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
> +                   pdev->domain, &pdev->sbdf, rc);
> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
> +                                   pos, capability->is_ext ? 4 : 1, NULL);

Are you sure this works as intended?  The capability ID 0 is marked as
"reserved" in the spec, so it's unclear to me how OSes would handle
finding such a capability on the list - I wouldn't be surprised if some
implementations decide to terminate the walk.  It's fine to mask the
capability ID for the ones that we don't want to expose, but there's
further work to do IMO.

The usual way to deal with masking capabilities is to short circuit
the capability from the linked list, by making the previous capability
"Next Capability Offset" point to the next capability in the list,
thus skipping the current one. So:

capability[n - 1].next_cap = capability[n].next_cap

IOW: you will need to add the handler to the previous capability on
the list.  That's how it's already done in init_header().

> +            if ( rc )
> +            {
> +                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
> +                       pdev->domain, &pdev->sbdf, rc);
> +                return rc;
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
>  void vpci_deassign_device(struct pci_dev *pdev)
>  {
>      unsigned int i;
> @@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>  
>  int vpci_assign_device(struct pci_dev *pdev)
>  {
> -    unsigned int i;
>      const unsigned long *ro_map;
>      int rc = 0;
>  
> @@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
>          goto out;
>  #endif
>  
> -    for ( i = 0; i < NUM_VPCI_INIT; i++ )
> -    {
> -        rc = __start_vpci_array[i](pdev);
> -        if ( rc )
> -            break;
> -    }
> +    /*
> +     * Capabilities with high priority like MSI-X need to
> +     * be initialized before header
> +     */
> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
> +    if ( rc )
> +        goto out;

I understand this is not introduced by this change, but I wonder if
there could be a way to ditch the priority stuff for capabilities,
especially now that we only have two "priorities": before or after PCI
header initialization.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-27  9:25 ` [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Roger Pau Monné
@ 2025-03-31  7:26   ` Chen, Jiqian
  2025-03-31  8:43     ` Roger Pau Monné
  0 siblings, 1 reply; 16+ messages in thread
From: Chen, Jiqian @ 2025-03-31  7:26 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Chen, Jiqian

On 2025/3/27 17:25, Roger Pau Monné wrote:
> On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote:
>> When vpci fails to initialize a capability of a device, it just
>> return error instead of catching and processing exception. That
>> makes the entire device unusable.
>>
>> So, refactor REGISTER_VPCI_INIT to contain more capability specific
>> information, and try to hide capability when initialization fails
>> in vpci_assign_device().
>>
>> What's more, change the definition of init_header() since it is
>> not a capability and it is needed for all devices' PCI config space.
>>
>> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
>> ---
>> Hi all,
>>
>> This patch aims to hide a capability when its initialization fails.
>> That causes we can't rely on vpci_deassign_device() to clean up assigned
>> resources, so, following two patches clean up resources in the failure
>> path of init function.
>>
>> Best regards,
>> Jiqian Chen.
>> ---
>>  xen/drivers/vpci/header.c |  3 +-
>>  xen/drivers/vpci/msi.c    |  2 +-
>>  xen/drivers/vpci/msix.c   |  2 +-
>>  xen/drivers/vpci/rebar.c  |  2 +-
>>  xen/drivers/vpci/vpci.c   | 65 +++++++++++++++++++++++++++++++++------
>>  xen/include/xen/vpci.h    | 27 ++++++++++++----
>>  6 files changed, 81 insertions(+), 20 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index ef6c965c081c..8c8e4ac5698a 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -745,7 +745,7 @@ static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
>>      return !bar->mem ? -ENOMEM : 0;
>>  }
>>  
>> -static int cf_check init_header(struct pci_dev *pdev)
>> +int vpci_init_header(struct pci_dev *pdev)
>>  {
>>      uint16_t cmd;
>>      uint64_t addr, size;
>> @@ -1007,7 +1007,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>>      pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
>>      return rc;
>>  }
>> -REGISTER_VPCI_INIT(init_header, VPCI_PRIORITY_MIDDLE);
>>  
>>  /*
>>   * Local variables:
>> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
>> index 66e5a8a116be..9d7a9fd8dba1 100644
>> --- a/xen/drivers/vpci/msi.c
>> +++ b/xen/drivers/vpci/msi.c
>> @@ -270,7 +270,7 @@ static int cf_check init_msi(struct pci_dev *pdev)
>>  
>>      return 0;
>>  }
>> -REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
>> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSI, init_msi, VPCI_PRIORITY_LOW);
>>  
>>  void vpci_dump_msi(void)
>>  {
>> diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
>> index 6bd8c55bb48e..50e5f38c1e09 100644
>> --- a/xen/drivers/vpci/msix.c
>> +++ b/xen/drivers/vpci/msix.c
>> @@ -753,7 +753,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
>>  
>>      return 0;
>>  }
>> -REGISTER_VPCI_INIT(init_msix, VPCI_PRIORITY_HIGH);
>> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSIX, init_msix, VPCI_PRIORITY_HIGH);
>>  
>>  /*
>>   * Local variables:
>> diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
>> index 793937449af7..7c53ee031887 100644
>> --- a/xen/drivers/vpci/rebar.c
>> +++ b/xen/drivers/vpci/rebar.c
>> @@ -118,7 +118,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
>>  
>>      return 0;
>>  }
>> -REGISTER_VPCI_INIT(init_rebar, VPCI_PRIORITY_LOW);
>> +REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
>>  
>>  /*
>>   * Local variables:
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index 1e6aa5d799b9..a8362e46e097 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -36,8 +36,8 @@ struct vpci_register {
>>  };
>>  
>>  #ifdef __XEN__
>> -extern vpci_register_init_t *const __start_vpci_array[];
>> -extern vpci_register_init_t *const __end_vpci_array[];
>> +extern vpci_capability_t *const __start_vpci_array[];
>> +extern vpci_capability_t *const __end_vpci_array[];
>>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>>  
>>  #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>> @@ -83,6 +83,47 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>>  
>>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>  
>> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
>> +                                       const char *priority)
>> +{
>> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>> +    {
>> +        const vpci_capability_t *capability = __start_vpci_array[i];
>> +        const unsigned int cap_id = capability->id;
>> +        unsigned int pos;
>> +        int rc;
>> +
>> +        if ( *(capability->priority) != *priority )
>> +            continue;
>> +
>> +        if ( !capability->is_ext )
>> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
>> +        else
>> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
>> +
>> +        if ( !pos )
>> +            continue;
>> +
>> +        rc = capability->init(pdev);
>> +
>> +        if ( rc )
>> +        {
>> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
>> +                   pdev->domain, &pdev->sbdf, rc);
>> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
>> +                                   pos, capability->is_ext ? 4 : 1, NULL);
> 
> Are you sure this works as intended? 
Yes, I used failure test cases of init_msi/rebar.
From the "lspci" result, they were hidden from dom0.
But I forgot to test for domUs.

> The capability ID 0 is marked as "reserved" in the spec, so it's unclear to me how OSes would handle
> finding such capability on the list - I won't be surprised if some
> implementations decide to terminate the walk.  It's fine to mask the
> capability ID for the ones that we don't want to expose, but there's
> further work to do IMO.
> 
> The usual way to deal with masking capabilities is to short circuit
> the capability from the linked list, by making the previous capability
> "Next Capability Offset" point to the next capability in the list,
> thus skipping the current one. So:
> 
> capability[n - 1].next_cap = capability[n].next_cap
> 
> IOW: you will need to add the handler to the previous capability on
> the list.  That's how it's already done in init_header().
Oh, I got your opinion.
But we may need to discuss this more.
In my opinion, there should be two situations:
First, if the device belongs to the hardware domain, there is no emulation of the legacy or extended capability linked lists, if I understand the code correctly.
So, for this situation, I think the current implementation of my patch is enough for hiding legacy or extended capabilities.

Second, if the device belongs to a regular domain (domU), we just need to consider legacy capabilities, since all extended capabilities are hidden in init_header().
So, for this situation, I need to do what you said: "capability[n - 1].next_cap = capability[n].next_cap".

I am not sure if the above is right.
> 
>> +            if ( rc )
>> +            {
>> +                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
>> +                       pdev->domain, &pdev->sbdf, rc);
>> +                return rc;
>> +            }
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  void vpci_deassign_device(struct pci_dev *pdev)
>>  {
>>      unsigned int i;
>> @@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>>  
>>  int vpci_assign_device(struct pci_dev *pdev)
>>  {
>> -    unsigned int i;
>>      const unsigned long *ro_map;
>>      int rc = 0;
>>  
>> @@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
>>          goto out;
>>  #endif
>>  
>> -    for ( i = 0; i < NUM_VPCI_INIT; i++ )
>> -    {
>> -        rc = __start_vpci_array[i](pdev);
>> -        if ( rc )
>> -            break;
>> -    }
>> +    /*
>> +     * Capabilities with high priority like MSI-X need to
>> +     * be initialized before header
>> +     */
>> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
>> +    if ( rc )
>> +        goto out;
> 
> I understand this is not introduced by this change, but I wonder if
> there could be a way to ditch the priority stuff for capabilities,
> specially now that we only have two "priorities": before or after PCI
> header initialization.
I have an idea, but it seems like a hack.
Can we add a flag (maybe named "msix_initialized") to struct vpci{}?
Then in vpci_make_msix_hole() we can first check that flag; if it is false, we return an error so that modify_decoding() returns directly while init_header is running.
At the start of init_msix() we set msix_initialized=true, and at the end of init_msix() we call modify_decoding() to set up the p2m.
Then we can remove the priorities.

> 
> Thanks, Roger.

-- 
Best regards,
Jiqian Chen.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-31  7:26   ` Chen, Jiqian
@ 2025-03-31  8:43     ` Roger Pau Monné
  2025-03-31  8:46       ` Jan Beulich
  2025-03-31  9:32       ` Chen, Jiqian
  0 siblings, 2 replies; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-31  8:43 UTC (permalink / raw)
  To: Chen, Jiqian; +Cc: xen-devel@lists.xenproject.org, Huang, Ray

On Mon, Mar 31, 2025 at 07:26:20AM +0000, Chen, Jiqian wrote:
> On 2025/3/27 17:25, Roger Pau Monné wrote:
> > On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote:
> >> When vpci fails to initialize a capability of a device, it just
> >> return error instead of catching and processing exception. That
> >> makes the entire device unusable.
> >>
> >> So, refactor REGISTER_VPCI_INIT to contain more capability specific
> >> information, and try to hide capability when initialization fails
> >> in vpci_assign_device().
> >>
> >> What's more, change the definition of init_header() since it is
> >> not a capability and it is needed for all devices' PCI config space.
> >>
> >> Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
> >> ---
> >> Hi all,
> >>
> >> This patch aims to hide a capability when its initialization fails.
> >> That causes we can't rely on vpci_deassign_device() to clean up assigned
> >> resources, so, following two patches clean up resources in the failure
> >> path of init function.
> >>
> >> Best regards,
> >> Jiqian Chen.
> >> ---
> >>  xen/drivers/vpci/header.c |  3 +-
> >>  xen/drivers/vpci/msi.c    |  2 +-
> >>  xen/drivers/vpci/msix.c   |  2 +-
> >>  xen/drivers/vpci/rebar.c  |  2 +-
> >>  xen/drivers/vpci/vpci.c   | 65 +++++++++++++++++++++++++++++++++------
> >>  xen/include/xen/vpci.h    | 27 ++++++++++++----
> >>  6 files changed, 81 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> >> index ef6c965c081c..8c8e4ac5698a 100644
> >> --- a/xen/drivers/vpci/header.c
> >> +++ b/xen/drivers/vpci/header.c
> >> @@ -745,7 +745,7 @@ static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> >>      return !bar->mem ? -ENOMEM : 0;
> >>  }
> >>  
> >> -static int cf_check init_header(struct pci_dev *pdev)
> >> +int vpci_init_header(struct pci_dev *pdev)
> >>  {
> >>      uint16_t cmd;
> >>      uint64_t addr, size;
> >> @@ -1007,7 +1007,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> >>      pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> >>      return rc;
> >>  }
> >> -REGISTER_VPCI_INIT(init_header, VPCI_PRIORITY_MIDDLE);
> >>  
> >>  /*
> >>   * Local variables:
> >> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> >> index 66e5a8a116be..9d7a9fd8dba1 100644
> >> --- a/xen/drivers/vpci/msi.c
> >> +++ b/xen/drivers/vpci/msi.c
> >> @@ -270,7 +270,7 @@ static int cf_check init_msi(struct pci_dev *pdev)
> >>  
> >>      return 0;
> >>  }
> >> -REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
> >> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSI, init_msi, VPCI_PRIORITY_LOW);
> >>  
> >>  void vpci_dump_msi(void)
> >>  {
> >> diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
> >> index 6bd8c55bb48e..50e5f38c1e09 100644
> >> --- a/xen/drivers/vpci/msix.c
> >> +++ b/xen/drivers/vpci/msix.c
> >> @@ -753,7 +753,7 @@ static int cf_check init_msix(struct pci_dev *pdev)
> >>  
> >>      return 0;
> >>  }
> >> -REGISTER_VPCI_INIT(init_msix, VPCI_PRIORITY_HIGH);
> >> +REGISTER_VPCI_LEGACY_CAP(PCI_CAP_ID_MSIX, init_msix, VPCI_PRIORITY_HIGH);
> >>  
> >>  /*
> >>   * Local variables:
> >> diff --git a/xen/drivers/vpci/rebar.c b/xen/drivers/vpci/rebar.c
> >> index 793937449af7..7c53ee031887 100644
> >> --- a/xen/drivers/vpci/rebar.c
> >> +++ b/xen/drivers/vpci/rebar.c
> >> @@ -118,7 +118,7 @@ static int cf_check init_rebar(struct pci_dev *pdev)
> >>  
> >>      return 0;
> >>  }
> >> -REGISTER_VPCI_INIT(init_rebar, VPCI_PRIORITY_LOW);
> >> +REGISTER_VPCI_EXTEND_CAP(PCI_EXT_CAP_ID_REBAR, init_rebar, VPCI_PRIORITY_LOW);
> >>  
> >>  /*
> >>   * Local variables:
> >> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> >> index 1e6aa5d799b9..a8362e46e097 100644
> >> --- a/xen/drivers/vpci/vpci.c
> >> +++ b/xen/drivers/vpci/vpci.c
> >> @@ -36,8 +36,8 @@ struct vpci_register {
> >>  };
> >>  
> >>  #ifdef __XEN__
> >> -extern vpci_register_init_t *const __start_vpci_array[];
> >> -extern vpci_register_init_t *const __end_vpci_array[];
> >> +extern vpci_capability_t *const __start_vpci_array[];
> >> +extern vpci_capability_t *const __end_vpci_array[];
> >>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
> >>  
> >>  #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
> >> @@ -83,6 +83,47 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
> >>  
> >>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
> >>  
> >> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
> >> +                                       const char *priority)
> >> +{
> >> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
> >> +    {
> >> +        const vpci_capability_t *capability = __start_vpci_array[i];
> >> +        const unsigned int cap_id = capability->id;
> >> +        unsigned int pos;
> >> +        int rc;
> >> +
> >> +        if ( *(capability->priority) != *priority )
> >> +            continue;
> >> +
> >> +        if ( !capability->is_ext )
> >> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
> >> +        else
> >> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
> >> +
> >> +        if ( !pos )
> >> +            continue;
> >> +
> >> +        rc = capability->init(pdev);
> >> +
> >> +        if ( rc )
> >> +        {
> >> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
> >> +                   pdev->domain, &pdev->sbdf, rc);
> >> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
> >> +                                   pos, capability->is_ext ? 4 : 1, NULL);
> > 
> > Are you sure this works as intended? 
> Yes, I used failure test cases of init_msi/rebar.
> From the "lspci" result, they were hided from the dom0.
> But I forgot to test for domUs.

I assume that's only tested with Linux?  See my comment below about
capability ID 0 being reserved, and hence I think we should not keep
capabilities with ID 0 on the list, as it might cause malfunctions to
OSes.

> > The capability ID 0 is marked as "reserved" in the spec, so it's unclear to me how OSes would handle
> > finding such capability on the list - I won't be surprised if some
> > implementations decide to terminate the walk.  It's fine to mask the
> > capability ID for the ones that we don't want to expose, but there's
> > further work to do IMO.
> > 
> > The usual way to deal with masking capabilities is to short circuit
> > the capability from the linked list, by making the previous capability
> > "Next Capability Offset" point to the next capability in the list,
> > thus skipping the current one. So:
> > 
> > capability[n - 1].next_cap = capability[n].next_cap
> > 
> > IOW: you will need to add the handler to the previous capability on
> > the list.  That's how it's already done in init_header().
> Oh, I got your opinion.
> But we may need to discuss this more.
> In my opinion, there should be two situations:
> First, if device belongs to hardware domain, there is no emulation of legacy or extended capabilities linked list if I understand codes right.

Yes, but that needs to be fixed, we need to have this kind of
emulation uniformly.

> So, for this situation, I think current implementation of my patch is enough for hiding legacy or extended capabilities.

It works given the current code in Linux.  As said above, I don't
think this is fully correct according to the PCI spec.

> Second, if device belongs to common domain, we just need to consider legacy capabilities since all extended capabilities are hided in init_header().
> So, for this situation, I need to what you said " capability[n - 1].next_cap = capability[n].next_cap "

I'm not sure why we would want to handle the hardware domain vs
unprivileged domains differently here.  The way to hide the
capabilities should always be the same, like it's currently done for
domUs.

> I am not sure if above are right.
> > 
> >> +            if ( rc )
> >> +            {
> >> +                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
> >> +                       pdev->domain, &pdev->sbdf, rc);
> >> +                return rc;
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  void vpci_deassign_device(struct pci_dev *pdev)
> >>  {
> >>      unsigned int i;
> >> @@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> >>  
> >>  int vpci_assign_device(struct pci_dev *pdev)
> >>  {
> >> -    unsigned int i;
> >>      const unsigned long *ro_map;
> >>      int rc = 0;
> >>  
> >> @@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
> >>          goto out;
> >>  #endif
> >>  
> >> -    for ( i = 0; i < NUM_VPCI_INIT; i++ )
> >> -    {
> >> -        rc = __start_vpci_array[i](pdev);
> >> -        if ( rc )
> >> -            break;
> >> -    }
> >> +    /*
> >> +     * Capabilities with high priority like MSI-X need to
> >> +     * be initialized before header
> >> +     */
> >> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
> >> +    if ( rc )
> >> +        goto out;
> > 
> > I understand this is not introduced by this change, but I wonder if
> > there could be a way to ditch the priority stuff for capabilities,
> > specially now that we only have two "priorities": before or after PCI
> > header initialization.
> I have an idea, but it seems like a hake.
> Can we add a flag(maybe name it "msix_initialized") to struct vpci{}?
> Then in vpci_make_msix_hole(), we can first check that flag, if it is false, we return an error to let modify_decoding() directly return in the process of init_header.
> And in the start of init_msix(), to set msix_initialized=true, in the end of init_msix(), to call modify_decoding() to setup p2m.
> Then we can remove the priorities.

Maybe the initialization of the MSI-X capability could be done after
the header, and call vpci_make_msix_hole()?  There's a bit of
redundancy here in that the BAR is first fully mapped, and then a hole
is punched in place of the MSI-X related tables.  Seems like the
easier option to break the dependency of init_msix() on being called
ahead of init_header().

Completely unrelated: looking at vpci_make_msix_hole() I see the call
in modify_decoding() is redundant, as modify_bars() already carves the
MSI-X regions out of the BARs.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-31  8:43     ` Roger Pau Monné
@ 2025-03-31  8:46       ` Jan Beulich
  2025-03-31  9:32       ` Chen, Jiqian
  1 sibling, 0 replies; 16+ messages in thread
From: Jan Beulich @ 2025-03-31  8:46 UTC (permalink / raw)
  To: Chen, Jiqian
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Roger Pau Monné

On 31.03.2025 10:43, Roger Pau Monné wrote:
> On Mon, Mar 31, 2025 at 07:26:20AM +0000, Chen, Jiqian wrote:
>> On 2025/3/27 17:25, Roger Pau Monné wrote:
>>> On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote:
>>>> --- a/xen/drivers/vpci/vpci.c
>>>> +++ b/xen/drivers/vpci/vpci.c
>>>> @@ -36,8 +36,8 @@ struct vpci_register {
>>>>  };
>>>>  
>>>>  #ifdef __XEN__
>>>> -extern vpci_register_init_t *const __start_vpci_array[];
>>>> -extern vpci_register_init_t *const __end_vpci_array[];
>>>> +extern vpci_capability_t *const __start_vpci_array[];
>>>> +extern vpci_capability_t *const __end_vpci_array[];
>>>>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>>>>  
>>>>  #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>>>> @@ -83,6 +83,47 @@ static int assign_virtual_sbdf(struct pci_dev *pdev)
>>>>  
>>>>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>>>  
>>>> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
>>>> +                                       const char *priority)
>>>> +{
>>>> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>>>> +    {
>>>> +        const vpci_capability_t *capability = __start_vpci_array[i];
>>>> +        const unsigned int cap_id = capability->id;
>>>> +        unsigned int pos;
>>>> +        int rc;
>>>> +
>>>> +        if ( *(capability->priority) != *priority )
>>>> +            continue;
>>>> +
>>>> +        if ( !capability->is_ext )
>>>> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
>>>> +        else
>>>> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
>>>> +
>>>> +        if ( !pos )
>>>> +            continue;
>>>> +
>>>> +        rc = capability->init(pdev);
>>>> +
>>>> +        if ( rc )
>>>> +        {
>>>> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
>>>> +                   pdev->domain, &pdev->sbdf, rc);
>>>> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
>>>> +                                   pos, capability->is_ext ? 4 : 1, NULL);
>>>
>>> Are you sure this works as intended? 
>> Yes, I used failure test cases of init_msi/rebar.
>> From the "lspci" result, they were hided from the dom0.
>> But I forgot to test for domUs.
> 
> I assume that's only tested with Linux?  See my comment below about
> capability ID 0 being reserved, and hence I think we should not keep
> capabilities with ID 0 on the list, as it might cause malfunctions to
> OSes.
> 
>>> The capability ID 0 is marked as "reserved" in the spec, so it's unclear to me how OSes would handle
>>> finding such capability on the list - I won't be surprised if some
>>> implementations decide to terminate the walk.  It's fine to mask the
>>> capability ID for the ones that we don't want to expose, but there's
>>> further work to do IMO.
>>>
>>> The usual way to deal with masking capabilities is to short circuit
>>> the capability from the linked list, by making the previous capability
>>> "Next Capability Offset" point to the next capability in the list,
>>> thus skipping the current one. So:
>>>
>>> capability[n - 1].next_cap = capability[n].next_cap
>>>
>>> IOW: you will need to add the handler to the previous capability on
>>> the list.  That's how it's already done in init_header().
>> Oh, I got your opinion.
>> But we may need to discuss this more.
>> In my opinion, there should be two situations:
>> First, if device belongs to hardware domain, there is no emulation of legacy or extended capabilities linked list if I understand codes right.
> 
> Yes, but that needs to be fixed, we need to have this kind of
> emulation uniformly.
> 
>> So, for this situation, I think current implementation of my patch is enough for hiding legacy or extended capabilities.
> 
> It works given the current code in Linux.  As said above, I don't
> think this is fully correct according to the PCI spec.
> 
>> Second, if device belongs to common domain, we just need to consider legacy capabilities since all extended capabilities are hided in init_header().
>> So, for this situation, I need to what you said " capability[n - 1].next_cap = capability[n].next_cap "
> 
> I'm not sure why would want to handle the hardware domain vs
> unprivileged domains differently here.  The way to hide the
> capabilities should always be the same, like it's currently done for
> domUs.

FWIW - I agree with Roger on all the points made.

Jan


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-31  8:43     ` Roger Pau Monné
  2025-03-31  8:46       ` Jan Beulich
@ 2025-03-31  9:32       ` Chen, Jiqian
  2025-03-31 11:04         ` Roger Pau Monné
  1 sibling, 1 reply; 16+ messages in thread
From: Chen, Jiqian @ 2025-03-31  9:32 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Chen, Jiqian

On 2025/3/31 16:43, Roger Pau Monné wrote:
> On Mon, Mar 31, 2025 at 07:26:20AM +0000, Chen, Jiqian wrote:
>> On 2025/3/27 17:25, Roger Pau Monné wrote:
>>> On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote: 
>>>>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
>>>>  
>>>> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
>>>> +                                       const char *priority)
>>>> +{
>>>> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
>>>> +    {
>>>> +        const vpci_capability_t *capability = __start_vpci_array[i];
>>>> +        const unsigned int cap_id = capability->id;
>>>> +        unsigned int pos;
>>>> +        int rc;
>>>> +
>>>> +        if ( *(capability->priority) != *priority )
>>>> +            continue;
>>>> +
>>>> +        if ( !capability->is_ext )
>>>> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
>>>> +        else
>>>> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
>>>> +
>>>> +        if ( !pos )
>>>> +            continue;
>>>> +
>>>> +        rc = capability->init(pdev);
>>>> +
>>>> +        if ( rc )
>>>> +        {
>>>> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
>>>> +                   pdev->domain, &pdev->sbdf, rc);
>>>> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
>>>> +                                   pos, capability->is_ext ? 4 : 1, NULL);
>>>
>>> Are you sure this works as intended? 
>> Yes, I used failure test cases of init_msi/rebar.
>> From the "lspci" result, they were hided from the dom0.
>> But I forgot to test for domUs.
> 
> I assume that's only tested with Linux?  See my comment below about
> capability ID 0 being reserved, and hence I think we should not keep
> capabilities with ID 0 on the list, as it might cause malfunctions to
> OSes.
> 
>>> The capability ID 0 is marked as "reserved" in the spec, so it's unclear to me how OSes would handle
>>> finding such capability on the list - I won't be surprised if some
>>> implementations decide to terminate the walk.  It's fine to mask the
>>> capability ID for the ones that we don't want to expose, but there's
>>> further work to do IMO.
>>>
>>> The usual way to deal with masking capabilities is to short circuit
>>> the capability from the linked list, by making the previous capability
>>> "Next Capability Offset" point to the next capability in the list,
>>> thus skipping the current one. So:
>>>
>>> capability[n - 1].next_cap = capability[n].next_cap
>>>
>>> IOW: you will need to add the handler to the previous capability on
>>> the list.  That's how it's already done in init_header().
>> Oh, I got your opinion.
>> But we may need to discuss this more.
>> In my opinion, there should be two situations:
>> First, if device belongs to hardware domain, there is no emulation of legacy or extended capabilities linked list if I understand codes right.
> 
> Yes, but that needs to be fixed, we need to have this kind of
> emulation uniformly.
> 
>> So, for this situation, I think current implementation of my patch is enough for hiding legacy or extended capabilities.
> 
> It works given the current code in Linux.  As said above, I don't
> think this is fully correct according to the PCI spec.
> 
>> Second, if device belongs to common domain, we just need to consider legacy capabilities since all extended capabilities are hided in init_header().
>> So, for this situation, I need to what you said " capability[n - 1].next_cap = capability[n].next_cap "
> 
> I'm not sure why would want to handle the hardware domain vs
> unprivileged domains differently here.  The way to hide the
> capabilities should always be the same, like it's currently done for
> domUs.
So, I first need to refactor the PCI capability list emulation code in init_header() so that it
serves all domains (dom0 + domUs), since the current code only emulates the PCI capability list for domUs, right?

> 
>> I am not sure if above are right.
>>>
>>>> +            if ( rc )
>>>> +            {
>>>> +                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
>>>> +                       pdev->domain, &pdev->sbdf, rc);
>>>> +                return rc;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>  void vpci_deassign_device(struct pci_dev *pdev)
>>>>  {
>>>>      unsigned int i;
>>>> @@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>>>>  
>>>>  int vpci_assign_device(struct pci_dev *pdev)
>>>>  {
>>>> -    unsigned int i;
>>>>      const unsigned long *ro_map;
>>>>      int rc = 0;
>>>>  
>>>> @@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
>>>>          goto out;
>>>>  #endif
>>>>  
>>>> -    for ( i = 0; i < NUM_VPCI_INIT; i++ )
>>>> -    {
>>>> -        rc = __start_vpci_array[i](pdev);
>>>> -        if ( rc )
>>>> -            break;
>>>> -    }
>>>> +    /*
>>>> +     * Capabilities with high priority like MSI-X need to
>>>> +     * be initialized before header
>>>> +     */
>>>> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
>>>> +    if ( rc )
>>>> +        goto out;
>>>
>>> I understand this is not introduced by this change, but I wonder if
>>> there could be a way to ditch the priority stuff for capabilities,
>>> specially now that we only have two "priorities": before or after PCI
>>> header initialization.
>> I have an idea, but it seems like a hake.
>> Can we add a flag(maybe name it "msix_initialized") to struct vpci{}?
>> Then in vpci_make_msix_hole(), we can first check that flag, if it is false, we return an error to let modify_decoding() directly return in the process of init_header.
>> And in the start of init_msix(), to set msix_initialized=true, in the end of init_msix(), to call modify_decoding() to setup p2m.
>> Then we can remove the priorities.
> 
> Maybe the initialization of the MSI-X capability could be done after
> the header, and call vpci_make_msix_hole()?  There's a bit of
> redundancy here in that the BAR is first fully mapped, and then a hole
> is punched in place of the MSI-X related tables.  Seems like the
> easier option to break the depedency of init_msix() in being called
> ahead of init_header().
You mean the sequence should be:
vpci_init_header()
vpci_init_capability() // all capabilities
vpci_make_msix_hole()

Right?

> 
> Completely unrelated: looking at vpci_make_msix_hole() I see the call
> in modify_decoding() is redundant, as modify_bars() already craves the
> MSI-X regions out of the BARs.
> 
> Thanks, Roger.

-- 
Best regards,
Jiqian Chen.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-31  9:32       ` Chen, Jiqian
@ 2025-03-31 11:04         ` Roger Pau Monné
  2025-04-01  9:17           ` Chen, Jiqian
  0 siblings, 1 reply; 16+ messages in thread
From: Roger Pau Monné @ 2025-03-31 11:04 UTC (permalink / raw)
  To: Chen, Jiqian; +Cc: xen-devel@lists.xenproject.org, Huang, Ray

On Mon, Mar 31, 2025 at 09:32:02AM +0000, Chen, Jiqian wrote:
> On 2025/3/31 16:43, Roger Pau Monné wrote:
> > On Mon, Mar 31, 2025 at 07:26:20AM +0000, Chen, Jiqian wrote:
> >> On 2025/3/27 17:25, Roger Pau Monné wrote:
> >>> On Thu, Mar 27, 2025 at 03:32:12PM +0800, Jiqian Chen wrote: 
> >>>>  #endif /* CONFIG_HAS_VPCI_GUEST_SUPPORT */
> >>>>  
> >>>> +static int vpci_init_cap_with_priority(struct pci_dev *pdev,
> >>>> +                                       const char *priority)
> >>>> +{
> >>>> +    for ( unsigned int i = 0; i < NUM_VPCI_INIT; i++ )
> >>>> +    {
> >>>> +        const vpci_capability_t *capability = __start_vpci_array[i];
> >>>> +        const unsigned int cap_id = capability->id;
> >>>> +        unsigned int pos;
> >>>> +        int rc;
> >>>> +
> >>>> +        if ( *(capability->priority) != *priority )
> >>>> +            continue;
> >>>> +
> >>>> +        if ( !capability->is_ext )
> >>>> +            pos = pci_find_cap_offset(pdev->sbdf, cap_id);
> >>>> +        else
> >>>> +            pos = pci_find_ext_capability(pdev->sbdf, cap_id);
> >>>> +
> >>>> +        if ( !pos )
> >>>> +            continue;
> >>>> +
> >>>> +        rc = capability->init(pdev);
> >>>> +
> >>>> +        if ( rc )
> >>>> +        {
> >>>> +            printk(XENLOG_WARNING "%pd %pp: cap init fail rc=%d, try to hide\n",
> >>>> +                   pdev->domain, &pdev->sbdf, rc);
> >>>> +            rc = vpci_add_register(pdev->vpci, vpci_read_val, NULL,
> >>>> +                                   pos, capability->is_ext ? 4 : 1, NULL);
> >>>
> >>> Are you sure this works as intended? 
> >> Yes, I used failure test cases of init_msi/rebar.
> >> From the "lspci" result, they were hidden from dom0.
> >> But I forgot to test for domUs.
> > 
> > I assume that's only tested with Linux?  See my comment below about
> > capability ID 0 being reserved, and hence I think we should not keep
> > capabilities with ID 0 on the list, as it might cause malfunctions to
> > OSes.
> > 
> >>> The capability ID 0 is marked as "reserved" in the spec, so it's unclear to me how OSes would handle
> >>> finding such capability on the list - I won't be surprised if some
> >>> implementations decide to terminate the walk.  It's fine to mask the
> >>> capability ID for the ones that we don't want to expose, but there's
> >>> further work to do IMO.
> >>>
> >>> The usual way to deal with masking capabilities is to short circuit
> >>> the capability from the linked list, by making the previous capability
> >>> "Next Capability Offset" point to the next capability in the list,
> >>> thus skipping the current one. So:
> >>>
> >>> capability[n - 1].next_cap = capability[n].next_cap
> >>>
> >>> IOW: you will need to add the handler to the previous capability on
> >>> the list.  That's how it's already done in init_header().
> >> Oh, I got your opinion.
> >> But we may need to discuss this more.
> >> In my opinion, there should be two situations:
> >> First, if the device belongs to the hardware domain, there is no emulation of the legacy or extended capability linked lists, if I understand the code correctly.
> > 
> > Yes, but that needs to be fixed, we need to have this kind of
> > emulation uniformly.
> > 
> >> So, for this situation, I think the current implementation of my patch is enough for hiding legacy or extended capabilities.
> > 
> > It works given the current code in Linux.  As said above, I don't
> > think this is fully correct according to the PCI spec.
> > 
> >> Second, if the device belongs to a common domain, we just need to consider legacy capabilities, since all extended capabilities are hidden in init_header().
> >> So, for this situation, I need to do what you said: " capability[n - 1].next_cap = capability[n].next_cap "
> > 
> > I'm not sure why we would want to handle the hardware domain vs
> > unprivileged domains differently here.  The way to hide the
> > capabilities should always be the same, like it's currently done for
> > domUs.
> So, I first need to refactor the PCI capability list emulation code in init_header() so that it serves
> all domains (dom0 + domUs), since the current code only emulates the PCI capability list for domUs, right?

Yes, that would be my understanding.  The current logic in
init_header() needs to be expanded/generalized so it can be used for
masking random PCI capabilities, plus adapted to work with PCIe
capabilities also.

> > 
> >> I am not sure if above are right.
> >>>
> >>>> +            if ( rc )
> >>>> +            {
> >>>> +                printk(XENLOG_ERR "%pd %pp: fail to hide cap rc=%d\n",
> >>>> +                       pdev->domain, &pdev->sbdf, rc);
> >>>> +                return rc;
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>>  void vpci_deassign_device(struct pci_dev *pdev)
> >>>>  {
> >>>>      unsigned int i;
> >>>> @@ -128,7 +169,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> >>>>  
> >>>>  int vpci_assign_device(struct pci_dev *pdev)
> >>>>  {
> >>>> -    unsigned int i;
> >>>>      const unsigned long *ro_map;
> >>>>      int rc = 0;
> >>>>  
> >>>> @@ -159,12 +199,19 @@ int vpci_assign_device(struct pci_dev *pdev)
> >>>>          goto out;
> >>>>  #endif
> >>>>  
> >>>> -    for ( i = 0; i < NUM_VPCI_INIT; i++ )
> >>>> -    {
> >>>> -        rc = __start_vpci_array[i](pdev);
> >>>> -        if ( rc )
> >>>> -            break;
> >>>> -    }
> >>>> +    /*
> >>>> +     * Capabilities with high priority like MSI-X need to
> >>>> +     * be initialized before header
> >>>> +     */
> >>>> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
> >>>> +    if ( rc )
> >>>> +        goto out;
> >>>
> >>> I understand this is not introduced by this change, but I wonder if
> >>> there could be a way to ditch the priority stuff for capabilities,
> >>> especially now that we only have two "priorities": before or after PCI
> >>> header initialization.
> >> I have an idea, but it seems like a hack.
> >> Can we add a flag(maybe name it "msix_initialized") to struct vpci{}?
> >> Then in vpci_make_msix_hole(), we can first check that flag, if it is false, we return an error to let modify_decoding() directly return in the process of init_header.
> >> And in the start of init_msix(), to set msix_initialized=true, in the end of init_msix(), to call modify_decoding() to setup p2m.
> >> Then we can remove the priorities.
> > 
> > Maybe the initialization of the MSI-X capability could be done after
> > the header, and call vpci_make_msix_hole()?  There's a bit of
> > redundancy here in that the BAR is first fully mapped, and then a hole
> > is punched in place of the MSI-X related tables.  Seems like the
> > easier option to break the dependency of init_msix() on being called
> > ahead of init_header().
> You mean the sequence should be:
> vpci_init_header()
> vpci_init_capability() // all capabilities
> vpci_make_msix_hole()
> 
> Right?

Yes, I think that would be my preference.  The call to
vpci_make_msix_hole() should be inside of init_msix().

Thanks, Roger.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/3] vpci: Hide capability when it fails to initialize
  2025-03-31 11:04         ` Roger Pau Monné
@ 2025-04-01  9:17           ` Chen, Jiqian
  0 siblings, 0 replies; 16+ messages in thread
From: Chen, Jiqian @ 2025-04-01  9:17 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: xen-devel@lists.xenproject.org, Huang, Ray, Chen, Jiqian

On 2025/3/31 19:04, Roger Pau Monné wrote:
> On Mon, Mar 31, 2025 at 09:32:02AM +0000, Chen, Jiqian wrote:
>> So, I first need to refactor the PCI capability list emulation code in init_header() so that it serves
>> all domains (dom0 + domUs), since the current code only emulates the PCI capability list for domUs, right?
> 
> Yes, that would be my understanding.  The current logic in
> init_header() needs to be expanded/generalized so it can be used for
> masking random PCI capabilities, plus adapted to work with PCIe
> capabilities also.
OK, I will try to refactor the logic in the next version.
Hopefully the next version will be more in line with your ideas.
Thanks!
> 
>>>
>>>>
>>>>>
>>>>>> +    /*
>>>>>> +     * Capabilities with high priority like MSI-X need to
>>>>>> +     * be initialized before header
>>>>>> +     */
>>>>>> +    rc = vpci_init_cap_with_priority(pdev, VPCI_PRIORITY_HIGH);
>>>>>> +    if ( rc )
>>>>>> +        goto out;
>>>>>
>>>>> I understand this is not introduced by this change, but I wonder if
>>>>> there could be a way to ditch the priority stuff for capabilities,
>>>>> especially now that we only have two "priorities": before or after PCI
>>>>> header initialization.
>>>> I have an idea, but it seems like a hack.
>>>> Can we add a flag(maybe name it "msix_initialized") to struct vpci{}?
>>>> Then in vpci_make_msix_hole(), we can first check that flag, if it is false, we return an error to let modify_decoding() directly return in the process of init_header.
>>>> And in the start of init_msix(), to set msix_initialized=true, in the end of init_msix(), to call modify_decoding() to setup p2m.
>>>> Then we can remove the priorities.
>>>
>>> Maybe the initialization of the MSI-X capability could be done after
>>> the header, and call vpci_make_msix_hole()?  There's a bit of
>>> redundancy here in that the BAR is first fully mapped, and then a hole
>>> is punched in place of the MSI-X related tables.  Seems like the
>>> easier option to break the dependency of init_msix() on being called
>>> ahead of init_header().
>> You mean the sequence should be:
>> vpci_init_header()
>> vpci_init_capability() // all capabilities
>> vpci_make_msix_hole()
>>
>> Right?
> 
> Yes, I think that would be my preference.  The call to
> vpci_make_msix_hole() should be inside of init_msix().
Got it, will do in the next version.

> 
> Thanks, Roger.

-- 
Best regards,
Jiqian Chen.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-04-01  9:18 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-27  7:32 [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Jiqian Chen
2025-03-27  7:32 ` [PATCH v1 2/3] vpci/rebar: Remove registers when init_rebar() fails Jiqian Chen
2025-03-27 12:38   ` Roger Pau Monné
2025-03-27  7:32 ` [PATCH v1 3/3] vpci/msi: Remove registers when init_msi() fails Jiqian Chen
2025-03-27 12:44   ` Roger Pau Monné
2025-03-31  8:13     ` Chen, Jiqian
2025-03-31  8:53       ` Roger Pau Monné
2025-03-31  9:43         ` Chen, Jiqian
2025-03-31 11:12           ` Roger Pau Monné
2025-03-27  9:25 ` [PATCH v1 1/3] vpci: Hide capability when it fails to initialize Roger Pau Monné
2025-03-31  7:26   ` Chen, Jiqian
2025-03-31  8:43     ` Roger Pau Monné
2025-03-31  8:46       ` Jan Beulich
2025-03-31  9:32       ` Chen, Jiqian
2025-03-31 11:04         ` Roger Pau Monné
2025-04-01  9:17           ` Chen, Jiqian

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.