All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Roger Pau Monné" <roger.pau@citrix.com>
To: Stewart Hildebrand <stewart.hildebrand@amd.com>
Cc: xen-devel@lists.xenproject.org,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Anthony PERARD <anthony.perard@vates.tech>,
	Michal Orzel <michal.orzel@amd.com>,
	Jan Beulich <jbeulich@suse.com>, Julien Grall <julien@xen.org>,
	Stefano Stabellini <sstabellini@kernel.org>
Subject: Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Date: Thu, 24 Jul 2025 18:44:32 +0200	[thread overview]
Message-ID: <aIJi8E2BC-dzAIz8@macbook.local> (raw)
In-Reply-To: <20250723163744.13095-2-stewart.hildebrand@amd.com>

On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> general vPCI state: in particular, move the per-BAR rangeset out of
> struct vpci and into the map task struct.
> 
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
> 
> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> ---
>  xen/common/domain.c       |   4 +
>  xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
>  xen/drivers/vpci/vpci.c   |   3 -
>  xen/include/xen/vpci.h    |  16 +++-
>  4 files changed, 139 insertions(+), 81 deletions(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 303c338ef293..214795e2d2fe 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>          d->vcpu[prev_id]->next_in_list = v;
>      }
>  
> +#ifdef CONFIG_HAS_VPCI
> +    INIT_LIST_HEAD(&v->vpci.task_queue);
> +#endif
> +
>      /* Must be called after making new vcpu visible to for_each_vcpu(). */
>      vcpu_check_shutdown(v);
>  
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bb76e707992c..df065a5f5faf 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -34,7 +34,7 @@
>  
>  struct map_data {
>      struct domain *d;
> -    const struct vpci_bar *bar;
> +    const struct vpci_bar_map *bar;
>      bool map;
>  };
>  
> @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>          ASSERT_UNREACHABLE();
>  }
>  
> -bool vpci_process_pending(struct vcpu *v)
> +static bool vpci_process_map_task(struct vpci_map_task *task)
>  {
> -    const struct pci_dev *pdev = v->vpci.pdev;
> -    struct vpci_header *header = NULL;
> +    const struct pci_dev *pdev = task->pdev;
>      unsigned int i;
>  
>      if ( !pdev )
>          return false;
>  
> -    read_lock(&v->domain->pci_lock);
> -
> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
> -    {
> -        v->vpci.pdev = NULL;
> -        read_unlock(&v->domain->pci_lock);
> +    if ( !pdev->vpci || (task->domain != pdev->domain) )
>          return false;
> -    }
>  
> -    header = &pdev->vpci->header;
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> +            .d = task->domain,
> +            .map = task->cmd & PCI_COMMAND_MEMORY,
>              .bar = bar,
>          };
>          int rc;
> @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
>          rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>  
>          if ( rc == -ERESTART )
> -        {
> -            read_unlock(&v->domain->pci_lock);
>              return true;
> -        }
>  
>          if ( rc )
>          {
>              spin_lock(&pdev->vpci->lock);
>              /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> +            modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
>                              false);
>              spin_unlock(&pdev->vpci->lock);
>  
> -            /* Clean all the rangesets */
> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> -                if ( !rangeset_is_empty(header->bars[i].mem) )
> -                     rangeset_purge(header->bars[i].mem);
> -
> -            v->vpci.pdev = NULL;
> -
> -            read_unlock(&v->domain->pci_lock);
> -
> -            if ( !is_hardware_domain(v->domain) )
> -                domain_crash(v->domain);
> +            if ( !is_hardware_domain(task->domain) )
> +                domain_crash(task->domain);
>  
>              return false;
>          }
>      }
> -    v->vpci.pdev = NULL;
>  
>      spin_lock(&pdev->vpci->lock);
> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> +    modify_decoding(pdev, task->cmd, task->rom_only);
>      spin_unlock(&pdev->vpci->lock);
>  
> -    read_unlock(&v->domain->pci_lock);
> +    return false;
> +}
> +
> +static void destroy_map_task(struct vpci_map_task *task)
> +{
> +    unsigned int i;
>  
> +    if ( !task )
> +        return;

Maybe I'm missing a case, but is there any caller that can possibly
pass a NULL task?  Not that having the check is bad, just wondering
myself.  You might consider adding an ASSERT_UNREACHABLE() if there
are no callers that pass NULL (to ensure correctness of the calling
logic, not that the function can't deal with them).

> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +        rangeset_destroy(task->bars[i].mem);
> +
> +    xfree(task);
> +}
> +
> +bool vpci_process_pending(struct vcpu *v)
> +{
> +    struct vpci_map_task *task;

Newline.

> +    read_lock(&v->domain->pci_lock);
> +
> +    while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> +                                             struct vpci_map_task,
> +                                             next)) != NULL )
> +    {
> +        if ( vpci_process_map_task(task) )
> +        {
> +            read_unlock(&v->domain->pci_lock);
> +            return true;
> +        }
> +
> +        list_del(&task->next);
> +        destroy_map_task(task);
> +    }
> +
> +    read_unlock(&v->domain->pci_lock);
>      return false;
>  }
>  
> -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            uint16_t cmd)
> +static int __init apply_map(struct vpci_map_task *task)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct domain *d = task->domain;
> +    const struct pci_dev *pdev = task->pdev;
> +    uint16_t cmd = task->cmd;
>      int rc = 0;
>      unsigned int i;
>  
>      ASSERT(rw_is_write_locked(&d->pci_lock));
>  
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = { .d = d, .map = true, .bar = bar };
>  
>          if ( rangeset_is_empty(bar->mem) )
> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>      return rc;
>  }
>  
> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> +                                            uint16_t cmd, bool rom_only)
> +{
> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);

xvzalloc() preferably.

This however introduces run-time allocations as a result of guest
actions, which is not ideal IMO.  It would be preferable to do those
allocations as part of the header initialization, and re-use them.

> +    unsigned int i;
> +
> +    if ( !task )
> +        return NULL;
> +
> +    BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +    {
> +        if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )

You possibly don't need to prepare this for VPCI_BAR_MEM64_HI BAR
types?

I think you want to use the existing MAPPABLE_BAR() macro to check
whether a BAR requires backing mapping infrastructure.

Also, a nit, but you might want to do something like:

if ( !MAPPABLE_BAR(&pdev->vpci->header.bars[i]) )
    continue;

To reduce a level of indentation.

> +        {
> +            char str[32];
> +
> +            snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> +
> +            task->bars[i].mem = rangeset_new(pdev->domain, str,
> +                                             RANGESETF_no_print);
> +
> +            if ( !task->bars[i].mem )
> +            {
> +                destroy_map_task(task);
> +                return NULL;
> +            }
> +
> +            task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> +            task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> +        }
> +    }
> +
> +    task->pdev = pdev;
> +    task->domain = pdev->domain;
> +    task->cmd = cmd;
> +    task->rom_only = rom_only;
> +
> +    return task;
> +}
> +
> +static void defer_map(struct vpci_map_task *task)
>  {
>      struct vcpu *curr = current;
>  
> @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * is mapped. This can lead to parallel mapping operations being
>       * started for the same device if the domain is not well-behaved.
>       */
> -    curr->vpci.pdev = pdev;
> -    curr->vpci.cmd = cmd;
> -    curr->vpci.rom_only = rom_only;
> +
> +    list_add_tail(&task->next, &curr->vpci.task_queue);

You could probably assert the list is empty before adding, albeit that
won't be the case after further patches are added?

>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      struct pci_dev *tmp;
>      const struct domain *d;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> +    struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
>      unsigned int i, j;
>      int rc;
>  
>      ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>  
> +    if ( !task )
> +        return -ENOMEM;
> +
>      /*
>       * Create a rangeset per BAR that represents the current device memory
>       * region and compare it against all the currently active BAR memory
> @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
>          struct vpci_bar *bar = &header->bars[i];
> +        struct rangeset *mem = task->bars[i].mem;
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>          unsigned long start_guest = PFN_DOWN(bar->guest_addr);
>          unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
>  
> -        if ( !bar->mem )
> +        if ( !mem )
>              continue;
>  
>          if ( !MAPPABLE_BAR(bar) ||
> @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              continue;
>          }
>  
> -        ASSERT(rangeset_is_empty(bar->mem));
> +        ASSERT(rangeset_is_empty(mem));
>  
>          /*
>           * Make sure that the guest set address has the same page offset
> @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              gprintk(XENLOG_G_WARNING,
>                      "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
>                      &pdev->sbdf, i, bar->guest_addr, bar->addr);
> +            destroy_map_task(task);
>              return -EINVAL;
>          }
>  
> -        rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> +        rc = rangeset_add_range(mem, start_guest, end_guest);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start_guest, end_guest, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
>  
>          /* Check for overlap with the already setup BAR ranges. */
>          for ( j = 0; j < i; j++ )
>          {
> -            struct vpci_bar *prev_bar = &header->bars[j];
> +            struct vpci_bar_map *prev_bar = &task->bars[j];
>  
>              if ( rangeset_is_empty(prev_bar->mem) )
>                  continue;
> @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start_guest, end_guest, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
>  
> -        rc = pci_sanitize_bar_memory(bar->mem);
> +        rc = pci_sanitize_bar_memory(mem);
>          if ( rc )
>          {
>              gprintk(XENLOG_WARNING,
>                      "%pp: failed to sanitize BAR#%u memory: %d\n",
>                      &pdev->sbdf, i, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
>      }
> @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>  
>          for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            const struct vpci_bar *bar = &header->bars[j];
> +            const struct vpci_bar_map *bar = &task->bars[j];
>  
>              if ( rangeset_is_empty(bar->mem) )
>                  continue;
> @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start, end, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
> @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
>                  {
>                      const struct vpci_bar *bar = &header->bars[j];
> +                    struct rangeset *mem = task->bars[j].mem;
>  
> -                    if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> +                    if ( !rangeset_overlaps_range(mem, start, end) ||
>                           /*
>                            * If only the ROM enable bit is toggled check against
>                            * other BARs in the same device for overlaps, but not
> @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                            bar->type == VPCI_BAR_ROM) )
>                          continue;
>  
> -                    rc = rangeset_remove_range(bar->mem, start, end);
> +                    rc = rangeset_remove_range(mem, start, end);
>                      if ( rc )
>                      {
>                          gprintk(XENLOG_WARNING,
>                                  "%pp: failed to remove [%lx, %lx]: %d\n",
>                                  &pdev->sbdf, start, end, rc);
> +                        destroy_map_task(task);
>                          return rc;
>                      }
>                  }
> @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, cmd);
> +        rc = apply_map(task);
> +        destroy_map_task(task);

All the destroy calls are a bit repetitive, you could consider using
an error label or similar.

> +        return rc;
>      }
>  
> -    defer_map(pdev, cmd, rom_only);
> +    defer_map(task);
>  
>      return 0;
>  }
> @@ -731,18 +800,6 @@ static void cf_check rom_write(
>      }
>  }
>  
> -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> -                            unsigned int i)
> -{
> -    char str[32];
> -
> -    snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> -
> -    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> -
> -    return !bar->mem ? -ENOMEM : 0;
> -}
> -
>  static int vpci_init_capability_list(struct pci_dev *pdev)
>  {
>      int rc;
> @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>          else
>              bars[i].type = VPCI_BAR_MEM32;
>  
> -        rc = bar_add_rangeset(pdev, &bars[i], i);
> -        if ( rc )
> -            goto fail;
> -
>          rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
>                                (i == num_bars - 1) ? PCI_BAR_LAST : 0);
>          if ( rc < 0 )
> @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>                                 4, rom);
>          if ( rc )
>              rom->type = VPCI_BAR_EMPTY;
> -        else
> -        {
> -            rc = bar_add_rangeset(pdev, rom, num_bars);
> -            if ( rc )
> -                goto fail;
> -        }
>      }
>      else if ( !is_hwdom )
>      {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c27c..7177cfce355d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>                  iounmap(pdev->vpci->msix->table[i]);
>      }
>  
> -    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> -        rangeset_destroy(pdev->vpci->header.bars[i].mem);
> -
>      xfree(pdev->vpci->msix);
>      xfree(pdev->vpci->msi);
>      xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 6a481a4e89d3..c2e75076691f 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -103,7 +103,6 @@ struct vpci {
>              uint64_t guest_addr;
>              uint64_t size;
>              uint64_t resizable_sizes;
> -            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -194,14 +193,25 @@ struct vpci {
>  #endif
>  };
>  
> -struct vpci_vcpu {
> +#ifdef __XEN__
> +struct vpci_map_task {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> +    struct list_head next;
>      const struct pci_dev *pdev;
> +    struct domain *domain;
> +    struct vpci_bar_map {
> +        uint64_t addr;
> +        uint64_t guest_addr;
> +        struct rangeset *mem;
> +    } bars[PCI_HEADER_NORMAL_NR_BARS + 1];

You might be able to use ARRAY_SIZE() to derive the number of elements
from the existing array in vpci_header struct.

Thanks, Roger.


  parent reply	other threads:[~2025-07-24 16:44 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
2025-07-24  3:00   ` dmkhn
2025-07-24  3:42     ` dmkhn
2025-07-24  7:47       ` Jan Beulich
2025-07-25 20:25         ` dmkhn
2025-07-24 16:44   ` Roger Pau Monné [this message]
2025-07-25  7:58     ` Roger Pau Monné
2025-08-01 21:06       ` Stewart Hildebrand
2025-08-04 13:55         ` Roger Pau Monné
2025-08-04 13:57           ` Jan Beulich
2025-08-04 15:02             ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
2025-07-25  8:13   ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aIJi8E2BC-dzAIz8@macbook.local \
    --to=roger.pau@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=anthony.perard@vates.tech \
    --cc=jbeulich@suse.com \
    --cc=julien@xen.org \
    --cc=michal.orzel@amd.com \
    --cc=sstabellini@kernel.org \
    --cc=stewart.hildebrand@amd.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.