From: dmkhn@proton.me
To: "Stewart Hildebrand" <stewart.hildebrand@amd.com>,
xen-devel@lists.xenproject.org,
"Andrew Cooper" <andrew.cooper3@citrix.com>,
"Anthony PERARD" <anthony.perard@vates.tech>,
"Michal Orzel" <michal.orzel@amd.com>,
"Jan Beulich" <jbeulich@suse.com>,
"Julien Grall" <julien@xen.org>,
"Roger Pau Monné" <roger.pau@citrix.com>,
"Stefano Stabellini" <sstabellini@kernel.org>
Subject: Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Date: Thu, 24 Jul 2025 03:42:09 +0000 [thread overview]
Message-ID: <aIGrin7bHQr5KvtJ@kraken> (raw)
In-Reply-To: <aIGh2i5+Z2CW0mPr@kraken>
On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> > Introduce vPCI BAR mapping task queue. Decouple map operation state from
> > general vPCI state: in particular, move the per-BAR rangeset out of
> > struct vpci and into the map task struct.
> >
> > This is preparatory work for further changes that need to perform
> > multiple unmap/map operations before returning to guest.
> >
> > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > ---
> > v1->v2:
> > * new patch
> >
> > Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> > ---
> > xen/common/domain.c | 4 +
> > xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
> > xen/drivers/vpci/vpci.c | 3 -
> > xen/include/xen/vpci.h | 16 +++-
> > 4 files changed, 139 insertions(+), 81 deletions(-)
> >
> > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > index 303c338ef293..214795e2d2fe 100644
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> > d->vcpu[prev_id]->next_in_list = v;
> > }
> >
> > +#ifdef CONFIG_HAS_VPCI
> > + INIT_LIST_HEAD(&v->vpci.task_queue);
> > +#endif
> > +
> > /* Must be called after making new vcpu visible to for_each_vcpu(). */
> > vcpu_check_shutdown(v);
> >
> > diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> > index bb76e707992c..df065a5f5faf 100644
> > --- a/xen/drivers/vpci/header.c
> > +++ b/xen/drivers/vpci/header.c
> > @@ -34,7 +34,7 @@
> >
> > struct map_data {
> > struct domain *d;
> > - const struct vpci_bar *bar;
> > + const struct vpci_bar_map *bar;
>
> Perhaps s/vpci_bar_map/vpci_bar_mmio/g to highlight the BAR type?
>
> > bool map;
> > };
> >
> > @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
> > ASSERT_UNREACHABLE();
> > }
> >
> > -bool vpci_process_pending(struct vcpu *v)
> > +static bool vpci_process_map_task(struct vpci_map_task *task)
> > {
> > - const struct pci_dev *pdev = v->vpci.pdev;
> > - struct vpci_header *header = NULL;
> > + const struct pci_dev *pdev = task->pdev;
> > unsigned int i;
> >
> > if ( !pdev )
> > return false;
> >
> > - read_lock(&v->domain->pci_lock);
> > -
> > - if ( !pdev->vpci || (v->domain != pdev->domain) )
> > - {
> > - v->vpci.pdev = NULL;
> > - read_unlock(&v->domain->pci_lock);
> > + if ( !pdev->vpci || (task->domain != pdev->domain) )
> > return false;
> > - }
> >
> > - header = &pdev->vpci->header;
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > {
> > - struct vpci_bar *bar = &header->bars[i];
> > + struct vpci_bar_map *bar = &task->bars[i];
> > struct map_data data = {
> > - .d = v->domain,
> > - .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> > + .d = task->domain,
> > + .map = task->cmd & PCI_COMMAND_MEMORY,
> > .bar = bar,
> > };
> > int rc;
> > @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
> > rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> >
> > if ( rc == -ERESTART )
> > - {
> > - read_unlock(&v->domain->pci_lock);
> > return true;
> > - }
> >
> > if ( rc )
> > {
> > spin_lock(&pdev->vpci->lock);
> > /* Disable memory decoding unconditionally on failure. */
> > - modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> > + modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
> > false);
> > spin_unlock(&pdev->vpci->lock);
> >
> > - /* Clean all the rangesets */
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > - if ( !rangeset_is_empty(header->bars[i].mem) )
> > - rangeset_purge(header->bars[i].mem);
> > -
> > - v->vpci.pdev = NULL;
> > -
> > - read_unlock(&v->domain->pci_lock);
> > -
> > - if ( !is_hardware_domain(v->domain) )
> > - domain_crash(v->domain);
> > + if ( !is_hardware_domain(task->domain) )
> > + domain_crash(task->domain);
> >
> > return false;
> > }
> > }
> > - v->vpci.pdev = NULL;
> >
> > spin_lock(&pdev->vpci->lock);
> > - modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> > + modify_decoding(pdev, task->cmd, task->rom_only);
> > spin_unlock(&pdev->vpci->lock);
> >
> > - read_unlock(&v->domain->pci_lock);
> > + return false;
> > +}
> > +
> > +static void destroy_map_task(struct vpci_map_task *task)
> > +{
> > + unsigned int i;
> >
> > + if ( !task )
>
> Looks like task is never NULL in the code, suggest ASSERT().
>
> > + return;
> > +
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > + rangeset_destroy(task->bars[i].mem);
> > +
> > + xfree(task);
> > +}
> > +
> > +bool vpci_process_pending(struct vcpu *v)
> > +{
> > + struct vpci_map_task *task;
> > + read_lock(&v->domain->pci_lock);
> > +
> > + while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> > + struct vpci_map_task,
> > + next)) != NULL )
> > + {
> > + if ( vpci_process_map_task(task) )
> > + {
> > + read_unlock(&v->domain->pci_lock);
>
> I would add local variable and use it here for shorter code:
>
> int rc;
>
> read_lock(&v->domain->pci_lock);
>
> while (...)
> {
> rc = vpci_process_map_task(task);
> if ( rc )
> break;
>
> ...
> }
>
> read_unlock(&v->domain->pci_lock);
>
> return rc;
>
>
>
> > + return true;
> > + }
> > +
> > + list_del(&task->next);
> > + destroy_map_task(task);
> > + }
> > +
> > + read_unlock(&v->domain->pci_lock);
> > return false;
> > }
> >
> > -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> > - uint16_t cmd)
> > +static int __init apply_map(struct vpci_map_task *task)
> > {
> > - struct vpci_header *header = &pdev->vpci->header;
> > + struct domain *d = task->domain;
> > + const struct pci_dev *pdev = task->pdev;
> > + uint16_t cmd = task->cmd;
> > int rc = 0;
> > unsigned int i;
> >
> > ASSERT(rw_is_write_locked(&d->pci_lock));
> >
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > {
> > - struct vpci_bar *bar = &header->bars[i];
> > + struct vpci_bar_map *bar = &task->bars[i];
> > struct map_data data = { .d = d, .map = true, .bar = bar };
> >
> > if ( rangeset_is_empty(bar->mem) )
> > @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> > return rc;
> > }
> >
> > -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> > + uint16_t cmd, bool rom_only)
> > +{
> > + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> > + unsigned int i;
> > +
>
> I would move allocation outside of initializer and use preferred xvzalloc()
> variant:
>
> task = xvzalloc(struct vpci_map_task);
> > + if ( !task )
> > + return NULL;
> > +
> > + BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> > +
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > + {
> > + if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
>
> I would reduce one level of indentation by
>
> if ( pdev->vpci->header.bars[i].type < VPCI_BAR_MEM32 )
> continue;
>
>
> > + {
> > + char str[32];
> > +
> > + snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> > +
> > + task->bars[i].mem = rangeset_new(pdev->domain, str,
> > + RANGESETF_no_print);
> > +
> > + if ( !task->bars[i].mem )
> > + {
> > + destroy_map_task(task);
>
> What if allocation fails in the middle of the loop, e.g. i == 5?
> I think previously allocated rangesets in this loop should be freed.
Oh, I see that is done in destroy_map_task(); please disregard.
>
> > + return NULL;
> > + }
> > +
> > + task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> > + task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> > + }
> > + }
> > +
> > + task->pdev = pdev;
> > + task->domain = pdev->domain;
> > + task->cmd = cmd;
> > + task->rom_only = rom_only;
> > +
> > + return task;
> > +}
> > +
> > +static void defer_map(struct vpci_map_task *task)
> > {
> > struct vcpu *curr = current;
> >
> > @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > * is mapped. This can lead to parallel mapping operations being
> > * started for the same device if the domain is not well-behaved.
> > */
> > - curr->vpci.pdev = pdev;
> > - curr->vpci.cmd = cmd;
> > - curr->vpci.rom_only = rom_only;
> > +
> > + list_add_tail(&task->next, &curr->vpci.task_queue);
> > +
> > /*
> > * Raise a scheduler softirq in order to prevent the guest from resuming
> > * execution with pending mapping operations, to trigger the invocation
> > @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > struct pci_dev *tmp;
> > const struct domain *d;
> > const struct vpci_msix *msix = pdev->vpci->msix;
> > + struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
>
> Same comment re: having allocation on a separate line.
>
> > unsigned int i, j;
> > int rc;
> >
> > ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
> >
> > + if ( !task )
> > + return -ENOMEM;
> > +
> > /*
> > * Create a rangeset per BAR that represents the current device memory
> > * region and compare it against all the currently active BAR memory
> > @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > {
> > struct vpci_bar *bar = &header->bars[i];
> > + struct rangeset *mem = task->bars[i].mem;
> > unsigned long start = PFN_DOWN(bar->addr);
> > unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> > unsigned long start_guest = PFN_DOWN(bar->guest_addr);
> > unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
> >
> > - if ( !bar->mem )
> > + if ( !mem )
> > continue;
> >
> > if ( !MAPPABLE_BAR(bar) ||
> > @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > continue;
> > }
> >
> > - ASSERT(rangeset_is_empty(bar->mem));
> > + ASSERT(rangeset_is_empty(mem));
> >
> > /*
> > * Make sure that the guest set address has the same page offset
> > @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_G_WARNING,
> > "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
> > &pdev->sbdf, i, bar->guest_addr, bar->addr);
> > + destroy_map_task(task);
>
> I think using goto would be justified for doing cleanup in one place.
>
> > return -EINVAL;
> > }
> >
> > - rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> > + rc = rangeset_add_range(mem, start_guest, end_guest);
> > if ( rc )
> > {
> > printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
> > start_guest, end_guest, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> >
> > /* Check for overlap with the already setup BAR ranges. */
> > for ( j = 0; j < i; j++ )
> > {
> > - struct vpci_bar *prev_bar = &header->bars[j];
> > + struct vpci_bar_map *prev_bar = &task->bars[j];
> >
> > if ( rangeset_is_empty(prev_bar->mem) )
> > continue;
> > @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
> > &pdev->sbdf, start_guest, end_guest, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> >
> > - rc = pci_sanitize_bar_memory(bar->mem);
> > + rc = pci_sanitize_bar_memory(mem);
> > if ( rc )
> > {
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to sanitize BAR#%u memory: %d\n",
> > &pdev->sbdf, i, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >
> > for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
> > {
> > - const struct vpci_bar *bar = &header->bars[j];
> > + const struct vpci_bar_map *bar = &task->bars[j];
> >
> > if ( rangeset_is_empty(bar->mem) )
> > continue;
> > @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
> > &pdev->sbdf, start, end, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
> > {
> > const struct vpci_bar *bar = &header->bars[j];
> > + struct rangeset *mem = task->bars[j].mem;
> >
> > - if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> > + if ( !rangeset_overlaps_range(mem, start, end) ||
> > /*
> > * If only the ROM enable bit is toggled check against
> > * other BARs in the same device for overlaps, but not
> > @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > bar->type == VPCI_BAR_ROM) )
> > continue;
> >
> > - rc = rangeset_remove_range(bar->mem, start, end);
> > + rc = rangeset_remove_range(mem, start, end);
> > if ( rc )
> > {
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove [%lx, %lx]: %d\n",
> > &pdev->sbdf, start, end, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > * will always be to establish mappings and process all the BARs.
> > */
> > ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> > - return apply_map(pdev->domain, pdev, cmd);
> > + rc = apply_map(task);
> > + destroy_map_task(task);
> > + return rc;
> > }
> >
> > - defer_map(pdev, cmd, rom_only);
> > + defer_map(task);
> >
> > return 0;
> > }
> > @@ -731,18 +800,6 @@ static void cf_check rom_write(
> > }
> > }
> >
> > -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> > - unsigned int i)
> > -{
> > - char str[32];
> > -
> > - snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> > -
> > - bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> > -
> > - return !bar->mem ? -ENOMEM : 0;
> > -}
> > -
> > static int vpci_init_capability_list(struct pci_dev *pdev)
> > {
> > int rc;
> > @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> > else
> > bars[i].type = VPCI_BAR_MEM32;
> >
> > - rc = bar_add_rangeset(pdev, &bars[i], i);
> > - if ( rc )
> > - goto fail;
> > -
> > rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
> > (i == num_bars - 1) ? PCI_BAR_LAST : 0);
> > if ( rc < 0 )
> > @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> > 4, rom);
> > if ( rc )
> > rom->type = VPCI_BAR_EMPTY;
> > - else
> > - {
> > - rc = bar_add_rangeset(pdev, rom, num_bars);
> > - if ( rc )
> > - goto fail;
> > - }
> > }
> > else if ( !is_hwdom )
> > {
> > diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> > index 09988f04c27c..7177cfce355d 100644
> > --- a/xen/drivers/vpci/vpci.c
> > +++ b/xen/drivers/vpci/vpci.c
> > @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> > iounmap(pdev->vpci->msix->table[i]);
> > }
> >
> > - for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> > - rangeset_destroy(pdev->vpci->header.bars[i].mem);
> > -
> > xfree(pdev->vpci->msix);
> > xfree(pdev->vpci->msi);
> > xfree(pdev->vpci);
> > diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> > index 6a481a4e89d3..c2e75076691f 100644
> > --- a/xen/include/xen/vpci.h
> > +++ b/xen/include/xen/vpci.h
> > @@ -103,7 +103,6 @@ struct vpci {
> > uint64_t guest_addr;
> > uint64_t size;
> > uint64_t resizable_sizes;
> > - struct rangeset *mem;
> > enum {
> > VPCI_BAR_EMPTY,
> > VPCI_BAR_IO,
> > @@ -194,14 +193,25 @@ struct vpci {
> > #endif
> > };
> >
> > -struct vpci_vcpu {
> > +#ifdef __XEN__
> > +struct vpci_map_task {
> > /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> > + struct list_head next;
> > const struct pci_dev *pdev;
> > + struct domain *domain;
> > + struct vpci_bar_map {
> > + uint64_t addr;
> > + uint64_t guest_addr;
>
> Perhaps use hpa/gpa notation for naming address fields?
>
> > + struct rangeset *mem;
> > + } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
>
> I would add a brief comment for `+ 1`
>
> > uint16_t cmd;
> > bool rom_only : 1;
> > };
> >
> > -#ifdef __XEN__
> > +struct vpci_vcpu {
> > + struct list_head task_queue;
> > +};
> > +
> > void vpci_dump_msi(void);
> >
> > /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
> > --
> > 2.50.1
> >
> >
>
>
next prev parent reply other threads:[~2025-07-24 3:42 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
2025-07-24 3:00 ` dmkhn
2025-07-24 3:42 ` dmkhn [this message]
2025-07-24 7:47 ` Jan Beulich
2025-07-25 20:25 ` dmkhn
2025-07-24 16:44 ` Roger Pau Monné
2025-07-25 7:58 ` Roger Pau Monné
2025-08-01 21:06 ` Stewart Hildebrand
2025-08-04 13:55 ` Roger Pau Monné
2025-08-04 13:57 ` Jan Beulich
2025-08-04 15:02 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
2025-07-25 8:13 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aIGrin7bHQr5KvtJ@kraken \
--to=dmkhn@proton.me \
--cc=andrew.cooper3@citrix.com \
--cc=anthony.perard@vates.tech \
--cc=jbeulich@suse.com \
--cc=julien@xen.org \
--cc=michal.orzel@amd.com \
--cc=roger.pau@citrix.com \
--cc=sstabellini@kernel.org \
--cc=stewart.hildebrand@amd.com \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.