* [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled
@ 2025-07-23 16:37 Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Stewart Hildebrand @ 2025-07-23 16:37 UTC (permalink / raw)
To: xen-devel
Cc: Stewart Hildebrand, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Roger Pau Monné,
Stefano Stabellini
Pipeline: https://gitlab.com/xen-project/people/stewarthildebrand/xen/-/pipelines/1944094764
v1->v2:
* new approach with queued p2m operations
RFC->v1:
* rework BAR mapping machinery to support unmap-then-map operation
Stewart Hildebrand (3):
vpci: allow queueing of mapping operations
vpci: allow BAR map/unmap without affecting memory decoding bit
vpci: allow 32-bit BAR writes with memory decoding enabled
xen/common/domain.c | 4 +
xen/drivers/vpci/header.c | 249 ++++++++++++++++++++++++--------------
xen/drivers/vpci/vpci.c | 3 -
xen/include/xen/vpci.h | 20 ++-
4 files changed, 179 insertions(+), 97 deletions(-)
base-commit: 5c798ac8854af3528a78ca5a622c9ea68399809b
--
2.50.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
@ 2025-07-23 16:37 ` Stewart Hildebrand
2025-07-24 3:00 ` dmkhn
2025-07-24 16:44 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2 siblings, 2 replies; 15+ messages in thread
From: Stewart Hildebrand @ 2025-07-23 16:37 UTC (permalink / raw)
To: xen-devel
Cc: Stewart Hildebrand, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Roger Pau Monné,
Stefano Stabellini
Introduce vPCI BAR mapping task queue. Decouple map operation state from
general vPCI state: in particular, move the per-BAR rangeset out of
struct vpci and into the map task struct.
This is preparatory work for further changes that need to perform
multiple unmap/map operations before returning to guest.
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
v1->v2:
* new patch
Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
---
xen/common/domain.c | 4 +
xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
xen/drivers/vpci/vpci.c | 3 -
xen/include/xen/vpci.h | 16 +++-
4 files changed, 139 insertions(+), 81 deletions(-)
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 303c338ef293..214795e2d2fe 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
d->vcpu[prev_id]->next_in_list = v;
}
+#ifdef CONFIG_HAS_VPCI
+ INIT_LIST_HEAD(&v->vpci.task_queue);
+#endif
+
/* Must be called after making new vcpu visible to for_each_vcpu(). */
vcpu_check_shutdown(v);
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index bb76e707992c..df065a5f5faf 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -34,7 +34,7 @@
struct map_data {
struct domain *d;
- const struct vpci_bar *bar;
+ const struct vpci_bar_map *bar;
bool map;
};
@@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
ASSERT_UNREACHABLE();
}
-bool vpci_process_pending(struct vcpu *v)
+static bool vpci_process_map_task(struct vpci_map_task *task)
{
- const struct pci_dev *pdev = v->vpci.pdev;
- struct vpci_header *header = NULL;
+ const struct pci_dev *pdev = task->pdev;
unsigned int i;
if ( !pdev )
return false;
- read_lock(&v->domain->pci_lock);
-
- if ( !pdev->vpci || (v->domain != pdev->domain) )
- {
- v->vpci.pdev = NULL;
- read_unlock(&v->domain->pci_lock);
+ if ( !pdev->vpci || (task->domain != pdev->domain) )
return false;
- }
- header = &pdev->vpci->header;
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
{
- struct vpci_bar *bar = &header->bars[i];
+ struct vpci_bar_map *bar = &task->bars[i];
struct map_data data = {
- .d = v->domain,
- .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
+ .d = task->domain,
+ .map = task->cmd & PCI_COMMAND_MEMORY,
.bar = bar,
};
int rc;
@@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
rc = rangeset_consume_ranges(bar->mem, map_range, &data);
if ( rc == -ERESTART )
- {
- read_unlock(&v->domain->pci_lock);
return true;
- }
if ( rc )
{
spin_lock(&pdev->vpci->lock);
/* Disable memory decoding unconditionally on failure. */
- modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
+ modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
false);
spin_unlock(&pdev->vpci->lock);
- /* Clean all the rangesets */
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
- if ( !rangeset_is_empty(header->bars[i].mem) )
- rangeset_purge(header->bars[i].mem);
-
- v->vpci.pdev = NULL;
-
- read_unlock(&v->domain->pci_lock);
-
- if ( !is_hardware_domain(v->domain) )
- domain_crash(v->domain);
+ if ( !is_hardware_domain(task->domain) )
+ domain_crash(task->domain);
return false;
}
}
- v->vpci.pdev = NULL;
spin_lock(&pdev->vpci->lock);
- modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
+ modify_decoding(pdev, task->cmd, task->rom_only);
spin_unlock(&pdev->vpci->lock);
- read_unlock(&v->domain->pci_lock);
+ return false;
+}
+
+static void destroy_map_task(struct vpci_map_task *task)
+{
+ unsigned int i;
+ if ( !task )
+ return;
+
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+ rangeset_destroy(task->bars[i].mem);
+
+ xfree(task);
+}
+
+bool vpci_process_pending(struct vcpu *v)
+{
+ struct vpci_map_task *task;
+ read_lock(&v->domain->pci_lock);
+
+ while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
+ struct vpci_map_task,
+ next)) != NULL )
+ {
+ if ( vpci_process_map_task(task) )
+ {
+ read_unlock(&v->domain->pci_lock);
+ return true;
+ }
+
+ list_del(&task->next);
+ destroy_map_task(task);
+ }
+
+ read_unlock(&v->domain->pci_lock);
return false;
}
-static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
- uint16_t cmd)
+static int __init apply_map(struct vpci_map_task *task)
{
- struct vpci_header *header = &pdev->vpci->header;
+ struct domain *d = task->domain;
+ const struct pci_dev *pdev = task->pdev;
+ uint16_t cmd = task->cmd;
int rc = 0;
unsigned int i;
ASSERT(rw_is_write_locked(&d->pci_lock));
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
{
- struct vpci_bar *bar = &header->bars[i];
+ struct vpci_bar_map *bar = &task->bars[i];
struct map_data data = { .d = d, .map = true, .bar = bar };
if ( rangeset_is_empty(bar->mem) )
@@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
return rc;
}
-static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
+ uint16_t cmd, bool rom_only)
+{
+ struct vpci_map_task *task = xzalloc(struct vpci_map_task);
+ unsigned int i;
+
+ if ( !task )
+ return NULL;
+
+ BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
+
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+ {
+ if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
+ {
+ char str[32];
+
+ snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
+
+ task->bars[i].mem = rangeset_new(pdev->domain, str,
+ RANGESETF_no_print);
+
+ if ( !task->bars[i].mem )
+ {
+ destroy_map_task(task);
+ return NULL;
+ }
+
+ task->bars[i].addr = pdev->vpci->header.bars[i].addr;
+ task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
+ }
+ }
+
+ task->pdev = pdev;
+ task->domain = pdev->domain;
+ task->cmd = cmd;
+ task->rom_only = rom_only;
+
+ return task;
+}
+
+static void defer_map(struct vpci_map_task *task)
{
struct vcpu *curr = current;
@@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
* is mapped. This can lead to parallel mapping operations being
* started for the same device if the domain is not well-behaved.
*/
- curr->vpci.pdev = pdev;
- curr->vpci.cmd = cmd;
- curr->vpci.rom_only = rom_only;
+
+ list_add_tail(&task->next, &curr->vpci.task_queue);
+
/*
* Raise a scheduler softirq in order to prevent the guest from resuming
* execution with pending mapping operations, to trigger the invocation
@@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
struct pci_dev *tmp;
const struct domain *d;
const struct vpci_msix *msix = pdev->vpci->msix;
+ struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
unsigned int i, j;
int rc;
ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
+ if ( !task )
+ return -ENOMEM;
+
/*
* Create a rangeset per BAR that represents the current device memory
* region and compare it against all the currently active BAR memory
@@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
{
struct vpci_bar *bar = &header->bars[i];
+ struct rangeset *mem = task->bars[i].mem;
unsigned long start = PFN_DOWN(bar->addr);
unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
unsigned long start_guest = PFN_DOWN(bar->guest_addr);
unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
- if ( !bar->mem )
+ if ( !mem )
continue;
if ( !MAPPABLE_BAR(bar) ||
@@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
continue;
}
- ASSERT(rangeset_is_empty(bar->mem));
+ ASSERT(rangeset_is_empty(mem));
/*
* Make sure that the guest set address has the same page offset
@@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_G_WARNING,
"%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
&pdev->sbdf, i, bar->guest_addr, bar->addr);
+ destroy_map_task(task);
return -EINVAL;
}
- rc = rangeset_add_range(bar->mem, start_guest, end_guest);
+ rc = rangeset_add_range(mem, start_guest, end_guest);
if ( rc )
{
printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
start_guest, end_guest, rc);
+ destroy_map_task(task);
return rc;
}
/* Check for overlap with the already setup BAR ranges. */
for ( j = 0; j < i; j++ )
{
- struct vpci_bar *prev_bar = &header->bars[j];
+ struct vpci_bar_map *prev_bar = &task->bars[j];
if ( rangeset_is_empty(prev_bar->mem) )
continue;
@@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
&pdev->sbdf, start_guest, end_guest, rc);
+ destroy_map_task(task);
return rc;
}
}
- rc = pci_sanitize_bar_memory(bar->mem);
+ rc = pci_sanitize_bar_memory(mem);
if ( rc )
{
gprintk(XENLOG_WARNING,
"%pp: failed to sanitize BAR#%u memory: %d\n",
&pdev->sbdf, i, rc);
+ destroy_map_task(task);
return rc;
}
}
@@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
{
- const struct vpci_bar *bar = &header->bars[j];
+ const struct vpci_bar_map *bar = &task->bars[j];
if ( rangeset_is_empty(bar->mem) )
continue;
@@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
&pdev->sbdf, start, end, rc);
+ destroy_map_task(task);
return rc;
}
}
@@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
{
const struct vpci_bar *bar = &header->bars[j];
+ struct rangeset *mem = task->bars[j].mem;
- if ( !rangeset_overlaps_range(bar->mem, start, end) ||
+ if ( !rangeset_overlaps_range(mem, start, end) ||
/*
* If only the ROM enable bit is toggled check against
* other BARs in the same device for overlaps, but not
@@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
bar->type == VPCI_BAR_ROM) )
continue;
- rc = rangeset_remove_range(bar->mem, start, end);
+ rc = rangeset_remove_range(mem, start, end);
if ( rc )
{
gprintk(XENLOG_WARNING,
"%pp: failed to remove [%lx, %lx]: %d\n",
&pdev->sbdf, start, end, rc);
+ destroy_map_task(task);
return rc;
}
}
@@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
* will always be to establish mappings and process all the BARs.
*/
ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
- return apply_map(pdev->domain, pdev, cmd);
+ rc = apply_map(task);
+ destroy_map_task(task);
+ return rc;
}
- defer_map(pdev, cmd, rom_only);
+ defer_map(task);
return 0;
}
@@ -731,18 +800,6 @@ static void cf_check rom_write(
}
}
-static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
- unsigned int i)
-{
- char str[32];
-
- snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
-
- bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
-
- return !bar->mem ? -ENOMEM : 0;
-}
-
static int vpci_init_capability_list(struct pci_dev *pdev)
{
int rc;
@@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
else
bars[i].type = VPCI_BAR_MEM32;
- rc = bar_add_rangeset(pdev, &bars[i], i);
- if ( rc )
- goto fail;
-
rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
(i == num_bars - 1) ? PCI_BAR_LAST : 0);
if ( rc < 0 )
@@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
4, rom);
if ( rc )
rom->type = VPCI_BAR_EMPTY;
- else
- {
- rc = bar_add_rangeset(pdev, rom, num_bars);
- if ( rc )
- goto fail;
- }
}
else if ( !is_hwdom )
{
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 09988f04c27c..7177cfce355d 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
iounmap(pdev->vpci->msix->table[i]);
}
- for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
- rangeset_destroy(pdev->vpci->header.bars[i].mem);
-
xfree(pdev->vpci->msix);
xfree(pdev->vpci->msi);
xfree(pdev->vpci);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 6a481a4e89d3..c2e75076691f 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -103,7 +103,6 @@ struct vpci {
uint64_t guest_addr;
uint64_t size;
uint64_t resizable_sizes;
- struct rangeset *mem;
enum {
VPCI_BAR_EMPTY,
VPCI_BAR_IO,
@@ -194,14 +193,25 @@ struct vpci {
#endif
};
-struct vpci_vcpu {
+#ifdef __XEN__
+struct vpci_map_task {
/* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
+ struct list_head next;
const struct pci_dev *pdev;
+ struct domain *domain;
+ struct vpci_bar_map {
+ uint64_t addr;
+ uint64_t guest_addr;
+ struct rangeset *mem;
+ } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
uint16_t cmd;
bool rom_only : 1;
};
-#ifdef __XEN__
+struct vpci_vcpu {
+ struct list_head task_queue;
+};
+
void vpci_dump_msi(void);
/* Make sure there's a hole in the p2m for the MSIX mmio areas. */
--
2.50.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
@ 2025-07-23 16:37 ` Stewart Hildebrand
2025-07-25 8:13 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2 siblings, 1 reply; 15+ messages in thread
From: Stewart Hildebrand @ 2025-07-23 16:37 UTC (permalink / raw)
To: xen-devel; +Cc: Stewart Hildebrand, Roger Pau Monné
Introduce enum vpci_map_op and allow invoking modify_bars() without
changing the memory decoding bit.
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
v1->v2:
* new patch
---
xen/drivers/vpci/header.c | 22 +++++++++++++++-------
xen/include/xen/vpci.h | 4 ++++
2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index df065a5f5faf..1c66796b625b 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -189,7 +189,7 @@ static bool vpci_process_map_task(struct vpci_map_task *task)
struct vpci_bar_map *bar = &task->bars[i];
struct map_data data = {
.d = task->domain,
- .map = task->cmd & PCI_COMMAND_MEMORY,
+ .map = task->map_op == VPCI_MAP,
.bar = bar,
};
int rc;
@@ -298,7 +298,9 @@ static int __init apply_map(struct vpci_map_task *task)
}
static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
- uint16_t cmd, bool rom_only)
+ uint16_t cmd,
+ enum vpci_map_op map_op,
+ bool rom_only)
{
struct vpci_map_task *task = xzalloc(struct vpci_map_task);
unsigned int i;
@@ -333,6 +335,7 @@ static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
task->pdev = pdev;
task->domain = pdev->domain;
task->cmd = cmd;
+ task->map_op = map_op;
task->rom_only = rom_only;
return task;
@@ -359,13 +362,14 @@ static void defer_map(struct vpci_map_task *task)
raise_softirq(SCHEDULE_SOFTIRQ);
}
-static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static int modify_bars(const struct pci_dev *pdev, uint16_t cmd,
+ enum vpci_map_op map_op, bool rom_only)
{
struct vpci_header *header = &pdev->vpci->header;
struct pci_dev *tmp;
const struct domain *d;
const struct vpci_msix *msix = pdev->vpci->msix;
- struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
+ struct vpci_map_task *task = alloc_map_task(pdev, cmd, map_op, rom_only);
unsigned int i, j;
int rc;
@@ -614,7 +618,8 @@ static void cf_check cmd_write(
* memory decoding bit has not been changed, so leave everything as-is,
* hoping the guest will realize and try again.
*/
- modify_bars(pdev, cmd, false);
+ modify_bars(pdev, cmd, cmd & PCI_COMMAND_MEMORY ? VPCI_MAP : VPCI_UNMAP,
+ false);
else
pci_conf_write16(pdev->sbdf, reg, cmd);
}
@@ -782,7 +787,8 @@ static void cf_check rom_write(
* Pass PCI_COMMAND_MEMORY or 0 to signal a map/unmap request, note that
* this fabricated command is never going to be written to the register.
*/
- else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0, true) )
+ else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0,
+ new_enabled ? VPCI_MAP : VPCI_UNMAP, true) )
/*
* No memory has been added or removed from the p2m (because the actual
* p2m changes are deferred in defer_map) and the ROM enable bit has
@@ -1067,7 +1073,9 @@ static int cf_check init_header(struct pci_dev *pdev)
goto fail;
}
- return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, cmd, false) : 0;
+ return (cmd & PCI_COMMAND_MEMORY)
+ ? modify_bars(pdev, cmd, VPCI_MAP, false)
+ : 0;
fail:
pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index c2e75076691f..fb6cad62d418 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -205,6 +205,10 @@ struct vpci_map_task {
struct rangeset *mem;
} bars[PCI_HEADER_NORMAL_NR_BARS + 1];
uint16_t cmd;
+ enum vpci_map_op {
+ VPCI_MAP,
+ VPCI_UNMAP,
+ } map_op;
bool rom_only : 1;
};
--
2.50.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
@ 2025-07-23 16:37 ` Stewart Hildebrand
2 siblings, 0 replies; 15+ messages in thread
From: Stewart Hildebrand @ 2025-07-23 16:37 UTC (permalink / raw)
To: xen-devel; +Cc: Stewart Hildebrand, Roger Pau Monné
Currently, Xen vPCI refuses BAR writes if the BAR is mapped in p2m. If
firmware initializes a 32-bit BAR to a bad address, Linux may try to
write a new address to the 32-bit BAR without disabling memory decoding.
Since Xen refuses such writes, the BAR (and thus PCI device) will be
non-functional.
Allow the hardware domain to issue 32-bit BAR writes with memory
decoding enabled. This increases the compatibility of PVH dom0 with more
hardware.
Note that Linux aims at disabling memory decoding before writing 64-bit
BARs. Continue to refuse 64-bit BAR writes in Xen while those BARs are
mapped for now to avoid mapping half-updated BARs in p2m.
Take the opportunity to remove a stray newline in bar_write().
Resolves: https://gitlab.com/xen-project/xen/-/issues/197
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
v1->v2:
* rework on top of queued BAR map/unmap operation machinery
RFC->v1:
* keep memory decoding enabled in hardware
* allow write while memory decoding is enabled for 32-bit BARs only
* rework BAR mapping machinery to support unmap-then-map operation
---
xen/drivers/vpci/header.c | 36 ++++++++++++++++++++++++------------
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 1c66796b625b..06c1dbfd5de0 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -404,9 +404,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd,
if ( !MAPPABLE_BAR(bar) ||
(rom_only ? bar->type != VPCI_BAR_ROM
- : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
- /* Skip BARs already in the requested state. */
- bar->enabled == !!(cmd & PCI_COMMAND_MEMORY) )
+ : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
continue;
if ( !pci_check_bar(pdev, _mfn(start), _mfn(end)) )
@@ -650,19 +648,29 @@ static void cf_check bar_write(
val &= PCI_BASE_ADDRESS_MEM_MASK;
/*
- * Xen only cares whether the BAR is mapped into the p2m, so allow BAR
- * writes as long as the BAR is not mapped into the p2m.
+ * Allow 64-bit BAR writes only when the BAR is not mapped in p2m. Always
+ * allow 32-bit BAR writes.
*/
if ( bar->enabled )
{
- /* If the value written is the current one avoid printing a warning. */
- if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
- gprintk(XENLOG_WARNING,
- "%pp: ignored BAR %zu write while mapped\n",
- &pdev->sbdf, bar - pdev->vpci->header.bars + hi);
- return;
- }
+ if ( bar->type == VPCI_BAR_MEM32 )
+ {
+ if ( val == bar->addr )
+ return;
+ modify_bars(pdev, pci_conf_read16(pdev->sbdf, PCI_COMMAND),
+ VPCI_UNMAP, false);
+ }
+ else
+ {
+ /* If the value written is the same avoid printing a warning. */
+ if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
+ gprintk(XENLOG_WARNING,
+ "%pp: ignored BAR %zu write while mapped\n",
+ &pdev->sbdf, bar - pdev->vpci->header.bars + hi);
+ return;
+ }
+ }
/*
* Update the cached address, so that when memory decoding is enabled
@@ -682,6 +690,10 @@ static void cf_check bar_write(
}
pci_conf_write32(pdev->sbdf, reg, val);
+
+ if ( bar->enabled )
+ modify_bars(pdev, pci_conf_read16(pdev->sbdf, PCI_COMMAND), VPCI_MAP,
+ false);
}
static void cf_check guest_mem_bar_write(const struct pci_dev *pdev,
--
2.50.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
@ 2025-07-24 3:00 ` dmkhn
2025-07-24 3:42 ` dmkhn
2025-07-24 16:44 ` Roger Pau Monné
1 sibling, 1 reply; 15+ messages in thread
From: dmkhn @ 2025-07-24 3:00 UTC (permalink / raw)
To: Stewart Hildebrand
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Roger Pau Monné,
Stefano Stabellini
On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> general vPCI state: in particular, move the per-BAR rangeset out of
> struct vpci and into the map task struct.
>
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
>
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
>
> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> ---
> xen/common/domain.c | 4 +
> xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
> xen/drivers/vpci/vpci.c | 3 -
> xen/include/xen/vpci.h | 16 +++-
> 4 files changed, 139 insertions(+), 81 deletions(-)
>
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 303c338ef293..214795e2d2fe 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> d->vcpu[prev_id]->next_in_list = v;
> }
>
> +#ifdef CONFIG_HAS_VPCI
> + INIT_LIST_HEAD(&v->vpci.task_queue);
> +#endif
> +
> /* Must be called after making new vcpu visible to for_each_vcpu(). */
> vcpu_check_shutdown(v);
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bb76e707992c..df065a5f5faf 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -34,7 +34,7 @@
>
> struct map_data {
> struct domain *d;
> - const struct vpci_bar *bar;
> + const struct vpci_bar_map *bar;
Perhaps s/vpci_bar_map/vpci_bar_mmio/g to highlight the BAR type?
> bool map;
> };
>
> @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
> ASSERT_UNREACHABLE();
> }
>
> -bool vpci_process_pending(struct vcpu *v)
> +static bool vpci_process_map_task(struct vpci_map_task *task)
> {
> - const struct pci_dev *pdev = v->vpci.pdev;
> - struct vpci_header *header = NULL;
> + const struct pci_dev *pdev = task->pdev;
> unsigned int i;
>
> if ( !pdev )
> return false;
>
> - read_lock(&v->domain->pci_lock);
> -
> - if ( !pdev->vpci || (v->domain != pdev->domain) )
> - {
> - v->vpci.pdev = NULL;
> - read_unlock(&v->domain->pci_lock);
> + if ( !pdev->vpci || (task->domain != pdev->domain) )
> return false;
> - }
>
> - header = &pdev->vpci->header;
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> {
> - struct vpci_bar *bar = &header->bars[i];
> + struct vpci_bar_map *bar = &task->bars[i];
> struct map_data data = {
> - .d = v->domain,
> - .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> + .d = task->domain,
> + .map = task->cmd & PCI_COMMAND_MEMORY,
> .bar = bar,
> };
> int rc;
> @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
> rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>
> if ( rc == -ERESTART )
> - {
> - read_unlock(&v->domain->pci_lock);
> return true;
> - }
>
> if ( rc )
> {
> spin_lock(&pdev->vpci->lock);
> /* Disable memory decoding unconditionally on failure. */
> - modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> + modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
> false);
> spin_unlock(&pdev->vpci->lock);
>
> - /* Clean all the rangesets */
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> - if ( !rangeset_is_empty(header->bars[i].mem) )
> - rangeset_purge(header->bars[i].mem);
> -
> - v->vpci.pdev = NULL;
> -
> - read_unlock(&v->domain->pci_lock);
> -
> - if ( !is_hardware_domain(v->domain) )
> - domain_crash(v->domain);
> + if ( !is_hardware_domain(task->domain) )
> + domain_crash(task->domain);
>
> return false;
> }
> }
> - v->vpci.pdev = NULL;
>
> spin_lock(&pdev->vpci->lock);
> - modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> + modify_decoding(pdev, task->cmd, task->rom_only);
> spin_unlock(&pdev->vpci->lock);
>
> - read_unlock(&v->domain->pci_lock);
> + return false;
> +}
> +
> +static void destroy_map_task(struct vpci_map_task *task)
> +{
> + unsigned int i;
>
> + if ( !task )
Looks like task is never NULL in the code, suggest ASSERT().
> + return;
> +
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> + rangeset_destroy(task->bars[i].mem);
> +
> + xfree(task);
> +}
> +
> +bool vpci_process_pending(struct vcpu *v)
> +{
> + struct vpci_map_task *task;
> + read_lock(&v->domain->pci_lock);
> +
> + while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> + struct vpci_map_task,
> + next)) != NULL )
> + {
> + if ( vpci_process_map_task(task) )
> + {
> + read_unlock(&v->domain->pci_lock);
I would add local variable and use it here for shorter code:
int rc;
read_lock(&v->domain->pci_lock);
while (...)
{
rc = vpci_process_map_task(task);
if ( rc )
break;
...
}
read_unlock(&v->domain->pci_lock);
return rc;
> + return true;
> + }
> +
> + list_del(&task->next);
> + destroy_map_task(task);
> + }
> +
> + read_unlock(&v->domain->pci_lock);
> return false;
> }
>
> -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> - uint16_t cmd)
> +static int __init apply_map(struct vpci_map_task *task)
> {
> - struct vpci_header *header = &pdev->vpci->header;
> + struct domain *d = task->domain;
> + const struct pci_dev *pdev = task->pdev;
> + uint16_t cmd = task->cmd;
> int rc = 0;
> unsigned int i;
>
> ASSERT(rw_is_write_locked(&d->pci_lock));
>
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> {
> - struct vpci_bar *bar = &header->bars[i];
> + struct vpci_bar_map *bar = &task->bars[i];
> struct map_data data = { .d = d, .map = true, .bar = bar };
>
> if ( rangeset_is_empty(bar->mem) )
> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> return rc;
> }
>
> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> + uint16_t cmd, bool rom_only)
> +{
> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> + unsigned int i;
> +
I would move allocation outside of initializer and use preferred xvzalloc()
variant:
task = xvzalloc(struct vpci_map_task);
> + if ( !task )
> + return NULL;
> +
> + BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> +
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> + {
> + if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
I would reduce one level of indentation by
if ( pdev->vpci->header.bars[i].type < VPCI_BAR_MEM32 )
continue;
> + {
> + char str[32];
> +
> + snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> +
> + task->bars[i].mem = rangeset_new(pdev->domain, str,
> + RANGESETF_no_print);
> +
> + if ( !task->bars[i].mem )
> + {
> + destroy_map_task(task);
What if allocation fails in the middle of the loop, e.g. i == 5?
I think previously allocated rangesets in this loop should be freed.
> + return NULL;
> + }
> +
> + task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> + task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> + }
> + }
> +
> + task->pdev = pdev;
> + task->domain = pdev->domain;
> + task->cmd = cmd;
> + task->rom_only = rom_only;
> +
> + return task;
> +}
> +
> +static void defer_map(struct vpci_map_task *task)
> {
> struct vcpu *curr = current;
>
> @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> * is mapped. This can lead to parallel mapping operations being
> * started for the same device if the domain is not well-behaved.
> */
> - curr->vpci.pdev = pdev;
> - curr->vpci.cmd = cmd;
> - curr->vpci.rom_only = rom_only;
> +
> + list_add_tail(&task->next, &curr->vpci.task_queue);
> +
> /*
> * Raise a scheduler softirq in order to prevent the guest from resuming
> * execution with pending mapping operations, to trigger the invocation
> @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> struct pci_dev *tmp;
> const struct domain *d;
> const struct vpci_msix *msix = pdev->vpci->msix;
> + struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
Same comment re: having allocation on a separate line.
> unsigned int i, j;
> int rc;
>
> ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>
> + if ( !task )
> + return -ENOMEM;
> +
> /*
> * Create a rangeset per BAR that represents the current device memory
> * region and compare it against all the currently active BAR memory
> @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> {
> struct vpci_bar *bar = &header->bars[i];
> + struct rangeset *mem = task->bars[i].mem;
> unsigned long start = PFN_DOWN(bar->addr);
> unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> unsigned long start_guest = PFN_DOWN(bar->guest_addr);
> unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
>
> - if ( !bar->mem )
> + if ( !mem )
> continue;
>
> if ( !MAPPABLE_BAR(bar) ||
> @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> continue;
> }
>
> - ASSERT(rangeset_is_empty(bar->mem));
> + ASSERT(rangeset_is_empty(mem));
>
> /*
> * Make sure that the guest set address has the same page offset
> @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_G_WARNING,
> "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
> &pdev->sbdf, i, bar->guest_addr, bar->addr);
> + destroy_map_task(task);
I think using goto would be justified for doing cleanup in one place.
> return -EINVAL;
> }
>
> - rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> + rc = rangeset_add_range(mem, start_guest, end_guest);
> if ( rc )
> {
> printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
> start_guest, end_guest, rc);
> + destroy_map_task(task);
> return rc;
> }
>
> /* Check for overlap with the already setup BAR ranges. */
> for ( j = 0; j < i; j++ )
> {
> - struct vpci_bar *prev_bar = &header->bars[j];
> + struct vpci_bar_map *prev_bar = &task->bars[j];
>
> if ( rangeset_is_empty(prev_bar->mem) )
> continue;
> @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
> &pdev->sbdf, start_guest, end_guest, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
>
> - rc = pci_sanitize_bar_memory(bar->mem);
> + rc = pci_sanitize_bar_memory(mem);
> if ( rc )
> {
> gprintk(XENLOG_WARNING,
> "%pp: failed to sanitize BAR#%u memory: %d\n",
> &pdev->sbdf, i, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>
> for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
> {
> - const struct vpci_bar *bar = &header->bars[j];
> + const struct vpci_bar_map *bar = &task->bars[j];
>
> if ( rangeset_is_empty(bar->mem) )
> continue;
> @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
> &pdev->sbdf, start, end, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
> {
> const struct vpci_bar *bar = &header->bars[j];
> + struct rangeset *mem = task->bars[j].mem;
>
> - if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> + if ( !rangeset_overlaps_range(mem, start, end) ||
> /*
> * If only the ROM enable bit is toggled check against
> * other BARs in the same device for overlaps, but not
> @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> bar->type == VPCI_BAR_ROM) )
> continue;
>
> - rc = rangeset_remove_range(bar->mem, start, end);
> + rc = rangeset_remove_range(mem, start, end);
> if ( rc )
> {
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove [%lx, %lx]: %d\n",
> &pdev->sbdf, start, end, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> * will always be to establish mappings and process all the BARs.
> */
> ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> - return apply_map(pdev->domain, pdev, cmd);
> + rc = apply_map(task);
> + destroy_map_task(task);
> + return rc;
> }
>
> - defer_map(pdev, cmd, rom_only);
> + defer_map(task);
>
> return 0;
> }
> @@ -731,18 +800,6 @@ static void cf_check rom_write(
> }
> }
>
> -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> - unsigned int i)
> -{
> - char str[32];
> -
> - snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> -
> - bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> -
> - return !bar->mem ? -ENOMEM : 0;
> -}
> -
> static int vpci_init_capability_list(struct pci_dev *pdev)
> {
> int rc;
> @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> else
> bars[i].type = VPCI_BAR_MEM32;
>
> - rc = bar_add_rangeset(pdev, &bars[i], i);
> - if ( rc )
> - goto fail;
> -
> rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
> (i == num_bars - 1) ? PCI_BAR_LAST : 0);
> if ( rc < 0 )
> @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> 4, rom);
> if ( rc )
> rom->type = VPCI_BAR_EMPTY;
> - else
> - {
> - rc = bar_add_rangeset(pdev, rom, num_bars);
> - if ( rc )
> - goto fail;
> - }
> }
> else if ( !is_hwdom )
> {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c27c..7177cfce355d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> iounmap(pdev->vpci->msix->table[i]);
> }
>
> - for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> - rangeset_destroy(pdev->vpci->header.bars[i].mem);
> -
> xfree(pdev->vpci->msix);
> xfree(pdev->vpci->msi);
> xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 6a481a4e89d3..c2e75076691f 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -103,7 +103,6 @@ struct vpci {
> uint64_t guest_addr;
> uint64_t size;
> uint64_t resizable_sizes;
> - struct rangeset *mem;
> enum {
> VPCI_BAR_EMPTY,
> VPCI_BAR_IO,
> @@ -194,14 +193,25 @@ struct vpci {
> #endif
> };
>
> -struct vpci_vcpu {
> +#ifdef __XEN__
> +struct vpci_map_task {
> /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> + struct list_head next;
> const struct pci_dev *pdev;
> + struct domain *domain;
> + struct vpci_bar_map {
> + uint64_t addr;
> + uint64_t guest_addr;
Perhaps use hpa/gpa notation for naming address fields?
> + struct rangeset *mem;
> + } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
I would add a brief comment for `+ 1`
> uint16_t cmd;
> bool rom_only : 1;
> };
>
> -#ifdef __XEN__
> +struct vpci_vcpu {
> + struct list_head task_queue;
> +};
> +
> void vpci_dump_msi(void);
>
> /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
> --
> 2.50.1
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-24 3:00 ` dmkhn
@ 2025-07-24 3:42 ` dmkhn
2025-07-24 7:47 ` Jan Beulich
0 siblings, 1 reply; 15+ messages in thread
From: dmkhn @ 2025-07-24 3:42 UTC (permalink / raw)
To: Stewart Hildebrand, xen-devel, Andrew Cooper, Anthony PERARD,
Michal Orzel, Jan Beulich, Julien Grall, Roger Pau Monné,
Stefano Stabellini
On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> > Introduce vPCI BAR mapping task queue. Decouple map operation state from
> > general vPCI state: in particular, move the per-BAR rangeset out of
> > struct vpci and into the map task struct.
> >
> > This is preparatory work for further changes that need to perform
> > multiple unmap/map operations before returning to guest.
> >
> > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > ---
> > v1->v2:
> > * new patch
> >
> > Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> > ---
> > xen/common/domain.c | 4 +
> > xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
> > xen/drivers/vpci/vpci.c | 3 -
> > xen/include/xen/vpci.h | 16 +++-
> > 4 files changed, 139 insertions(+), 81 deletions(-)
> >
> > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > index 303c338ef293..214795e2d2fe 100644
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> > d->vcpu[prev_id]->next_in_list = v;
> > }
> >
> > +#ifdef CONFIG_HAS_VPCI
> > + INIT_LIST_HEAD(&v->vpci.task_queue);
> > +#endif
> > +
> > /* Must be called after making new vcpu visible to for_each_vcpu(). */
> > vcpu_check_shutdown(v);
> >
> > diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> > index bb76e707992c..df065a5f5faf 100644
> > --- a/xen/drivers/vpci/header.c
> > +++ b/xen/drivers/vpci/header.c
> > @@ -34,7 +34,7 @@
> >
> > struct map_data {
> > struct domain *d;
> > - const struct vpci_bar *bar;
> > + const struct vpci_bar_map *bar;
>
> Perhaps s/vpci_bar_map/vpci_bar_mmio/g to highlight the BAR type?
>
> > bool map;
> > };
> >
> > @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
> > ASSERT_UNREACHABLE();
> > }
> >
> > -bool vpci_process_pending(struct vcpu *v)
> > +static bool vpci_process_map_task(struct vpci_map_task *task)
> > {
> > - const struct pci_dev *pdev = v->vpci.pdev;
> > - struct vpci_header *header = NULL;
> > + const struct pci_dev *pdev = task->pdev;
> > unsigned int i;
> >
> > if ( !pdev )
> > return false;
> >
> > - read_lock(&v->domain->pci_lock);
> > -
> > - if ( !pdev->vpci || (v->domain != pdev->domain) )
> > - {
> > - v->vpci.pdev = NULL;
> > - read_unlock(&v->domain->pci_lock);
> > + if ( !pdev->vpci || (task->domain != pdev->domain) )
> > return false;
> > - }
> >
> > - header = &pdev->vpci->header;
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > {
> > - struct vpci_bar *bar = &header->bars[i];
> > + struct vpci_bar_map *bar = &task->bars[i];
> > struct map_data data = {
> > - .d = v->domain,
> > - .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> > + .d = task->domain,
> > + .map = task->cmd & PCI_COMMAND_MEMORY,
> > .bar = bar,
> > };
> > int rc;
> > @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
> > rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> >
> > if ( rc == -ERESTART )
> > - {
> > - read_unlock(&v->domain->pci_lock);
> > return true;
> > - }
> >
> > if ( rc )
> > {
> > spin_lock(&pdev->vpci->lock);
> > /* Disable memory decoding unconditionally on failure. */
> > - modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> > + modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
> > false);
> > spin_unlock(&pdev->vpci->lock);
> >
> > - /* Clean all the rangesets */
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > - if ( !rangeset_is_empty(header->bars[i].mem) )
> > - rangeset_purge(header->bars[i].mem);
> > -
> > - v->vpci.pdev = NULL;
> > -
> > - read_unlock(&v->domain->pci_lock);
> > -
> > - if ( !is_hardware_domain(v->domain) )
> > - domain_crash(v->domain);
> > + if ( !is_hardware_domain(task->domain) )
> > + domain_crash(task->domain);
> >
> > return false;
> > }
> > }
> > - v->vpci.pdev = NULL;
> >
> > spin_lock(&pdev->vpci->lock);
> > - modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> > + modify_decoding(pdev, task->cmd, task->rom_only);
> > spin_unlock(&pdev->vpci->lock);
> >
> > - read_unlock(&v->domain->pci_lock);
> > + return false;
> > +}
> > +
> > +static void destroy_map_task(struct vpci_map_task *task)
> > +{
> > + unsigned int i;
> >
> > + if ( !task )
>
> Looks like task is never NULL in the code, suggest ASSERT().
>
> > + return;
> > +
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > + rangeset_destroy(task->bars[i].mem);
> > +
> > + xfree(task);
> > +}
> > +
> > +bool vpci_process_pending(struct vcpu *v)
> > +{
> > + struct vpci_map_task *task;
> > + read_lock(&v->domain->pci_lock);
> > +
> > + while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> > + struct vpci_map_task,
> > + next)) != NULL )
> > + {
> > + if ( vpci_process_map_task(task) )
> > + {
> > + read_unlock(&v->domain->pci_lock);
>
> I would add local variable and use it here for shorter code:
>
> int rc;
>
> read_lock(&v->domain->pci_lock);
>
> while (...)
> {
> rc = vpci_process_map_task(task);
> if ( rc )
> break;
>
> ...
> }
>
> read_unlock(&v->domain->pci_lock);
>
> return rc;
>
>
>
> > + return true;
> > + }
> > +
> > + list_del(&task->next);
> > + destroy_map_task(task);
> > + }
> > +
> > + read_unlock(&v->domain->pci_lock);
> > return false;
> > }
> >
> > -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> > - uint16_t cmd)
> > +static int __init apply_map(struct vpci_map_task *task)
> > {
> > - struct vpci_header *header = &pdev->vpci->header;
> > + struct domain *d = task->domain;
> > + const struct pci_dev *pdev = task->pdev;
> > + uint16_t cmd = task->cmd;
> > int rc = 0;
> > unsigned int i;
> >
> > ASSERT(rw_is_write_locked(&d->pci_lock));
> >
> > - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > {
> > - struct vpci_bar *bar = &header->bars[i];
> > + struct vpci_bar_map *bar = &task->bars[i];
> > struct map_data data = { .d = d, .map = true, .bar = bar };
> >
> > if ( rangeset_is_empty(bar->mem) )
> > @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> > return rc;
> > }
> >
> > -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> > + uint16_t cmd, bool rom_only)
> > +{
> > + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> > + unsigned int i;
> > +
>
> I would move allocation outside of initializer and use preferred xvzalloc()
> variant:
>
> task = xvzalloc(struct vpci_map_task);
> > + if ( !task )
> > + return NULL;
> > +
> > + BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> > +
> > + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> > + {
> > + if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
>
> I would reduce one level of indentation by
>
> if ( pdev->vpci->header.bars[i].type < VPCI_BAR_MEM32 )
> continue;
>
>
> > + {
> > + char str[32];
> > +
> > + snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> > +
> > + task->bars[i].mem = rangeset_new(pdev->domain, str,
> > + RANGESETF_no_print);
> > +
> > + if ( !task->bars[i].mem )
> > + {
> > + destroy_map_task(task);
>
> What if allocation fails in the middle of the loop, e.g. i == 5?
> I think previously allocated rangesets in this loop should be freed.
Oh, I see that is done in destroy_map_task(); please disregard.
>
> > + return NULL;
> > + }
> > +
> > + task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> > + task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> > + }
> > + }
> > +
> > + task->pdev = pdev;
> > + task->domain = pdev->domain;
> > + task->cmd = cmd;
> > + task->rom_only = rom_only;
> > +
> > + return task;
> > +}
> > +
> > +static void defer_map(struct vpci_map_task *task)
> > {
> > struct vcpu *curr = current;
> >
> > @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > * is mapped. This can lead to parallel mapping operations being
> > * started for the same device if the domain is not well-behaved.
> > */
> > - curr->vpci.pdev = pdev;
> > - curr->vpci.cmd = cmd;
> > - curr->vpci.rom_only = rom_only;
> > +
> > + list_add_tail(&task->next, &curr->vpci.task_queue);
> > +
> > /*
> > * Raise a scheduler softirq in order to prevent the guest from resuming
> > * execution with pending mapping operations, to trigger the invocation
> > @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > struct pci_dev *tmp;
> > const struct domain *d;
> > const struct vpci_msix *msix = pdev->vpci->msix;
> > + struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
>
> Same comment re: having allocation on a separate line.
>
> > unsigned int i, j;
> > int rc;
> >
> > ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
> >
> > + if ( !task )
> > + return -ENOMEM;
> > +
> > /*
> > * Create a rangeset per BAR that represents the current device memory
> > * region and compare it against all the currently active BAR memory
> > @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > {
> > struct vpci_bar *bar = &header->bars[i];
> > + struct rangeset *mem = task->bars[i].mem;
> > unsigned long start = PFN_DOWN(bar->addr);
> > unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> > unsigned long start_guest = PFN_DOWN(bar->guest_addr);
> > unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
> >
> > - if ( !bar->mem )
> > + if ( !mem )
> > continue;
> >
> > if ( !MAPPABLE_BAR(bar) ||
> > @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > continue;
> > }
> >
> > - ASSERT(rangeset_is_empty(bar->mem));
> > + ASSERT(rangeset_is_empty(mem));
> >
> > /*
> > * Make sure that the guest set address has the same page offset
> > @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_G_WARNING,
> > "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
> > &pdev->sbdf, i, bar->guest_addr, bar->addr);
> > + destroy_map_task(task);
>
> I think using goto would be justified for doing cleanup in one place.
>
> > return -EINVAL;
> > }
> >
> > - rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> > + rc = rangeset_add_range(mem, start_guest, end_guest);
> > if ( rc )
> > {
> > printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
> > start_guest, end_guest, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> >
> > /* Check for overlap with the already setup BAR ranges. */
> > for ( j = 0; j < i; j++ )
> > {
> > - struct vpci_bar *prev_bar = &header->bars[j];
> > + struct vpci_bar_map *prev_bar = &task->bars[j];
> >
> > if ( rangeset_is_empty(prev_bar->mem) )
> > continue;
> > @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
> > &pdev->sbdf, start_guest, end_guest, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> >
> > - rc = pci_sanitize_bar_memory(bar->mem);
> > + rc = pci_sanitize_bar_memory(mem);
> > if ( rc )
> > {
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to sanitize BAR#%u memory: %d\n",
> > &pdev->sbdf, i, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >
> > for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
> > {
> > - const struct vpci_bar *bar = &header->bars[j];
> > + const struct vpci_bar_map *bar = &task->bars[j];
> >
> > if ( rangeset_is_empty(bar->mem) )
> > continue;
> > @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
> > &pdev->sbdf, start, end, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
> > {
> > const struct vpci_bar *bar = &header->bars[j];
> > + struct rangeset *mem = task->bars[j].mem;
> >
> > - if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> > + if ( !rangeset_overlaps_range(mem, start, end) ||
> > /*
> > * If only the ROM enable bit is toggled check against
> > * other BARs in the same device for overlaps, but not
> > @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > bar->type == VPCI_BAR_ROM) )
> > continue;
> >
> > - rc = rangeset_remove_range(bar->mem, start, end);
> > + rc = rangeset_remove_range(mem, start, end);
> > if ( rc )
> > {
> > gprintk(XENLOG_WARNING,
> > "%pp: failed to remove [%lx, %lx]: %d\n",
> > &pdev->sbdf, start, end, rc);
> > + destroy_map_task(task);
> > return rc;
> > }
> > }
> > @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > * will always be to establish mappings and process all the BARs.
> > */
> > ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> > - return apply_map(pdev->domain, pdev, cmd);
> > + rc = apply_map(task);
> > + destroy_map_task(task);
> > + return rc;
> > }
> >
> > - defer_map(pdev, cmd, rom_only);
> > + defer_map(task);
> >
> > return 0;
> > }
> > @@ -731,18 +800,6 @@ static void cf_check rom_write(
> > }
> > }
> >
> > -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> > - unsigned int i)
> > -{
> > - char str[32];
> > -
> > - snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> > -
> > - bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> > -
> > - return !bar->mem ? -ENOMEM : 0;
> > -}
> > -
> > static int vpci_init_capability_list(struct pci_dev *pdev)
> > {
> > int rc;
> > @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> > else
> > bars[i].type = VPCI_BAR_MEM32;
> >
> > - rc = bar_add_rangeset(pdev, &bars[i], i);
> > - if ( rc )
> > - goto fail;
> > -
> > rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
> > (i == num_bars - 1) ? PCI_BAR_LAST : 0);
> > if ( rc < 0 )
> > @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> > 4, rom);
> > if ( rc )
> > rom->type = VPCI_BAR_EMPTY;
> > - else
> > - {
> > - rc = bar_add_rangeset(pdev, rom, num_bars);
> > - if ( rc )
> > - goto fail;
> > - }
> > }
> > else if ( !is_hwdom )
> > {
> > diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> > index 09988f04c27c..7177cfce355d 100644
> > --- a/xen/drivers/vpci/vpci.c
> > +++ b/xen/drivers/vpci/vpci.c
> > @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> > iounmap(pdev->vpci->msix->table[i]);
> > }
> >
> > - for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> > - rangeset_destroy(pdev->vpci->header.bars[i].mem);
> > -
> > xfree(pdev->vpci->msix);
> > xfree(pdev->vpci->msi);
> > xfree(pdev->vpci);
> > diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> > index 6a481a4e89d3..c2e75076691f 100644
> > --- a/xen/include/xen/vpci.h
> > +++ b/xen/include/xen/vpci.h
> > @@ -103,7 +103,6 @@ struct vpci {
> > uint64_t guest_addr;
> > uint64_t size;
> > uint64_t resizable_sizes;
> > - struct rangeset *mem;
> > enum {
> > VPCI_BAR_EMPTY,
> > VPCI_BAR_IO,
> > @@ -194,14 +193,25 @@ struct vpci {
> > #endif
> > };
> >
> > -struct vpci_vcpu {
> > +#ifdef __XEN__
> > +struct vpci_map_task {
> > /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> > + struct list_head next;
> > const struct pci_dev *pdev;
> > + struct domain *domain;
> > + struct vpci_bar_map {
> > + uint64_t addr;
> > + uint64_t guest_addr;
>
> Perhaps use hpa/gpa notation for naming address fields?
>
> > + struct rangeset *mem;
> > + } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
>
> I would add a brief comment for `+ 1`
>
> > uint16_t cmd;
> > bool rom_only : 1;
> > };
> >
> > -#ifdef __XEN__
> > +struct vpci_vcpu {
> > + struct list_head task_queue;
> > +};
> > +
> > void vpci_dump_msi(void);
> >
> > /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
> > --
> > 2.50.1
> >
> >
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-24 3:42 ` dmkhn
@ 2025-07-24 7:47 ` Jan Beulich
2025-07-25 20:25 ` dmkhn
0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2025-07-24 7:47 UTC (permalink / raw)
To: dmkhn
Cc: Stewart Hildebrand, xen-devel, Andrew Cooper, Anthony PERARD,
Michal Orzel, Julien Grall, Roger Pau Monné,
Stefano Stabellini
On 24.07.2025 05:42, dmkhn@proton.me wrote:
> On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>> Introduce vPCI BAR mapping task queue. Decouple map operation state from
>>> general vPCI state: in particular, move the per-BAR rangeset out of
>>> struct vpci and into the map task struct.
>>>
>>> This is preparatory work for further changes that need to perform
>>> multiple unmap/map operations before returning to guest.
>>>
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> ---
>>> v1->v2:
>>> * new patch
>>>
>>> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
>>> ---
>>> xen/common/domain.c | 4 +
>>> xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
>>> xen/drivers/vpci/vpci.c | 3 -
>>> xen/include/xen/vpci.h | 16 +++-
>>> 4 files changed, 139 insertions(+), 81 deletions(-)
Why did I (and many others) end up on the To: list of this reply? (Breaks my
rules of sorting incoming mail into appropriate folders, for context.) Also,
please trim reply context suitably. Without you doing so, every single reader
will need to scroll through the entirety of a long mail just to find (in this
case) a single line of reply (somewhere in the middle). Of course you
shouldn't be too agressive with trimming, to retain proper context for your
reply.
Jan
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
2025-07-24 3:00 ` dmkhn
@ 2025-07-24 16:44 ` Roger Pau Monné
2025-07-25 7:58 ` Roger Pau Monné
1 sibling, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2025-07-24 16:44 UTC (permalink / raw)
To: Stewart Hildebrand
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Stefano Stabellini
On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> general vPCI state: in particular, move the per-BAR rangeset out of
> struct vpci and into the map task struct.
>
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
>
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
>
> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> ---
> xen/common/domain.c | 4 +
> xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
> xen/drivers/vpci/vpci.c | 3 -
> xen/include/xen/vpci.h | 16 +++-
> 4 files changed, 139 insertions(+), 81 deletions(-)
>
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 303c338ef293..214795e2d2fe 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> d->vcpu[prev_id]->next_in_list = v;
> }
>
> +#ifdef CONFIG_HAS_VPCI
> + INIT_LIST_HEAD(&v->vpci.task_queue);
> +#endif
> +
> /* Must be called after making new vcpu visible to for_each_vcpu(). */
> vcpu_check_shutdown(v);
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bb76e707992c..df065a5f5faf 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -34,7 +34,7 @@
>
> struct map_data {
> struct domain *d;
> - const struct vpci_bar *bar;
> + const struct vpci_bar_map *bar;
> bool map;
> };
>
> @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
> ASSERT_UNREACHABLE();
> }
>
> -bool vpci_process_pending(struct vcpu *v)
> +static bool vpci_process_map_task(struct vpci_map_task *task)
> {
> - const struct pci_dev *pdev = v->vpci.pdev;
> - struct vpci_header *header = NULL;
> + const struct pci_dev *pdev = task->pdev;
> unsigned int i;
>
> if ( !pdev )
> return false;
>
> - read_lock(&v->domain->pci_lock);
> -
> - if ( !pdev->vpci || (v->domain != pdev->domain) )
> - {
> - v->vpci.pdev = NULL;
> - read_unlock(&v->domain->pci_lock);
> + if ( !pdev->vpci || (task->domain != pdev->domain) )
> return false;
> - }
>
> - header = &pdev->vpci->header;
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> {
> - struct vpci_bar *bar = &header->bars[i];
> + struct vpci_bar_map *bar = &task->bars[i];
> struct map_data data = {
> - .d = v->domain,
> - .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> + .d = task->domain,
> + .map = task->cmd & PCI_COMMAND_MEMORY,
> .bar = bar,
> };
> int rc;
> @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
> rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>
> if ( rc == -ERESTART )
> - {
> - read_unlock(&v->domain->pci_lock);
> return true;
> - }
>
> if ( rc )
> {
> spin_lock(&pdev->vpci->lock);
> /* Disable memory decoding unconditionally on failure. */
> - modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> + modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
> false);
> spin_unlock(&pdev->vpci->lock);
>
> - /* Clean all the rangesets */
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> - if ( !rangeset_is_empty(header->bars[i].mem) )
> - rangeset_purge(header->bars[i].mem);
> -
> - v->vpci.pdev = NULL;
> -
> - read_unlock(&v->domain->pci_lock);
> -
> - if ( !is_hardware_domain(v->domain) )
> - domain_crash(v->domain);
> + if ( !is_hardware_domain(task->domain) )
> + domain_crash(task->domain);
>
> return false;
> }
> }
> - v->vpci.pdev = NULL;
>
> spin_lock(&pdev->vpci->lock);
> - modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> + modify_decoding(pdev, task->cmd, task->rom_only);
> spin_unlock(&pdev->vpci->lock);
>
> - read_unlock(&v->domain->pci_lock);
> + return false;
> +}
> +
> +static void destroy_map_task(struct vpci_map_task *task)
> +{
> + unsigned int i;
>
> + if ( !task )
> + return;
Maybe I'm missing a case, but is there any caller that can possibly
pass a NULL task? Not that having the check is bad, just wondering
myself. You might consider adding an ASSERT_UNREACHABLE() if there
are no callers that pass NULL (to ensure correctness of the calling
logic, not that the function can't deal with them).
> +
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> + rangeset_destroy(task->bars[i].mem);
> +
> + xfree(task);
> +}
> +
> +bool vpci_process_pending(struct vcpu *v)
> +{
> + struct vpci_map_task *task;
Newline.
> + read_lock(&v->domain->pci_lock);
> +
> + while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> + struct vpci_map_task,
> + next)) != NULL )
> + {
> + if ( vpci_process_map_task(task) )
> + {
> + read_unlock(&v->domain->pci_lock);
> + return true;
> + }
> +
> + list_del(&task->next);
> + destroy_map_task(task);
> + }
> +
> + read_unlock(&v->domain->pci_lock);
> return false;
> }
>
> -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> - uint16_t cmd)
> +static int __init apply_map(struct vpci_map_task *task)
> {
> - struct vpci_header *header = &pdev->vpci->header;
> + struct domain *d = task->domain;
> + const struct pci_dev *pdev = task->pdev;
> + uint16_t cmd = task->cmd;
> int rc = 0;
> unsigned int i;
>
> ASSERT(rw_is_write_locked(&d->pci_lock));
>
> - for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> {
> - struct vpci_bar *bar = &header->bars[i];
> + struct vpci_bar_map *bar = &task->bars[i];
> struct map_data data = { .d = d, .map = true, .bar = bar };
>
> if ( rangeset_is_empty(bar->mem) )
> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> return rc;
> }
>
> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> + uint16_t cmd, bool rom_only)
> +{
> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
xvzalloc() preferably.
This however introduces run-time allocations as a result of guest
actions, which is not ideal IMO. It would be preferable to do those
allocations as part of the header initialization, and re-use them.
> + unsigned int i;
> +
> + if ( !task )
> + return NULL;
> +
> + BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> +
> + for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> + {
> + if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
You possibly don't need to prepare this for VPCI_BAR_MEM64_HI BAR
types?
I think you want to use the existing MAPPABLE_BAR() macro to check
whether a BAR requires backing mapping infrastructure.
Also, a nit, but you might want to do something like:
if ( !MAPPABLE_BAR(&pdev->vpci->header.bars[i]) )
continue;
To reduce a level of indentation.
> + {
> + char str[32];
> +
> + snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> +
> + task->bars[i].mem = rangeset_new(pdev->domain, str,
> + RANGESETF_no_print);
> +
> + if ( !task->bars[i].mem )
> + {
> + destroy_map_task(task);
> + return NULL;
> + }
> +
> + task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> + task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> + }
> + }
> +
> + task->pdev = pdev;
> + task->domain = pdev->domain;
> + task->cmd = cmd;
> + task->rom_only = rom_only;
> +
> + return task;
> +}
> +
> +static void defer_map(struct vpci_map_task *task)
> {
> struct vcpu *curr = current;
>
> @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> * is mapped. This can lead to parallel mapping operations being
> * started for the same device if the domain is not well-behaved.
> */
> - curr->vpci.pdev = pdev;
> - curr->vpci.cmd = cmd;
> - curr->vpci.rom_only = rom_only;
> +
> + list_add_tail(&task->next, &curr->vpci.task_queue);
You could probably assert the list is empty before adding, albeit that
won't be the case after further patches are added?
> /*
> * Raise a scheduler softirq in order to prevent the guest from resuming
> * execution with pending mapping operations, to trigger the invocation
> @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> struct pci_dev *tmp;
> const struct domain *d;
> const struct vpci_msix *msix = pdev->vpci->msix;
> + struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
> unsigned int i, j;
> int rc;
>
> ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>
> + if ( !task )
> + return -ENOMEM;
> +
> /*
> * Create a rangeset per BAR that represents the current device memory
> * region and compare it against all the currently active BAR memory
> @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> {
> struct vpci_bar *bar = &header->bars[i];
> + struct rangeset *mem = task->bars[i].mem;
> unsigned long start = PFN_DOWN(bar->addr);
> unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> unsigned long start_guest = PFN_DOWN(bar->guest_addr);
> unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
>
> - if ( !bar->mem )
> + if ( !mem )
> continue;
>
> if ( !MAPPABLE_BAR(bar) ||
> @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> continue;
> }
>
> - ASSERT(rangeset_is_empty(bar->mem));
> + ASSERT(rangeset_is_empty(mem));
>
> /*
> * Make sure that the guest set address has the same page offset
> @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_G_WARNING,
> "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
> &pdev->sbdf, i, bar->guest_addr, bar->addr);
> + destroy_map_task(task);
> return -EINVAL;
> }
>
> - rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> + rc = rangeset_add_range(mem, start_guest, end_guest);
> if ( rc )
> {
> printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
> start_guest, end_guest, rc);
> + destroy_map_task(task);
> return rc;
> }
>
> /* Check for overlap with the already setup BAR ranges. */
> for ( j = 0; j < i; j++ )
> {
> - struct vpci_bar *prev_bar = &header->bars[j];
> + struct vpci_bar_map *prev_bar = &task->bars[j];
>
> if ( rangeset_is_empty(prev_bar->mem) )
> continue;
> @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
> &pdev->sbdf, start_guest, end_guest, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
>
> - rc = pci_sanitize_bar_memory(bar->mem);
> + rc = pci_sanitize_bar_memory(mem);
> if ( rc )
> {
> gprintk(XENLOG_WARNING,
> "%pp: failed to sanitize BAR#%u memory: %d\n",
> &pdev->sbdf, i, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>
> for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
> {
> - const struct vpci_bar *bar = &header->bars[j];
> + const struct vpci_bar_map *bar = &task->bars[j];
>
> if ( rangeset_is_empty(bar->mem) )
> continue;
> @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
> &pdev->sbdf, start, end, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
> {
> const struct vpci_bar *bar = &header->bars[j];
> + struct rangeset *mem = task->bars[j].mem;
>
> - if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> + if ( !rangeset_overlaps_range(mem, start, end) ||
> /*
> * If only the ROM enable bit is toggled check against
> * other BARs in the same device for overlaps, but not
> @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> bar->type == VPCI_BAR_ROM) )
> continue;
>
> - rc = rangeset_remove_range(bar->mem, start, end);
> + rc = rangeset_remove_range(mem, start, end);
> if ( rc )
> {
> gprintk(XENLOG_WARNING,
> "%pp: failed to remove [%lx, %lx]: %d\n",
> &pdev->sbdf, start, end, rc);
> + destroy_map_task(task);
> return rc;
> }
> }
> @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> * will always be to establish mappings and process all the BARs.
> */
> ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> - return apply_map(pdev->domain, pdev, cmd);
> + rc = apply_map(task);
> + destroy_map_task(task);
All the destroy calls are a bit repetitive, you could consider using
an error label or similar.
> + return rc;
> }
>
> - defer_map(pdev, cmd, rom_only);
> + defer_map(task);
>
> return 0;
> }
> @@ -731,18 +800,6 @@ static void cf_check rom_write(
> }
> }
>
> -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> - unsigned int i)
> -{
> - char str[32];
> -
> - snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> -
> - bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> -
> - return !bar->mem ? -ENOMEM : 0;
> -}
> -
> static int vpci_init_capability_list(struct pci_dev *pdev)
> {
> int rc;
> @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> else
> bars[i].type = VPCI_BAR_MEM32;
>
> - rc = bar_add_rangeset(pdev, &bars[i], i);
> - if ( rc )
> - goto fail;
> -
> rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
> (i == num_bars - 1) ? PCI_BAR_LAST : 0);
> if ( rc < 0 )
> @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
> 4, rom);
> if ( rc )
> rom->type = VPCI_BAR_EMPTY;
> - else
> - {
> - rc = bar_add_rangeset(pdev, rom, num_bars);
> - if ( rc )
> - goto fail;
> - }
> }
> else if ( !is_hwdom )
> {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c27c..7177cfce355d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
> iounmap(pdev->vpci->msix->table[i]);
> }
>
> - for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> - rangeset_destroy(pdev->vpci->header.bars[i].mem);
> -
> xfree(pdev->vpci->msix);
> xfree(pdev->vpci->msi);
> xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 6a481a4e89d3..c2e75076691f 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -103,7 +103,6 @@ struct vpci {
> uint64_t guest_addr;
> uint64_t size;
> uint64_t resizable_sizes;
> - struct rangeset *mem;
> enum {
> VPCI_BAR_EMPTY,
> VPCI_BAR_IO,
> @@ -194,14 +193,25 @@ struct vpci {
> #endif
> };
>
> -struct vpci_vcpu {
> +#ifdef __XEN__
> +struct vpci_map_task {
> /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> + struct list_head next;
> const struct pci_dev *pdev;
> + struct domain *domain;
> + struct vpci_bar_map {
> + uint64_t addr;
> + uint64_t guest_addr;
> + struct rangeset *mem;
> + } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
You might be able to use ARRAY_SIZE() to derive the number of elements
from the existing array in vpci_header struct.
Thanks, Roger.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-24 16:44 ` Roger Pau Monné
@ 2025-07-25 7:58 ` Roger Pau Monné
2025-08-01 21:06 ` Stewart Hildebrand
0 siblings, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2025-07-25 7:58 UTC (permalink / raw)
To: Stewart Hildebrand
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Stefano Stabellini
On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> > @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> > return rc;
> > }
> >
> > -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> > + uint16_t cmd, bool rom_only)
> > +{
> > + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
>
> xvzalloc() preferably.
>
> This however introduces run-time allocations as a result of guest
> actions, which is not ideal IMO. It would be preferable to do those
> allocations as part of the header initialization, and re-use them.
I've been thinking over this, as I've realized that while commenting
on it, I didn't provide any alternatives.
The usage of rangesets to figure out the regions to map is already not
optimal, as adding/removing from a rangeset can lead to memory
allocations. It would be good if we could create rangesets with a
pre-allocated number of ranges (iow: a pool of struct ranges), but
that's for another patchset. I think Jan already commented on this
aspect long time ago.
I'm considering whether to allocate the deferred mapping structures
per-vCPU instead of per-device. That would for example mean moving
the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
struct instead. The point would be to not have the rangesets per
device (because there can be a lot of devices, specially for the
hardware domain), but instead have those per-vCPU. This should work
because a vCPU can only queue a single vPCI operation, from a single
device.
It should then be possible to allocate the deferred mapping structures
at vCPU creation. I also ponder if we really need a linked list to
queue them; AFAIK there can only ever be an unmapping and a mapping
operation pending (so 2 operations at most). Hence we could use a
more "fixed" structure like an array. For example in struct vpci_vcpu
you could introduce a struct vpci_map_task task[2] field?
Sorry, I know this is not a minor change to request. It shouldn't
change the overall logic much, but it would inevitably affect the
code. Let me know what you think.
Thanks, Roger.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
@ 2025-07-25 8:13 ` Roger Pau Monné
0 siblings, 0 replies; 15+ messages in thread
From: Roger Pau Monné @ 2025-07-25 8:13 UTC (permalink / raw)
To: Stewart Hildebrand; +Cc: xen-devel
On Wed, Jul 23, 2025 at 12:37:42PM -0400, Stewart Hildebrand wrote:
> Introduce enum vpci_map_op and allow invoking modify_bars() without
> changing the memory decoding bit.
>
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
> ---
> xen/drivers/vpci/header.c | 22 +++++++++++++++-------
> xen/include/xen/vpci.h | 4 ++++
> 2 files changed, 19 insertions(+), 7 deletions(-)
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index df065a5f5faf..1c66796b625b 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -189,7 +189,7 @@ static bool vpci_process_map_task(struct vpci_map_task *task)
> struct vpci_bar_map *bar = &task->bars[i];
> struct map_data data = {
> .d = task->domain,
> - .map = task->cmd & PCI_COMMAND_MEMORY,
> + .map = task->map_op == VPCI_MAP,
> .bar = bar,
> };
> int rc;
> @@ -298,7 +298,9 @@ static int __init apply_map(struct vpci_map_task *task)
> }
>
> static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> - uint16_t cmd, bool rom_only)
> + uint16_t cmd,
> + enum vpci_map_op map_op,
> + bool rom_only)
> {
> struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> unsigned int i;
> @@ -333,6 +335,7 @@ static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> task->pdev = pdev;
> task->domain = pdev->domain;
> task->cmd = cmd;
> + task->map_op = map_op;
> task->rom_only = rom_only;
>
> return task;
> @@ -359,13 +362,14 @@ static void defer_map(struct vpci_map_task *task)
> raise_softirq(SCHEDULE_SOFTIRQ);
> }
>
> -static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static int modify_bars(const struct pci_dev *pdev, uint16_t cmd,
> + enum vpci_map_op map_op, bool rom_only)
> {
> struct vpci_header *header = &pdev->vpci->header;
> struct pci_dev *tmp;
> const struct domain *d;
> const struct vpci_msix *msix = pdev->vpci->msix;
> - struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
> + struct vpci_map_task *task = alloc_map_task(pdev, cmd, map_op, rom_only);
> unsigned int i, j;
> int rc;
>
> @@ -614,7 +618,8 @@ static void cf_check cmd_write(
> * memory decoding bit has not been changed, so leave everything as-is,
> * hoping the guest will realize and try again.
> */
> - modify_bars(pdev, cmd, false);
> + modify_bars(pdev, cmd, cmd & PCI_COMMAND_MEMORY ? VPCI_MAP : VPCI_UNMAP,
> + false);
> else
> pci_conf_write16(pdev->sbdf, reg, cmd);
> }
> @@ -782,7 +787,8 @@ static void cf_check rom_write(
> * Pass PCI_COMMAND_MEMORY or 0 to signal a map/unmap request, note that
> * this fabricated command is never going to be written to the register.
> */
> - else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0, true) )
> + else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0,
> + new_enabled ? VPCI_MAP : VPCI_UNMAP, true) )
> /*
> * No memory has been added or removed from the p2m (because the actual
> * p2m changes are deferred in defer_map) and the ROM enable bit has
> @@ -1067,7 +1073,9 @@ static int cf_check init_header(struct pci_dev *pdev)
> goto fail;
> }
>
> - return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, cmd, false) : 0;
> + return (cmd & PCI_COMMAND_MEMORY)
> + ? modify_bars(pdev, cmd, VPCI_MAP, false)
> + : 0;
>
> fail:
> pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index c2e75076691f..fb6cad62d418 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -205,6 +205,10 @@ struct vpci_map_task {
> struct rangeset *mem;
> } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
> uint16_t cmd;
> + enum vpci_map_op {
> + VPCI_MAP,
> + VPCI_UNMAP,
> + } map_op;
> bool rom_only : 1;
Since you already have a bitfield here, I would consider using a
boolean also, ie:
bool rom_only : 1;
bool map : 1;
Or are we expecting to gain new operations aside from map and unmap?
Thanks, Roger.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-24 7:47 ` Jan Beulich
@ 2025-07-25 20:25 ` dmkhn
0 siblings, 0 replies; 15+ messages in thread
From: dmkhn @ 2025-07-25 20:25 UTC (permalink / raw)
To: Jan Beulich
Cc: Stewart Hildebrand, xen-devel, Andrew Cooper, Anthony PERARD,
Michal Orzel, Julien Grall, Roger Pau Monné,
Stefano Stabellini
On Thu, Jul 24, 2025 at 09:47:24AM +0200, Jan Beulich wrote:
> On 24.07.2025 05:42, dmkhn@proton.me wrote:
> > On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
> >> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> >>> general vPCI state: in particular, move the per-BAR rangeset out of
> >>> struct vpci and into the map task struct.
> >>>
> >>> This is preparatory work for further changes that need to perform
> >>> multiple unmap/map operations before returning to guest.
> >>>
> >>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> >>> ---
> >>> v1->v2:
> >>> * new patch
> >>>
> >>> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> >>> ---
> >>> xen/common/domain.c | 4 +
> >>> xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
> >>> xen/drivers/vpci/vpci.c | 3 -
> >>> xen/include/xen/vpci.h | 16 +++-
> >>> 4 files changed, 139 insertions(+), 81 deletions(-)
>
> Why did I (and many others) end up on the To: list of this reply? (Breaks my
Apologies for that, I might missed something when sending the email out.
> rules of sorting incoming mail into appropriate folders, for context.) Also,
> please trim reply context suitably. Without you doing so, every single reader
> will need to scroll through the entirety of a long mail just to find (in this
> case) a single line of reply (somewhere in the middle). Of course you
> shouldn't be too agressive with trimming, to retain proper context for your
> reply.
Ack.
>
> Jan
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-07-25 7:58 ` Roger Pau Monné
@ 2025-08-01 21:06 ` Stewart Hildebrand
2025-08-04 13:55 ` Roger Pau Monné
0 siblings, 1 reply; 15+ messages in thread
From: Stewart Hildebrand @ 2025-08-01 21:06 UTC (permalink / raw)
To: Roger Pau Monné
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Stefano Stabellini
On 7/25/25 03:58, Roger Pau Monné wrote:
> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>> return rc;
>>> }
>>>
>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
>>> + uint16_t cmd, bool rom_only)
>>> +{
>>> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
>>
>> xvzalloc() preferably.
>>
>> This however introduces run-time allocations as a result of guest
>> actions, which is not ideal IMO. It would be preferable to do those
>> allocations as part of the header initialization, and re-use them.
>
> I've been thinking over this, as I've realized that while commenting
> on it, I didn't provide any alternatives.
>
> The usage of rangesets to figure out the regions to map is already not
> optimal, as adding/removing from a rangeset can lead to memory
> allocations. It would be good if we could create rangesets with a
> pre-allocated number of ranges (iow: a pool of struct ranges), but
> that's for another patchset. I think Jan already commented on this
> aspect long time ago.
+1
> I'm considering whether to allocate the deferred mapping structures
> per-vCPU instead of per-device. That would for example mean moving
> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> struct instead. The point would be to not have the rangesets per
> device (because there can be a lot of devices, specially for the
> hardware domain), but instead have those per-vCPU. This should work
> because a vCPU can only queue a single vPCI operation, from a single
> device.
>
> It should then be possible to allocate the deferred mapping structures
> at vCPU creation. I also ponder if we really need a linked list to
> queue them; AFAIK there can only ever be an unmapping and a mapping
> operation pending (so 2 operations at most). Hence we could use a
> more "fixed" structure like an array. For example in struct vpci_vcpu
> you could introduce a struct vpci_map_task task[2] field?
>
> Sorry, I know this is not a minor change to request. It shouldn't
> change the overall logic much, but it would inevitably affect the
> code. Let me know what you think.
Thanks for the feedback and suggestion. Yeah, I'll give this a try.
Here's roughly what I'm thinking so far. I'll keep playing with it.
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5241a1629eeb..942c9fe7d364 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
*/
static int vcpu_teardown(struct vcpu *v)
{
+#ifdef CONFIG_HAS_VPCI
+ for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+ {
+ struct vpci_map_task *task = &v->vpci.task[i];
+
+ for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
+ rangeset_destroy(task->bars[j].mem);
+ }
+#endif
+
vmtrace_free_buffer(v);
return 0;
@@ -467,6 +477,26 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
d->vcpu[prev_id]->next_in_list = v;
}
+#ifdef CONFIG_HAS_VPCI
+ for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+ {
+ struct vpci_map_task *task = &v->vpci.task[i];
+
+ for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
+ {
+ struct vpci_bar_map *bar = &task->bars[j];
+ char str[32];
+
+ snprintf(str, sizeof(str), "PCI map vcpu %u task %u BAR %u", vcpu_id, i, j);
+
+ bar->mem = rangeset_new(v->domain, str, RANGESETF_no_print);
+
+ if ( !bar->mem )
+ goto fail_sched;
+ }
+ }
+#endif
+
/* Must be called after making new vcpu visible to for_each_vcpu(). */
vcpu_check_shutdown(v);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 17cfecb0aabf..afe78b00ffc9 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -116,7 +116,6 @@ struct vpci {
uint64_t guest_addr;
uint64_t size;
uint64_t resizable_sizes;
- struct rangeset *mem;
enum {
VPCI_BAR_EMPTY,
VPCI_BAR_IO,
@@ -207,14 +206,23 @@ struct vpci {
#endif
};
+#ifdef __XEN__
struct vpci_vcpu {
/* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
const struct pci_dev *pdev;
- uint16_t cmd;
- bool rom_only : 1;
+ struct domain *domain;
+ unsigned int nr_pending_ops;
+ struct vpci_map_task {
+ struct vpci_bar_map {
+ uint64_t addr;
+ uint64_t guest_addr;
+ struct rangeset *mem;
+ } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
+ uint16_t cmd;
+ bool rom_only : 1;
+ } task[2];
};
-#ifdef __XEN__
void vpci_dump_msi(void);
/* Make sure there's a hole in the p2m for the MSIX mmio areas. */
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-08-01 21:06 ` Stewart Hildebrand
@ 2025-08-04 13:55 ` Roger Pau Monné
2025-08-04 13:57 ` Jan Beulich
0 siblings, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2025-08-04 13:55 UTC (permalink / raw)
To: Stewart Hildebrand
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Jan Beulich, Julien Grall, Stefano Stabellini
On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
> On 7/25/25 03:58, Roger Pau Monné wrote:
> > On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> >> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >>> return rc;
> >>> }
> >>>
> >>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> >>> + uint16_t cmd, bool rom_only)
> >>> +{
> >>> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> >>
> >> xvzalloc() preferably.
> >>
> >> This however introduces run-time allocations as a result of guest
> >> actions, which is not ideal IMO. It would be preferable to do those
> >> allocations as part of the header initialization, and re-use them.
> >
> > I've been thinking over this, as I've realized that while commenting
> > on it, I didn't provide any alternatives.
> >
> > The usage of rangesets to figure out the regions to map is already not
> > optimal, as adding/removing from a rangeset can lead to memory
> > allocations. It would be good if we could create rangesets with a
> > pre-allocated number of ranges (iow: a pool of struct ranges), but
> > that's for another patchset. I think Jan already commented on this
> > aspect long time ago.
>
> +1
>
> > I'm considering whether to allocate the deferred mapping structures
> > per-vCPU instead of per-device. That would for example mean moving
> > the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> > struct instead. The point would be to not have the rangesets per
> > device (because there can be a lot of devices, specially for the
> > hardware domain), but instead have those per-vCPU. This should work
> > because a vCPU can only queue a single vPCI operation, from a single
> > device.
> >
> > It should then be possible to allocate the deferred mapping structures
> > at vCPU creation. I also ponder if we really need a linked list to
> > queue them; AFAIK there can only ever be an unmapping and a mapping
> > operation pending (so 2 operations at most). Hence we could use a
> > more "fixed" structure like an array. For example in struct vpci_vcpu
> > you could introduce a struct vpci_map_task task[2] field?
> >
> > Sorry, I know this is not a minor change to request. It shouldn't
> > change the overall logic much, but it would inevitably affect the
> > code. Let me know what you think.
>
> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
> Here's roughly what I'm thinking so far. I'll keep playing with it.
>
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 5241a1629eeb..942c9fe7d364 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
> */
> static int vcpu_teardown(struct vcpu *v)
> {
> +#ifdef CONFIG_HAS_VPCI
> + for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> + {
> + struct vpci_map_task *task = &v->vpci.task[i];
> +
> + for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> + rangeset_destroy(task->bars[j].mem);
You might want to additionally do:
task->bars[j].mem = NULL;
> + }
> +#endif
> +
> vmtrace_free_buffer(v);
>
> return 0;
> @@ -467,6 +477,26 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
> d->vcpu[prev_id]->next_in_list = v;
> }
>
> +#ifdef CONFIG_HAS_VPCI
> + for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> + {
> + struct vpci_map_task *task = &v->vpci.task[i];
> +
> + for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> + {
> + struct vpci_bar_map *bar = &task->bars[j];
> + char str[32];
> +
> + snprintf(str, sizeof(str), "PCI map vcpu %u task %u BAR %u", vcpu_id, i, j);
> +
> + bar->mem = rangeset_new(v->domain, str, RANGESETF_no_print);
Not sure there's much point in naming those with that much detail -
those are scratch space for mapping calculations. You already pass
RANGESETF_no_print, which means the contents of the rangeset won't be
dumped, and hence the name is kind of meaningless. I shouldn't have
named those either when allocated in bar_add_rangeset().
> +
> + if ( !bar->mem )
> + goto fail_sched;
> + }
> + }
> +#endif
> +
> /* Must be called after making new vcpu visible to for_each_vcpu(). */
> vcpu_check_shutdown(v);
>
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 17cfecb0aabf..afe78b00ffc9 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -116,7 +116,6 @@ struct vpci {
> uint64_t guest_addr;
> uint64_t size;
> uint64_t resizable_sizes;
> - struct rangeset *mem;
> enum {
> VPCI_BAR_EMPTY,
> VPCI_BAR_IO,
> @@ -207,14 +206,23 @@ struct vpci {
> #endif
> };
>
> +#ifdef __XEN__
> struct vpci_vcpu {
> /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> const struct pci_dev *pdev;
> - uint16_t cmd;
> - bool rom_only : 1;
> + struct domain *domain;
> + unsigned int nr_pending_ops;
Not sure you really need a pending ops counter? Hard to tell without
seeing the code that makes use of it.
> + struct vpci_map_task {
> + struct vpci_bar_map {
> + uint64_t addr;
> + uint64_t guest_addr;
> + struct rangeset *mem;
> + } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
> + uint16_t cmd;
> + bool rom_only : 1;
> + } task[2];
Don't you need a way to differentiate between map/unmap operations?
Do you plan to use slot 0 as unmap and slot 1 as map? Or would you
rather introduce a boolean field to signal it in struct vpci_map_task?
Overall seems OK, but obviously it all needs to fit together with the
current code :).
Thanks, Roger.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-08-04 13:55 ` Roger Pau Monné
@ 2025-08-04 13:57 ` Jan Beulich
2025-08-04 15:02 ` Roger Pau Monné
0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2025-08-04 13:57 UTC (permalink / raw)
To: Roger Pau Monné
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Julien Grall, Stefano Stabellini, Stewart Hildebrand
On 04.08.2025 15:55, Roger Pau Monné wrote:
> On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
>> On 7/25/25 03:58, Roger Pau Monné wrote:
>>> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
>>>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>>>> return rc;
>>>>> }
>>>>>
>>>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
>>>>> + uint16_t cmd, bool rom_only)
>>>>> +{
>>>>> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
>>>>
>>>> xvzalloc() preferably.
>>>>
>>>> This however introduces run-time allocations as a result of guest
>>>> actions, which is not ideal IMO. It would be preferable to do those
>>>> allocations as part of the header initialization, and re-use them.
>>>
>>> I've been thinking over this, as I've realized that while commenting
>>> on it, I didn't provide any alternatives.
>>>
>>> The usage of rangesets to figure out the regions to map is already not
>>> optimal, as adding/removing from a rangeset can lead to memory
>>> allocations. It would be good if we could create rangesets with a
>>> pre-allocated number of ranges (iow: a pool of struct ranges), but
>>> that's for another patchset. I think Jan already commented on this
>>> aspect long time ago.
>>
>> +1
>>
>>> I'm considering whether to allocate the deferred mapping structures
>>> per-vCPU instead of per-device. That would for example mean moving
>>> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
>>> struct instead. The point would be to not have the rangesets per
>>> device (because there can be a lot of devices, specially for the
>>> hardware domain), but instead have those per-vCPU. This should work
>>> because a vCPU can only queue a single vPCI operation, from a single
>>> device.
>>>
>>> It should then be possible to allocate the deferred mapping structures
>>> at vCPU creation. I also ponder if we really need a linked list to
>>> queue them; AFAIK there can only ever be an unmapping and a mapping
>>> operation pending (so 2 operations at most). Hence we could use a
>>> more "fixed" structure like an array. For example in struct vpci_vcpu
>>> you could introduce a struct vpci_map_task task[2] field?
>>>
>>> Sorry, I know this is not a minor change to request. It shouldn't
>>> change the overall logic much, but it would inevitably affect the
>>> code. Let me know what you think.
>>
>> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
>> Here's roughly what I'm thinking so far. I'll keep playing with it.
>>
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index 5241a1629eeb..942c9fe7d364 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
>> */
>> static int vcpu_teardown(struct vcpu *v)
>> {
>> +#ifdef CONFIG_HAS_VPCI
>> + for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
>> + {
>> + struct vpci_map_task *task = &v->vpci.task[i];
>> +
>> + for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
>> + rangeset_destroy(task->bars[j].mem);
>
> You might want to additionally do:
>
> task->bars[j].mem = NULL;
Should we perhaps introduce RANGESET_DESTROY() along the lines of XFREE() et al?
Jan
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
2025-08-04 13:57 ` Jan Beulich
@ 2025-08-04 15:02 ` Roger Pau Monné
0 siblings, 0 replies; 15+ messages in thread
From: Roger Pau Monné @ 2025-08-04 15:02 UTC (permalink / raw)
To: Jan Beulich
Cc: xen-devel, Andrew Cooper, Anthony PERARD, Michal Orzel,
Julien Grall, Stefano Stabellini, Stewart Hildebrand
On Mon, Aug 04, 2025 at 03:57:24PM +0200, Jan Beulich wrote:
> On 04.08.2025 15:55, Roger Pau Monné wrote:
> > On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
> >> On 7/25/25 03:58, Roger Pau Monné wrote:
> >>> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> >>>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >>>>> return rc;
> >>>>> }
> >>>>>
> >>>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> >>>>> + uint16_t cmd, bool rom_only)
> >>>>> +{
> >>>>> + struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> >>>>
> >>>> xvzalloc() preferably.
> >>>>
> >>>> This however introduces run-time allocations as a result of guest
> >>>> actions, which is not ideal IMO. It would be preferable to do those
> >>>> allocations as part of the header initialization, and re-use them.
> >>>
> >>> I've been thinking over this, as I've realized that while commenting
> >>> on it, I didn't provide any alternatives.
> >>>
> >>> The usage of rangesets to figure out the regions to map is already not
> >>> optimal, as adding/removing from a rangeset can lead to memory
> >>> allocations. It would be good if we could create rangesets with a
> >>> pre-allocated number of ranges (iow: a pool of struct ranges), but
> >>> that's for another patchset. I think Jan already commented on this
> >>> aspect long time ago.
> >>
> >> +1
> >>
> >>> I'm considering whether to allocate the deferred mapping structures
> >>> per-vCPU instead of per-device. That would for example mean moving
> >>> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> >>> struct instead. The point would be to not have the rangesets per
> >>> device (because there can be a lot of devices, specially for the
> >>> hardware domain), but instead have those per-vCPU. This should work
> >>> because a vCPU can only queue a single vPCI operation, from a single
> >>> device.
> >>>
> >>> It should then be possible to allocate the deferred mapping structures
> >>> at vCPU creation. I also ponder if we really need a linked list to
> >>> queue them; AFAIK there can only ever be an unmapping and a mapping
> >>> operation pending (so 2 operations at most). Hence we could use a
> >>> more "fixed" structure like an array. For example in struct vpci_vcpu
> >>> you could introduce a struct vpci_map_task task[2] field?
> >>>
> >>> Sorry, I know this is not a minor change to request. It shouldn't
> >>> change the overall logic much, but it would inevitably affect the
> >>> code. Let me know what you think.
> >>
> >> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
> >> Here's roughly what I'm thinking so far. I'll keep playing with it.
> >>
> >> diff --git a/xen/common/domain.c b/xen/common/domain.c
> >> index 5241a1629eeb..942c9fe7d364 100644
> >> --- a/xen/common/domain.c
> >> +++ b/xen/common/domain.c
> >> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
> >> */
> >> static int vcpu_teardown(struct vcpu *v)
> >> {
> >> +#ifdef CONFIG_HAS_VPCI
> >> + for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> >> + {
> >> + struct vpci_map_task *task = &v->vpci.task[i];
> >> +
> >> + for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> >> + rangeset_destroy(task->bars[j].mem);
> >
> > You might want to additionally do:
> >
> > task->bars[j].mem = NULL;
>
> Should we perhaps introduce RANGESET_DESTROY() along the lines of XFREE() et al?
Yes, I was wondering whether to recommend it here, but didn't want to
add noise, so was planning on adding this to my queue. But yes, if
you can/will please do it Stewart.
Thanks, Roger.
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2025-08-04 15:02 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-23 16:37 [PATCH v2 0/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
2025-07-23 16:37 ` [PATCH v2 1/3] vpci: allow queueing of mapping operations Stewart Hildebrand
2025-07-24 3:00 ` dmkhn
2025-07-24 3:42 ` dmkhn
2025-07-24 7:47 ` Jan Beulich
2025-07-25 20:25 ` dmkhn
2025-07-24 16:44 ` Roger Pau Monné
2025-07-25 7:58 ` Roger Pau Monné
2025-08-01 21:06 ` Stewart Hildebrand
2025-08-04 13:55 ` Roger Pau Monné
2025-08-04 13:57 ` Jan Beulich
2025-08-04 15:02 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 2/3] vpci: allow BAR map/unmap without affecting memory decoding bit Stewart Hildebrand
2025-07-25 8:13 ` Roger Pau Monné
2025-07-23 16:37 ` [PATCH v2 3/3] vpci: allow 32-bit BAR writes with memory decoding enabled Stewart Hildebrand
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.