* [PATCH 01/14] vfio: refactor out vfio_interrupt_setup()
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 12:20 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 02/14] vfio: refactor out vfio_pci_config_setup() John Levon
` (13 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Refactor the interrupt setup code out of vfio_realize() for readability.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/pci.c | 55 +++++++++++++++++++++++++++++++--------------------
1 file changed, 34 insertions(+), 21 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 05a7a62204..02f23efaba 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2963,6 +2963,38 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
vdev->req_enabled = false;
}
+static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+ PCIDevice *pdev = &vdev->pdev;
+
+ /* QEMU emulates all of MSI & MSIX */
+ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
+ memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
+ MSIX_CAP_LENGTH);
+ }
+
+ if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
+ memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
+ vdev->msi_cap_size);
+ }
+
+ if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+ vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+ vfio_intx_mmap_enable, vdev);
+ pci_device_set_intx_routing_notifier(&vdev->pdev,
+ vfio_intx_routing_notifier);
+ vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
+ kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
+ if (!vfio_intx_enable(vdev, errp)) {
+ timer_free(vdev->intx.mmap_timer);
+ pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+ return false;
+ }
+ }
+ return true;
+}
+
static void vfio_realize(PCIDevice *pdev, Error **errp)
{
ERRP_GUARD();
@@ -3142,27 +3174,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vfio_bar_quirk_setup(vdev, i);
}
- /* QEMU emulates all of MSI & MSIX */
- if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
- memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
- MSIX_CAP_LENGTH);
- }
-
- if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
- memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
- vdev->msi_cap_size);
- }
-
- if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
- vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
- vfio_intx_mmap_enable, vdev);
- pci_device_set_intx_routing_notifier(&vdev->pdev,
- vfio_intx_routing_notifier);
- vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
- kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
- if (!vfio_intx_enable(vdev, errp)) {
- goto out_deregister;
- }
+ if (!vfio_interrupt_setup(vdev, errp)) {
+ goto out_unset_idev;
}
if (vdev->display != ON_OFF_AUTO_OFF) {
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 01/14] vfio: refactor out vfio_interrupt_setup()
2025-04-09 13:48 ` [PATCH 01/14] vfio: refactor out vfio_interrupt_setup() John Levon
@ 2025-04-23 12:20 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 12:20 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Refactor the interrupt setup code out of vfio_realize() for readability.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Thanks,
C.
> ---
> hw/vfio/pci.c | 55 +++++++++++++++++++++++++++++++--------------------
> 1 file changed, 34 insertions(+), 21 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 05a7a62204..02f23efaba 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2963,6 +2963,38 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> vdev->req_enabled = false;
> }
>
> +static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
> +{
> + PCIDevice *pdev = &vdev->pdev;
> +
> + /* QEMU emulates all of MSI & MSIX */
> + if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
> + memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
> + MSIX_CAP_LENGTH);
> + }
> +
> + if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
> + memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
> + vdev->msi_cap_size);
> + }
> +
> + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> + vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> + vfio_intx_mmap_enable, vdev);
> + pci_device_set_intx_routing_notifier(&vdev->pdev,
> + vfio_intx_routing_notifier);
> + vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
> + kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
> + if (!vfio_intx_enable(vdev, errp)) {
> + timer_free(vdev->intx.mmap_timer);
> + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
> + return false;
> + }
> + }
> + return true;
> +}
> +
> static void vfio_realize(PCIDevice *pdev, Error **errp)
> {
> ERRP_GUARD();
> @@ -3142,27 +3174,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> vfio_bar_quirk_setup(vdev, i);
> }
>
> - /* QEMU emulates all of MSI & MSIX */
> - if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
> - memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
> - MSIX_CAP_LENGTH);
> - }
> -
> - if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
> - memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
> - vdev->msi_cap_size);
> - }
> -
> - if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> - vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
> - vfio_intx_mmap_enable, vdev);
> - pci_device_set_intx_routing_notifier(&vdev->pdev,
> - vfio_intx_routing_notifier);
> - vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
> - kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
> - if (!vfio_intx_enable(vdev, errp)) {
> - goto out_deregister;
> - }
> + if (!vfio_interrupt_setup(vdev, errp)) {
> + goto out_unset_idev;
> }
>
> if (vdev->display != ON_OFF_AUTO_OFF) {
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 02/14] vfio: refactor out vfio_pci_config_setup()
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
2025-04-09 13:48 ` [PATCH 01/14] vfio: refactor out vfio_interrupt_setup() John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-09 15:38 ` Tomita Moeko
2025-04-09 13:48 ` [PATCH 03/14] vfio: add vfio_prepare_device() John Levon
` (12 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Refactor the PCI config setup code out of vfio_realize() for
readability.
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/pci.c | 176 +++++++++++++++++++++++++++-----------------------
1 file changed, 94 insertions(+), 82 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 02f23efaba..81bf0dab28 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2963,6 +2963,99 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
vdev->req_enabled = false;
}
+static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+ PCIDevice *pdev = &vdev->pdev;
+ VFIODevice *vbasedev = &vdev->vbasedev;
+
+ /* vfio emulates a lot for us, but some bits need extra love */
+ vdev->emulated_config_bits = g_malloc0(vdev->config_size);
+
+ /* QEMU can choose to expose the ROM or not */
+ memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
+ /* QEMU can also add or extend BARs */
+ memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
+
+ /*
+ * The PCI spec reserves vendor ID 0xffff as an invalid value. The
+ * device ID is managed by the vendor and need only be a 16-bit value.
+ * Allow any 16-bit value for subsystem so they can be hidden or changed.
+ */
+ if (vdev->vendor_id != PCI_ANY_ID) {
+ if (vdev->vendor_id >= 0xffff) {
+ error_setg(errp, "invalid PCI vendor ID provided");
+ return false;
+ }
+ vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
+ trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
+ } else {
+ vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
+ }
+
+ if (vdev->device_id != PCI_ANY_ID) {
+ if (vdev->device_id > 0xffff) {
+ error_setg(errp, "invalid PCI device ID provided");
+ return false;
+ }
+ vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
+ trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
+ } else {
+ vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
+ }
+
+ if (vdev->sub_vendor_id != PCI_ANY_ID) {
+ if (vdev->sub_vendor_id > 0xffff) {
+ error_setg(errp, "invalid PCI subsystem vendor ID provided");
+ return false;
+ }
+ vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
+ vdev->sub_vendor_id, ~0);
+ trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
+ vdev->sub_vendor_id);
+ }
+
+ if (vdev->sub_device_id != PCI_ANY_ID) {
+ if (vdev->sub_device_id > 0xffff) {
+ error_setg(errp, "invalid PCI subsystem device ID provided");
+ return false;
+ }
+ vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
+ trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
+ vdev->sub_device_id);
+ }
+
+ /* QEMU can change multi-function devices to single function, or reverse */
+ vdev->emulated_config_bits[PCI_HEADER_TYPE] =
+ PCI_HEADER_TYPE_MULTI_FUNCTION;
+
+ /* Restore or clear multifunction, this is always controlled by QEMU */
+ if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+ vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
+ } else {
+ vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
+ }
+
+ /*
+ * Clear host resource mapping info. If we choose not to register a
+ * BAR, such as might be the case with the option ROM, we can get
+ * confusing, unwritable, residual addresses from the host here.
+ */
+ memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+ memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+ vfio_pci_size_rom(vdev);
+
+ vfio_bars_prepare(vdev);
+
+ if (!vfio_msix_early_setup(vdev, errp)) {
+ return false;
+ }
+
+ vfio_bars_register(vdev);
+
+ return true;
+}
+
static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
{
PCIDevice *pdev = &vdev->pdev;
@@ -3067,91 +3160,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
- /* vfio emulates a lot for us, but some bits need extra love */
- vdev->emulated_config_bits = g_malloc0(vdev->config_size);
-
- /* QEMU can choose to expose the ROM or not */
- memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
- /* QEMU can also add or extend BARs */
- memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
-
- /*
- * The PCI spec reserves vendor ID 0xffff as an invalid value. The
- * device ID is managed by the vendor and need only be a 16-bit value.
- * Allow any 16-bit value for subsystem so they can be hidden or changed.
- */
- if (vdev->vendor_id != PCI_ANY_ID) {
- if (vdev->vendor_id >= 0xffff) {
- error_setg(errp, "invalid PCI vendor ID provided");
- goto error;
- }
- vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
- trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
- } else {
- vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
- }
-
- if (vdev->device_id != PCI_ANY_ID) {
- if (vdev->device_id > 0xffff) {
- error_setg(errp, "invalid PCI device ID provided");
- goto error;
- }
- vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
- trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
- } else {
- vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
- }
-
- if (vdev->sub_vendor_id != PCI_ANY_ID) {
- if (vdev->sub_vendor_id > 0xffff) {
- error_setg(errp, "invalid PCI subsystem vendor ID provided");
- goto error;
- }
- vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
- vdev->sub_vendor_id, ~0);
- trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
- vdev->sub_vendor_id);
- }
-
- if (vdev->sub_device_id != PCI_ANY_ID) {
- if (vdev->sub_device_id > 0xffff) {
- error_setg(errp, "invalid PCI subsystem device ID provided");
- goto error;
- }
- vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
- trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
- vdev->sub_device_id);
- }
-
- /* QEMU can change multi-function devices to single function, or reverse */
- vdev->emulated_config_bits[PCI_HEADER_TYPE] =
- PCI_HEADER_TYPE_MULTI_FUNCTION;
-
- /* Restore or clear multifunction, this is always controlled by QEMU */
- if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
- vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
- } else {
- vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
- }
-
- /*
- * Clear host resource mapping info. If we choose not to register a
- * BAR, such as might be the case with the option ROM, we can get
- * confusing, unwritable, residual addresses from the host here.
- */
- memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
- memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
-
- vfio_pci_size_rom(vdev);
-
- vfio_bars_prepare(vdev);
-
- if (!vfio_msix_early_setup(vdev, errp)) {
+ if (!vfio_pci_config_setup(vdev, errp)) {
goto error;
}
- vfio_bars_register(vdev);
-
if (!vbasedev->mdev &&
!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
error_prepend(errp, "Failed to set vIOMMU: ");
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 02/14] vfio: refactor out vfio_pci_config_setup()
2025-04-09 13:48 ` [PATCH 02/14] vfio: refactor out vfio_pci_config_setup() John Levon
@ 2025-04-09 15:38 ` Tomita Moeko
2025-04-09 15:41 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Tomita Moeko @ 2025-04-09 15:38 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
Hi, John
On 4/9/25 21:48, John Levon wrote:
> Refactor the PCI config setup code out of vfio_realize() for
> readability.
>
> Reviewed-by: Cédric Le Goater <clg@redhat.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/pci.c | 176 +++++++++++++++++++++++++++-----------------------
> 1 file changed, 94 insertions(+), 82 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 02f23efaba..81bf0dab28 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2963,6 +2963,99 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> vdev->req_enabled = false;
> }
>
> +static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
> +{
> + PCIDevice *pdev = &vdev->pdev;
> + VFIODevice *vbasedev = &vdev->vbasedev;
> +
> + /* vfio emulates a lot for us, but some bits need extra love */
> + vdev->emulated_config_bits = g_malloc0(vdev->config_size);
> +
> + /* QEMU can choose to expose the ROM or not */
> + memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
> + /* QEMU can also add or extend BARs */
> + memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
> +
> + /*
> + * The PCI spec reserves vendor ID 0xffff as an invalid value. The
> + * device ID is managed by the vendor and need only be a 16-bit value.
> + * Allow any 16-bit value for subsystem so they can be hidden or changed.
> + */
> + if (vdev->vendor_id != PCI_ANY_ID) {
> + if (vdev->vendor_id >= 0xffff) {
> + error_setg(errp, "invalid PCI vendor ID provided");
> + return false;
> + }
> + vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
> + trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
> + } else {
> + vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
> + }
> +
> + if (vdev->device_id != PCI_ANY_ID) {
> + if (vdev->device_id > 0xffff) {
> + error_setg(errp, "invalid PCI device ID provided");
> + return false;
> + }
> + vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
> + trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
> + } else {
> + vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
> + }
> +
> + if (vdev->sub_vendor_id != PCI_ANY_ID) {
> + if (vdev->sub_vendor_id > 0xffff) {
> + error_setg(errp, "invalid PCI subsystem vendor ID provided");
> + return false;
> + }
> + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
> + vdev->sub_vendor_id, ~0);
> + trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
> + vdev->sub_vendor_id);
> + }
> +
> + if (vdev->sub_device_id != PCI_ANY_ID) {
> + if (vdev->sub_device_id > 0xffff) {
> + error_setg(errp, "invalid PCI subsystem device ID provided");
> + return false;
> + }
> + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
> + trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
> + vdev->sub_device_id);
> + }
> +
> + /* QEMU can change multi-function devices to single function, or reverse */
> + vdev->emulated_config_bits[PCI_HEADER_TYPE] =
> + PCI_HEADER_TYPE_MULTI_FUNCTION;
> +
> + /* Restore or clear multifunction, this is always controlled by QEMU */
> + if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> + vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
> + } else {
> + vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
> + }
> +
> + /*
> + * Clear host resource mapping info. If we choose not to register a
> + * BAR, such as might be the case with the option ROM, we can get
> + * confusing, unwritable, residual addresses from the host here.
> + */
> + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> +
> + vfio_pci_size_rom(vdev);
> +
> + vfio_bars_prepare(vdev);
> +
> + if (!vfio_msix_early_setup(vdev, errp)) {
> + return false;
> + }
> +
> + vfio_bars_register(vdev);
Probably we can also put `vfio_config_quirk_setup` here as it deals with
device-specific config space.
I would personally prefer keeping `vfio_bars_register` in `vfio_realize`
so that it matches `vfio_bars_exit` at the end, just a minor nit.
Thanks,
Moeko
> +
> + return true;
> +}
> +
> static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
> {
> PCIDevice *pdev = &vdev->pdev;
> @@ -3067,91 +3160,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> goto error;
> }
>
> - /* vfio emulates a lot for us, but some bits need extra love */
> - vdev->emulated_config_bits = g_malloc0(vdev->config_size);
> -
> - /* QEMU can choose to expose the ROM or not */
> - memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
> - /* QEMU can also add or extend BARs */
> - memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
> -
> - /*
> - * The PCI spec reserves vendor ID 0xffff as an invalid value. The
> - * device ID is managed by the vendor and need only be a 16-bit value.
> - * Allow any 16-bit value for subsystem so they can be hidden or changed.
> - */
> - if (vdev->vendor_id != PCI_ANY_ID) {
> - if (vdev->vendor_id >= 0xffff) {
> - error_setg(errp, "invalid PCI vendor ID provided");
> - goto error;
> - }
> - vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
> - trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
> - } else {
> - vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
> - }
> -
> - if (vdev->device_id != PCI_ANY_ID) {
> - if (vdev->device_id > 0xffff) {
> - error_setg(errp, "invalid PCI device ID provided");
> - goto error;
> - }
> - vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
> - trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
> - } else {
> - vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
> - }
> -
> - if (vdev->sub_vendor_id != PCI_ANY_ID) {
> - if (vdev->sub_vendor_id > 0xffff) {
> - error_setg(errp, "invalid PCI subsystem vendor ID provided");
> - goto error;
> - }
> - vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
> - vdev->sub_vendor_id, ~0);
> - trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
> - vdev->sub_vendor_id);
> - }
> -
> - if (vdev->sub_device_id != PCI_ANY_ID) {
> - if (vdev->sub_device_id > 0xffff) {
> - error_setg(errp, "invalid PCI subsystem device ID provided");
> - goto error;
> - }
> - vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
> - trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
> - vdev->sub_device_id);
> - }
> -
> - /* QEMU can change multi-function devices to single function, or reverse */
> - vdev->emulated_config_bits[PCI_HEADER_TYPE] =
> - PCI_HEADER_TYPE_MULTI_FUNCTION;
> -
> - /* Restore or clear multifunction, this is always controlled by QEMU */
> - if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> - vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
> - } else {
> - vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
> - }
> -
> - /*
> - * Clear host resource mapping info. If we choose not to register a
> - * BAR, such as might be the case with the option ROM, we can get
> - * confusing, unwritable, residual addresses from the host here.
> - */
> - memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> - memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> -
> - vfio_pci_size_rom(vdev);
> -
> - vfio_bars_prepare(vdev);
> -
> - if (!vfio_msix_early_setup(vdev, errp)) {
> + if (!vfio_pci_config_setup(vdev, errp)) {
> goto error;
> }
>
> - vfio_bars_register(vdev);
> -
> if (!vbasedev->mdev &&
> !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
> error_prepend(errp, "Failed to set vIOMMU: ");
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/14] vfio: refactor out vfio_pci_config_setup()
2025-04-09 15:38 ` Tomita Moeko
@ 2025-04-09 15:41 ` John Levon
0 siblings, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-09 15:41 UTC (permalink / raw)
To: Tomita Moeko
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella,
Cédric Le Goater, Peter Xu, Thomas Huth, Matthew Rosato,
David Hildenbrand, Michael S. Tsirkin, Alex Williamson,
qemu-s390x, Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Wed, Apr 09, 2025 at 11:38:01PM +0800, Tomita Moeko wrote:
> On 4/9/25 21:48, John Levon wrote:
> > Refactor the PCI config setup code out of vfio_realize() for
> > readability.
> > + if (!vfio_msix_early_setup(vdev, errp)) {
> > + return false;
> > + }
> > +
> > + vfio_bars_register(vdev);
>
> Probably we can also put `vfio_config_quirk_setup` here as it deals with
> device-specific config space.
This should be harmless (I think), but...
> I would personally prefer keeping `vfio_bars_register` in `vfio_realize`
> so that it matches `vfio_bars_exit` at the end, just a minor nit.
... this means that vfio-user (when it exists) can't re-use the function, which
is the underlying reason for this refactoring originally.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 03/14] vfio: add vfio_prepare_device()
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
2025-04-09 13:48 ` [PATCH 01/14] vfio: refactor out vfio_interrupt_setup() John Levon
2025-04-09 13:48 ` [PATCH 02/14] vfio: refactor out vfio_pci_config_setup() John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 12:45 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 04/14] vfio: add vfio_attach_device_by_iommu_type() John Levon
` (11 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Commonize some initialization code shared by the legacy and iommufd vfio
implementations.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/container.c | 14 ++------------
hw/vfio/device.c | 14 ++++++++++++++
hw/vfio/iommufd.c | 9 +--------
include/hw/vfio/vfio-device.h | 3 +++
4 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 23a3373470..4fc181d33b 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -811,18 +811,14 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
}
}
+ vfio_device_prepare(vbasedev, &group->container->bcontainer, info);
+
vbasedev->fd = fd;
vbasedev->group = group;
QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
- vbasedev->num_irqs = info->num_irqs;
- vbasedev->num_regions = info->num_regions;
- vbasedev->flags = info->flags;
-
trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);
- vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
-
return true;
}
@@ -875,7 +871,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
int groupid = vfio_device_get_groupid(vbasedev, errp);
VFIODevice *vbasedev_iter;
VFIOGroup *group;
- VFIOContainerBase *bcontainer;
if (groupid < 0) {
return false;
@@ -904,11 +899,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
return false;
}
- bcontainer = &group->container->bcontainer;
- vbasedev->bcontainer = bcontainer;
- QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
- QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
-
return true;
}
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 4de6948cf4..4d940ddb3a 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -403,3 +403,17 @@ void vfio_device_detach(VFIODevice *vbasedev)
object_unref(vbasedev->hiod);
VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
}
+
+void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
+ struct vfio_device_info *info)
+{
+ vbasedev->num_irqs = info->num_irqs;
+ vbasedev->num_regions = info->num_regions;
+ vbasedev->flags = info->flags;
+ vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
+
+ vbasedev->bcontainer = bcontainer;
+ QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
+
+ QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 48db105422..1874185fcf 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -587,14 +587,7 @@ found_container:
iommufd_cdev_ram_block_discard_disable(false);
}
- vbasedev->group = 0;
- vbasedev->num_irqs = dev_info.num_irqs;
- vbasedev->num_regions = dev_info.num_regions;
- vbasedev->flags = dev_info.flags;
- vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
- vbasedev->bcontainer = bcontainer;
- QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
- QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+ vfio_device_prepare(vbasedev, bcontainer, &dev_info);
trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 66797b4c92..1a2fe378d0 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -129,6 +129,9 @@ bool vfio_device_attach(char *name, VFIODevice *vbasedev,
void vfio_device_detach(VFIODevice *vbasedev);
VFIODevice *vfio_get_vfio_device(Object *obj);
+void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
+ struct vfio_device_info *info);
+
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIODeviceList vfio_device_list;
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 03/14] vfio: add vfio_prepare_device()
2025-04-09 13:48 ` [PATCH 03/14] vfio: add vfio_prepare_device() John Levon
@ 2025-04-23 12:45 ` Cédric Le Goater
2025-04-23 13:19 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 12:45 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
Subject needs fix.
On 4/9/25 15:48, John Levon wrote:
> Commonize some initialization code shared by the legacy and iommufd vfio
> implementations.
>
Maybe vfio_device_set_info() would be a better name? Anyhow,
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Thanks,
C.
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/container.c | 14 ++------------
> hw/vfio/device.c | 14 ++++++++++++++
> hw/vfio/iommufd.c | 9 +--------
> include/hw/vfio/vfio-device.h | 3 +++
> 4 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 23a3373470..4fc181d33b 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -811,18 +811,14 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
> }
> }
>
> + vfio_device_prepare(vbasedev, &group->container->bcontainer, info);
> +
> vbasedev->fd = fd;
> vbasedev->group = group;
> QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
>
> - vbasedev->num_irqs = info->num_irqs;
> - vbasedev->num_regions = info->num_regions;
> - vbasedev->flags = info->flags;
> -
> trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);
>
> - vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
> -
> return true;
> }
>
> @@ -875,7 +871,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
> int groupid = vfio_device_get_groupid(vbasedev, errp);
> VFIODevice *vbasedev_iter;
> VFIOGroup *group;
> - VFIOContainerBase *bcontainer;
>
> if (groupid < 0) {
> return false;
> @@ -904,11 +899,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
> return false;
> }
>
> - bcontainer = &group->container->bcontainer;
> - vbasedev->bcontainer = bcontainer;
> - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
> - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
> -
> return true;
> }
>
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 4de6948cf4..4d940ddb3a 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -403,3 +403,17 @@ void vfio_device_detach(VFIODevice *vbasedev)
> object_unref(vbasedev->hiod);
> VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
> }
> +
> +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> + struct vfio_device_info *info)
> +{
> + vbasedev->num_irqs = info->num_irqs;
> + vbasedev->num_regions = info->num_regions;
> + vbasedev->flags = info->flags;
> + vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
> +
> + vbasedev->bcontainer = bcontainer;
> + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
> +
> + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
> +}
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 48db105422..1874185fcf 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -587,14 +587,7 @@ found_container:
> iommufd_cdev_ram_block_discard_disable(false);
> }
>
> - vbasedev->group = 0;
> - vbasedev->num_irqs = dev_info.num_irqs;
> - vbasedev->num_regions = dev_info.num_regions;
> - vbasedev->flags = dev_info.flags;
> - vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
> - vbasedev->bcontainer = bcontainer;
> - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
> - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
> + vfio_device_prepare(vbasedev, bcontainer, &dev_info);
>
> trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
> vbasedev->num_regions, vbasedev->flags);
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 66797b4c92..1a2fe378d0 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -129,6 +129,9 @@ bool vfio_device_attach(char *name, VFIODevice *vbasedev,
> void vfio_device_detach(VFIODevice *vbasedev);
> VFIODevice *vfio_get_vfio_device(Object *obj);
>
> +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> + struct vfio_device_info *info);
> +
> typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
> extern VFIODeviceList vfio_device_list;
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 03/14] vfio: add vfio_prepare_device()
2025-04-23 12:45 ` Cédric Le Goater
@ 2025-04-23 13:19 ` John Levon
0 siblings, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-23 13:19 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Wed, Apr 23, 2025 at 02:45:41PM +0200, Cédric Le Goater wrote:
> !-------------------------------------------------------------------|
> CAUTION: External Email
>
> |-------------------------------------------------------------------!
>
> Subject needs fix.
>
> On 4/9/25 15:48, John Levon wrote:
> > Commonize some initialization code shared by the legacy and iommufd vfio
> > implementations.
> >
>
> Maybe vfio_device_set_info() would be a better name? Anyhow,
As we're adding to lists too I think set_info() might be confusing?
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 04/14] vfio: add vfio_attach_device_by_iommu_type()
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (2 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 03/14] vfio: add vfio_prepare_device() John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 13:25 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks John Levon
` (10 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Allow attachment by explicitly passing a TYPE_VFIO_IOMMU_* string;
vfio-user will use this later.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/device.c | 24 ++++++++++++++++--------
include/hw/vfio/vfio-device.h | 3 +++
2 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 4d940ddb3a..f74b9c25ea 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -367,20 +367,17 @@ VFIODevice *vfio_get_vfio_device(Object *obj)
}
}
-bool vfio_device_attach(char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp)
+bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
+ VFIODevice *vbasedev, AddressSpace *as,
+ Error **errp)
{
- const VFIOIOMMUClass *ops =
- VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
HostIOMMUDevice *hiod = NULL;
- if (vbasedev->iommufd) {
- ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
- }
+ const VFIOIOMMUClass *ops =
+ VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
assert(ops);
-
if (!vbasedev->mdev) {
hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
vbasedev->hiod = hiod;
@@ -395,6 +392,17 @@ bool vfio_device_attach(char *name, VFIODevice *vbasedev,
return true;
}
+bool vfio_device_attach(char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp)
+{
+ const char *iommu_type = vbasedev->iommufd ?
+ TYPE_VFIO_IOMMU_IOMMUFD :
+ TYPE_VFIO_IOMMU_LEGACY;
+
+ return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev,
+ as, errp);
+}
+
void vfio_device_detach(VFIODevice *vbasedev)
{
if (!vbasedev->bcontainer) {
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 1a2fe378d0..3563a82ede 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -126,6 +126,9 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev);
bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp);
bool vfio_device_attach(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
+bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
+ VFIODevice *vbasedev, AddressSpace *as,
+ Error **errp);
void vfio_device_detach(VFIODevice *vbasedev);
VFIODevice *vfio_get_vfio_device(Object *obj);
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 04/14] vfio: add vfio_attach_device_by_iommu_type()
2025-04-09 13:48 ` [PATCH 04/14] vfio: add vfio_attach_device_by_iommu_type() John Levon
@ 2025-04-23 13:25 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 13:25 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
Same typo in Subject
On 4/9/25 15:48, John Levon wrote:
> Allow attachment by explicitly passing a TYPE_VFIO_IOMMU_* string;
> vfio-user will use this later.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/device.c | 24 ++++++++++++++++--------
> include/hw/vfio/vfio-device.h | 3 +++
> 2 files changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 4d940ddb3a..f74b9c25ea 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -367,20 +367,17 @@ VFIODevice *vfio_get_vfio_device(Object *obj)
> }
> }
>
> -bool vfio_device_attach(char *name, VFIODevice *vbasedev,
> - AddressSpace *as, Error **errp)
> +bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
> + VFIODevice *vbasedev, AddressSpace *as,
> + Error **errp)
> {
> - const VFIOIOMMUClass *ops =
> - VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
> HostIOMMUDevice *hiod = NULL;
>
> - if (vbasedev->iommufd) {
> - ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
> - }
> + const VFIOIOMMUClass *ops =
> + VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
There are minor conflicts with Zhenzhong's series :
https://lore.kernel.org/qemu-devel/20250423072824.3647952-1-zhenzhong.duan@intel.com/
which can be fixed easily.
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Thanks,
C.
> assert(ops);
>
> -
> if (!vbasedev->mdev) {
> hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
> vbasedev->hiod = hiod;
> @@ -395,6 +392,17 @@ bool vfio_device_attach(char *name, VFIODevice *vbasedev,
> return true;
> }
>
> +bool vfio_device_attach(char *name, VFIODevice *vbasedev,
> + AddressSpace *as, Error **errp)
> +{
> + const char *iommu_type = vbasedev->iommufd ?
> + TYPE_VFIO_IOMMU_IOMMUFD :
> + TYPE_VFIO_IOMMU_LEGACY;
> +
> + return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev,
> + as, errp);
> +}
> +
> void vfio_device_detach(VFIODevice *vbasedev)
> {
> if (!vbasedev->bcontainer) {
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 1a2fe378d0..3563a82ede 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -126,6 +126,9 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev);
> bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp);
> bool vfio_device_attach(char *name, VFIODevice *vbasedev,
> AddressSpace *as, Error **errp);
> +bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
> + VFIODevice *vbasedev, AddressSpace *as,
> + Error **errp);
> void vfio_device_detach(VFIODevice *vbasedev);
> VFIODevice *vfio_get_vfio_device(Object *obj);
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (3 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 04/14] vfio: add vfio_attach_device_by_iommu_type() John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 13:45 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 06/14] vfio: add flags parameter to DMA unmap callback John Levon
` (9 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
The vfio-user container will later need to hook into these callbacks;
set up vfio to use them, and optionally pass them through to the
container.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/listener.c | 28 +++++++++++++++++++++++++++
include/hw/vfio/vfio-container-base.h | 2 ++
2 files changed, 30 insertions(+)
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 285ca97a8c..9ffc2deb2d 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -412,6 +412,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
return true;
}
+static void vfio_listener_begin(MemoryListener *listener)
+{
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+ void (*listener_begin)(VFIOContainerBase *bcontainer);
+
+ listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
+
+ if (listener_begin) {
+ listener_begin(bcontainer);
+ }
+}
+
+static void vfio_listener_commit(MemoryListener *listener)
+{
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+ void (*listener_commit)(VFIOContainerBase *bcontainer);
+
+ listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
+
+ if (listener_commit) {
+ listener_commit(bcontainer);
+ }
+}
+
static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
{
/*
@@ -1166,6 +1192,8 @@ static void vfio_listener_log_sync(MemoryListener *listener,
static const MemoryListener vfio_memory_listener = {
.name = "vfio",
+ .begin = vfio_listener_begin,
+ .commit = vfio_listener_commit,
.region_add = vfio_listener_region_add,
.region_del = vfio_listener_region_del,
.log_global_start = vfio_listener_log_global_start,
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index a441932be7..67373e8db0 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -120,6 +120,8 @@ struct VFIOIOMMUClass {
/* basic feature */
bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
+ void (*listener_begin)(VFIOContainerBase *bcontainer);
+ void (*listener_commit)(VFIOContainerBase *bcontainer);
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks
2025-04-09 13:48 ` [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks John Levon
@ 2025-04-23 13:45 ` Cédric Le Goater
2025-04-24 16:24 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 13:45 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> The vfio-user container will later need to hook into these callbacks;
> set up vfio to use them, and optionally pass them through to the
> container.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/listener.c | 28 +++++++++++++++++++++++++++
> include/hw/vfio/vfio-container-base.h | 2 ++
> 2 files changed, 30 insertions(+)
>
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index 285ca97a8c..9ffc2deb2d 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -412,6 +412,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
> return true;
> }
>
> +static void vfio_listener_begin(MemoryListener *listener)
> +{
> + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
> + listener);
> + void (*listener_begin)(VFIOContainerBase *bcontainer);
> +
> + listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
> +
> + if (listener_begin) {
> + listener_begin(bcontainer);
> + }
> +}
> +
> +static void vfio_listener_commit(MemoryListener *listener)
> +{
> + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
> + listener);
> + void (*listener_commit)(VFIOContainerBase *bcontainer);
> +
> + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
> +
> + if (listener_commit) {
> + listener_commit(bcontainer);
> + }
> +}
> +
> static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
> {
> /*
> @@ -1166,6 +1192,8 @@ static void vfio_listener_log_sync(MemoryListener *listener,
>
> static const MemoryListener vfio_memory_listener = {
> .name = "vfio",
> + .begin = vfio_listener_begin,
> + .commit = vfio_listener_commit,
> .region_add = vfio_listener_region_add,
> .region_del = vfio_listener_region_del,
> .log_global_start = vfio_listener_log_global_start,
> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
> index a441932be7..67373e8db0 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -120,6 +120,8 @@ struct VFIOIOMMUClass {
>
> /* basic feature */
> bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
> + void (*listener_begin)(VFIOContainerBase *bcontainer);
> + void (*listener_commit)(VFIOContainerBase *bcontainer);
Please add documentation for the new callbacks.
Thanks,
C.
> int (*dma_map)(const VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> void *vaddr, bool readonly);
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks
2025-04-23 13:45 ` Cédric Le Goater
@ 2025-04-24 16:24 ` Cédric Le Goater
2025-04-24 16:28 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:24 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/23/25 15:45, Cédric Le Goater wrote:
> On 4/9/25 15:48, John Levon wrote:
>> The vfio-user container will later need to hook into these callbacks;
>> set up vfio to use them, and optionally pass them through to the
>> container.
>>
>> Signed-off-by: John Levon <john.levon@nutanix.com>
>> ---
>> hw/vfio/listener.c | 28 +++++++++++++++++++++++++++
>> include/hw/vfio/vfio-container-base.h | 2 ++
>> 2 files changed, 30 insertions(+)
>>
>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>> index 285ca97a8c..9ffc2deb2d 100644
>> --- a/hw/vfio/listener.c
>> +++ b/hw/vfio/listener.c
>> @@ -412,6 +412,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
>> return true;
>> }
>> +static void vfio_listener_begin(MemoryListener *listener)
>> +{
>> + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
>> + listener);
>> + void (*listener_begin)(VFIOContainerBase *bcontainer);
>> +
>> + listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
>> +
>> + if (listener_begin) {
>> + listener_begin(bcontainer);
>> + }
>> +}
>> +
>> +static void vfio_listener_commit(MemoryListener *listener)
>> +{
>> + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
>> + listener);
>> + void (*listener_commit)(VFIOContainerBase *bcontainer);
>> +
>> + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
>> +
>> + if (listener_commit) {
>> + listener_commit(bcontainer);
>> + }
>> +}
>> +
>> static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
>> {
>> /*
>> @@ -1166,6 +1192,8 @@ static void vfio_listener_log_sync(MemoryListener *listener,
>> static const MemoryListener vfio_memory_listener = {
>> .name = "vfio",
>> + .begin = vfio_listener_begin,
>> + .commit = vfio_listener_commit,
>> .region_add = vfio_listener_region_add,
>> .region_del = vfio_listener_region_del,
>> .log_global_start = vfio_listener_log_global_start,
>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
>> index a441932be7..67373e8db0 100644
>> --- a/include/hw/vfio/vfio-container-base.h
>> +++ b/include/hw/vfio/vfio-container-base.h
>> @@ -120,6 +120,8 @@ struct VFIOIOMMUClass {
>> /* basic feature */
>> bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
>> + void (*listener_begin)(VFIOContainerBase *bcontainer);
>> + void (*listener_commit)(VFIOContainerBase *bcontainer);
>
> Please add documentation for the new callbacks.
and it is not used in this series yet. So we can keep it for later.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks
2025-04-24 16:24 ` Cédric Le Goater
@ 2025-04-24 16:28 ` John Levon
2025-04-24 16:35 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-24 16:28 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Thu, Apr 24, 2025 at 06:24:23PM +0200, Cédric Le Goater wrote:
> > On 4/9/25 15:48, John Levon wrote:
> > > The vfio-user container will later need to hook into these callbacks;
> > > set up vfio to use them, and optionally pass them through to the
> > > container.
> > > + void (*listener_begin)(VFIOContainerBase *bcontainer);
> > > + void (*listener_commit)(VFIOContainerBase *bcontainer);
> >
> > Please add documentation for the new callbacks.
> and it is not used in this series yet. So we can keep it for later.
Will do, I thought you wanted the general vfio stuff separated out, but can move
to the vfio-user specific queue.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks
2025-04-24 16:28 ` John Levon
@ 2025-04-24 16:35 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:35 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/24/25 18:28, John Levon wrote:
> On Thu, Apr 24, 2025 at 06:24:23PM +0200, Cédric Le Goater wrote:
>
>>> On 4/9/25 15:48, John Levon wrote:
>>>> The vfio-user container will later need to hook into these callbacks;
>>>> set up vfio to use them, and optionally pass them through to the
>>>> container.
>>>> + void (*listener_begin)(VFIOContainerBase *bcontainer);
>>>> + void (*listener_commit)(VFIOContainerBase *bcontainer);
>>>
>>> Please add documentation for the new callbacks.
>> and it is not used in this series yet. So we can keep it for later.
>
> Will do, I thought you wanted the general vfio stuff separated out,
yes. It is best to start with the invasive part.
> but can move to the vfio-user specific queue.
This is a simple addition which shouldn't conflict with the rest of
the changes. So it can come when needed.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 06/14] vfio: add flags parameter to DMA unmap callback
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (4 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 05/14] vfio/container: pass listener_begin/commit callbacks John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-09 13:48 ` [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback John Levon
` (8 subsequent siblings)
14 siblings, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
We'll use this parameter shortly; this just adds the plumbing.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/container-base.c | 4 ++--
hw/vfio/container.c | 8 ++++++--
hw/vfio/iommufd.c | 6 +++++-
hw/vfio/listener.c | 8 ++++----
include/hw/vfio/vfio-container-base.h | 4 ++--
5 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 726aac9827..613fe1a00d 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -85,12 +85,12 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb)
+ IOMMUTLBEntry *iotlb, int flags)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
g_assert(vioc->dma_unmap);
- return vioc->dma_unmap(bcontainer, iova, size, iotlb);
+ return vioc->dma_unmap(bcontainer, iova, size, iotlb, flags);
}
bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 4fc181d33b..625bbe82a7 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -124,7 +124,7 @@ unmap_exit:
*/
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb)
+ IOMMUTLBEntry *iotlb, int flags)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
@@ -138,6 +138,10 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
int ret;
Error *local_err = NULL;
+ if (flags != 0) {
+ return -ENOTSUP;
+ }
+
if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
bcontainer->dirty_pages_supported) {
@@ -205,7 +209,7 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
*/
if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
(errno == EBUSY &&
- vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
+ vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, 0) == 0 &&
ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
return 0;
}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 1874185fcf..07334e65b5 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -46,11 +46,15 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb)
+ IOMMUTLBEntry *iotlb, int flags)
{
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
+ if (flags != 0) {
+ return -ENOTSUP;
+ }
+
/* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
return iommufd_backend_unmap_dma(container->be,
container->ioas_id, iova, size);
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 9ffc2deb2d..c52d4a52ef 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -173,7 +173,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
}
} else {
ret = vfio_container_dma_unmap(bcontainer, iova,
- iotlb->addr_mask + 1, iotlb);
+ iotlb->addr_mask + 1, iotlb, 0);
if (ret) {
error_setg(&local_err,
"vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
@@ -202,7 +202,7 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
int ret;
/* Unmap with a single call. */
- ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL);
+ ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL, 0);
if (ret) {
error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
strerror(-ret));
@@ -665,7 +665,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
/* The unmap ioctl doesn't accept a full 64-bit span. */
llsize = int128_rshift(llsize, 1);
ret = vfio_container_dma_unmap(bcontainer, iova,
- int128_get64(llsize), NULL);
+ int128_get64(llsize), NULL, 0);
if (ret) {
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
@@ -675,7 +675,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
iova += int128_get64(llsize);
}
ret = vfio_container_dma_unmap(bcontainer, iova,
- int128_get64(llsize), NULL);
+ int128_get64(llsize), NULL, 0);
if (ret) {
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 67373e8db0..6eaf2b2430 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -81,7 +81,7 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
void *vaddr, bool readonly);
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb);
+ IOMMUTLBEntry *iotlb, int flags);
bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
Error **errp);
@@ -127,7 +127,7 @@ struct VFIOIOMMUClass {
void *vaddr, bool readonly);
int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb);
+ IOMMUTLBEntry *iotlb, int flags);
bool (*attach_device)(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
void (*detach_device)(VFIODevice *vbasedev);
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (5 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 06/14] vfio: add flags parameter to DMA unmap callback John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 17:01 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 08/14] vfio: add vfio-pci-base class John Levon
` (7 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Use the new flags parameter to indicate when we want to unmap
everything; no functional change is intended.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/container.c | 49 ++++++++++++++++++++++++++++++++++++---------
hw/vfio/iommufd.c | 19 +++++++++++++++++-
hw/vfio/listener.c | 19 ++++++------------
3 files changed, 63 insertions(+), 24 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 625bbe82a7..37b1217fd8 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -119,12 +119,9 @@ unmap_exit:
return ret;
}
-/*
- * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
- */
-static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
- hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb, int flags)
+static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
@@ -138,10 +135,6 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
int ret;
Error *local_err = NULL;
- if (flags != 0) {
- return -ENOTSUP;
- }
-
if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
bcontainer->dirty_pages_supported) {
@@ -185,6 +178,42 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
return 0;
}
+/*
+ * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+ */
+static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb, int flags)
+{
+ int ret;
+
+ if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
+ return -ENOTSUP;
+ }
+
+ if (flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+ /* The unmap ioctl doesn't accept a full 64-bit span. */
+ Int128 llsize = int128_rshift(int128_2_64(), 1);
+
+ ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize),
+ iotlb);
+
+ if (ret == 0) {
+ ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize),
+ int128_get64(llsize), iotlb);
+ }
+
+ } else {
+ ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb);
+ }
+
+ if (ret != 0) {
+ return -errno;
+ }
+
+ return 0;
+}
+
static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 07334e65b5..22e5b16967 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -51,10 +51,27 @@ static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
- if (flags != 0) {
+ if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
return -ENOTSUP;
}
+ /* unmap in halves */
+ if (flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+ Int128 llsize = int128_rshift(int128_2_64(), 1);
+ int ret;
+
+ ret = iommufd_backend_unmap_dma(container->be, container->ioas_id,
+ 0, int128_get64(llsize));
+
+ if (ret == 0) {
+ ret = iommufd_backend_unmap_dma(container->be, container->ioas_id,
+ int128_get64(llsize),
+ int128_get64(llsize));
+ }
+
+ return ret;
+ }
+
/* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
return iommufd_backend_unmap_dma(container->be,
container->ioas_id, iova, size);
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index c52d4a52ef..bcf2b98e79 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -661,21 +661,14 @@ static void vfio_listener_region_del(MemoryListener *listener,
}
if (try_unmap) {
+ int flags = 0;
+
if (int128_eq(llsize, int128_2_64())) {
- /* The unmap ioctl doesn't accept a full 64-bit span. */
- llsize = int128_rshift(llsize, 1);
- ret = vfio_container_dma_unmap(bcontainer, iova,
- int128_get64(llsize), NULL, 0);
- if (ret) {
- error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%s)",
- bcontainer, iova, int128_get64(llsize), ret,
- strerror(-ret));
- }
- iova += int128_get64(llsize);
+ flags = VFIO_DMA_UNMAP_FLAG_ALL;
+ llsize = int128_zero();
}
- ret = vfio_container_dma_unmap(bcontainer, iova,
- int128_get64(llsize), NULL, 0);
+ ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
+ NULL, flags);
if (ret) {
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-09 13:48 ` [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback John Levon
@ 2025-04-23 17:01 ` Cédric Le Goater
2025-04-23 17:17 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 17:01 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Use the new flags parameter to indicate when we want to unmap
> everything; no functional change is intended.
I find these changes confusing. Most likely they are not well presented
or I am missing something. Some more below.
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/container.c | 49 ++++++++++++++++++++++++++++++++++++---------
> hw/vfio/iommufd.c | 19 +++++++++++++++++-
> hw/vfio/listener.c | 19 ++++++------------
> 3 files changed, 63 insertions(+), 24 deletions(-)
>
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 625bbe82a7..37b1217fd8 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -119,12 +119,9 @@ unmap_exit:
> return ret;
> }
>
> -/*
> - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> - */
> -static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> - hwaddr iova, ram_addr_t size,
> - IOMMUTLBEntry *iotlb, int flags)
> +static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer,
> + hwaddr iova, ram_addr_t size,
> + IOMMUTLBEntry *iotlb)
> {
> const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
> bcontainer);
> @@ -138,10 +135,6 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> int ret;
> Error *local_err = NULL;
>
> - if (flags != 0) {
> - return -ENOTSUP;
> - }
> -
> if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
> if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
> bcontainer->dirty_pages_supported) {
> @@ -185,6 +178,42 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> return 0;
> }
>
> +/*
> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> + */
> +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> + hwaddr iova, ram_addr_t size,
> + IOMMUTLBEntry *iotlb, int flags)
> +{
> + int ret;
> +
> + if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
VFIO_DMA_UNMAP_FLAG_ALL is a kernel uapi flag. It should be used only with
the corresponding ioctl(VFIO_IOMMU_UNMAP_DMA) and not internally between
QEMU routines.
I think adding a 'bool unmap_all' parameter to vfio_legacy_dma_unmap() would
make more sense.
> + return -ENOTSUP;
> + }
> +
> + if (flags & VFIO_DMA_UNMAP_FLAG_ALL) {
> + /* The unmap ioctl doesn't accept a full 64-bit span. */
> + Int128 llsize = int128_rshift(int128_2_64(), 1);
> +
> + ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize),
> + iotlb);
> +
> + if (ret == 0) {
> + ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize),
> + int128_get64(llsize), iotlb);
> + }
> +
> + } else {
> + ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb);
> + }
> +
> + if (ret != 0) {
> + return -errno;
> + }
> +
> + return 0;
> +}
> +
> static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> ram_addr_t size, void *vaddr, bool readonly)
> {
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 07334e65b5..22e5b16967 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -51,10 +51,27 @@ static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
> const VFIOIOMMUFDContainer *container =
> container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
>
> - if (flags != 0) {
> + if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
> return -ENOTSUP;
> }
>
> + /* unmap in halves */
> + if (flags & VFIO_DMA_UNMAP_FLAG_ALL) {
> + Int128 llsize = int128_rshift(int128_2_64(), 1);
> + int ret;
> +
> + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id,
> + 0, int128_get64(llsize));
> +
> + if (ret == 0) {
> + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id,
> + int128_get64(llsize),
> + int128_get64(llsize));
> + }
> +
> + return ret;
> + }
> +
> /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
> return iommufd_backend_unmap_dma(container->be,
> container->ioas_id, iova, size);
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index c52d4a52ef..bcf2b98e79 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -661,21 +661,14 @@ static void vfio_listener_region_del(MemoryListener *listener,
> }
>
> if (try_unmap) {
> + int flags = 0;
> +
> if (int128_eq(llsize, int128_2_64())) {
> - /* The unmap ioctl doesn't accept a full 64-bit span. */
> - llsize = int128_rshift(llsize, 1);
> - ret = vfio_container_dma_unmap(bcontainer, iova,
> - int128_get64(llsize), NULL, 0);
> - if (ret) {
> - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> - "0x%"HWADDR_PRIx") = %d (%s)",
> - bcontainer, iova, int128_get64(llsize), ret,
> - strerror(-ret));
> - }
> - iova += int128_get64(llsize);
> + flags = VFIO_DMA_UNMAP_FLAG_ALL;
> + llsize = 0;
please change this initialization to :
llsize = int128_zero();
> }
> - ret = vfio_container_dma_unmap(bcontainer, iova,
> - int128_get64(llsize), NULL, 0);
> + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
> + NULL, flags);
Why not unmap the halves here instead of in the backends ?
Thanks,
C.
> if (ret) {
> error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> "0x%"HWADDR_PRIx") = %d (%s)",
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-23 17:01 ` Cédric Le Goater
@ 2025-04-23 17:17 ` John Levon
2025-04-24 17:16 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-23 17:17 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Wed, Apr 23, 2025 at 07:01:23PM +0200, Cédric Le Goater wrote:
> On 4/9/25 15:48, John Levon wrote:
> > Use the new flags parameter to indicate when we want to unmap
> > everything; no functional change is intended.
>
> I find these changes confusing. Most likely they are not well presented
> or I am missing something. Some more below.
I don't see any way to further break up the change unfortunately.
> > +/*
> > + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> > + */
> > +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> > + hwaddr iova, ram_addr_t size,
> > + IOMMUTLBEntry *iotlb, int flags)
> > +{
> > + int ret;
> > +
> > + if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
>
> VFIO_DMA_UNMAP_FLAG_ALL is a kernel uapi flag. It should be used only with
> the corresponding ioctl(VFIO_IOMMU_UNMAP_DMA) and not internally between
> QEMU routines.
Happy to use a different define for the flags if you like, but surely it's
better to have a flags field so it's extendable and it's always clear what the
meaning is? Problem with a boolean is you just see "true" or "false" in the
caller and have no real idea what it means until you look it up.
> I think adding a 'bool unmap_all' parameter to vfio_legacy_dma_unmap() would
> make more sense.
Having said that I'm OK with going back to just a simple boolean if you'd really
prefer.
> > }
> > - ret = vfio_container_dma_unmap(bcontainer, iova,
> > - int128_get64(llsize), NULL, 0);
> > + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
> > + NULL, flags);
>
> Why not unmap the halves here instead of in the backends ?
The whole point of the change is that right now the generic listener.c code has
a workaround that is specific to one particular backend. vfio-user doesn't have
any need to unmap in halves and in fact *has* to pass an "unmap all" flag.
In theory, neither does vfio if the flag is supported, but I dropped that patch
as I couldn't figure out a clean way to use it WRT the dirty tracking code.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-23 17:17 ` John Levon
@ 2025-04-24 17:16 ` Cédric Le Goater
2025-04-24 19:35 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 17:16 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/23/25 19:17, John Levon wrote:
> On Wed, Apr 23, 2025 at 07:01:23PM +0200, Cédric Le Goater wrote:
>
>> On 4/9/25 15:48, John Levon wrote:
>>> Use the new flags parameter to indicate when we want to unmap
>>> everything; no functional change is intended.
>>
>> I find these changes confusing. Most likely they are not well presented
>> or I am missing something. Some more below.
>
> I don't see any way to further break up the change unfortunately.
>
>>> +/*
>>> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>>> + */
>>> +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
>>> + hwaddr iova, ram_addr_t size,
>>> + IOMMUTLBEntry *iotlb, int flags)
>>> +{
>>> + int ret;
>>> +
>>> + if ((flags & ~(VFIO_DMA_UNMAP_FLAG_ALL)) != 0) {
>>
>> VFIO_DMA_UNMAP_FLAG_ALL is a kernel uapi flag. It should be used only with
>> the corresponding ioctl(VFIO_IOMMU_UNMAP_DMA) and not internally between
>> QEMU routines.
>
> Happy to use a different define for the flags if you like, but surely it's
> better to have a flags field so it's extendable and it's always clear what the
> meaning is? Problem with a boolean is you just see "true" or "false" in the
> caller and have no real idea what it means until you look it up.
>
>> I think adding a 'bool unmap_all' parameter to vfio_legacy_dma_unmap() would
>> make more sense.
>
> Having said that I'm OK with going back to just a simple boolean if you'd really
> prefer.
yes. VFIO_DMA_UNMAP_FLAG_ALL is a kernel interface and we don't
need more than one flag today.
>>> }
>>> - ret = vfio_container_dma_unmap(bcontainer, iova,
>>> - int128_get64(llsize), NULL, 0);
>>> + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
>>> + NULL, flags);
>>
>> Why not unmap the halves here instead of in the backends ?
>
> The whole point of the change is that right now the generic listener.c code has
> a workaround that is specific to one particular backend.
It's due to the ARM IO space size AFAICT.
> vfio-user doesn't have
> any need to unmap in halves and in fact *has* to pass an "unmap all" flag.
OK. So this flag is a vfio-user requirement. Why can't we call
vfio_container_dma_unmap() twice from vfio_listener_region_del() ?
Thanks,
C.
> In theory, neither does vfio if the flag is supported, but I dropped that patch
> as I couldn't figure out a clean way to use it WRT the dirty tracking code.
>
> regards
> john
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-24 17:16 ` Cédric Le Goater
@ 2025-04-24 19:35 ` John Levon
2025-04-28 11:41 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-24 19:35 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Thu, Apr 24, 2025 at 07:16:52PM +0200, Cédric Le Goater wrote:
> > Having said that I'm OK with going back to just a simple boolean if you'd really
> > prefer.
>
> yes. VFIO_DMA_UNMAP_FLAG_ALL is a kernel interface and we don't
> need more than one flag today.
OK
> > > Why not unmap the halves here instead of in the backends ?
> >
> > The whole point of the change is that right now the generic listener.c code has
> > a workaround that is specific to one particular backend.
>
> It's due to the ARM IO space size AFAICT.
>
> > vfio-user doesn't have
> > any need to unmap in halves and in fact *has* to pass an "unmap all" flag.
>
> OK. So this flag is a vfio-user requirement. Why can't we call
> vfio_container_dma_unmap() twice from vfio_listener_region_del() ?
Are you suggesting that the vfio-user backend - and the protocol - somehow
accounts for the two unmaps and translates it back into an unmap all? How would
that work?
Surely it's very ugly indeed to embed a foible of the (old) vfio kernel
interface into every backend.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
2025-04-24 19:35 ` John Levon
@ 2025-04-28 11:41 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-28 11:41 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/24/25 21:35, John Levon wrote:
> On Thu, Apr 24, 2025 at 07:16:52PM +0200, Cédric Le Goater wrote:
>
>>> Having said that I'm OK with going back to just a simple boolean if you'd really
>>> prefer.
>>
>> yes. VFIO_DMA_UNMAP_FLAG_ALL is a kernel interface and we don't
>> need more than one flag today.
>
> OK
>
>>>> Why not unmap the halves here instead of in the backends ?
>>>
>>> The whole point of the change is that right now the generic listener.c code has
>>> a workaround that is specific to one particular backend.
>>
>> It's due to the ARM IO space size AFAICT.
>>
>>> vfio-user doesn't have
>>> any need to unmap in halves and in fact *has* to pass an "unmap all" flag.
>>
>> OK. So this flag is a vfio-user requirement. Why can't we call
>> vfio_container_dma_unmap() twice from vfio_listener_region_del() ?
>
> Are you suggesting that the vfio-user backend - and the protocol - somehow
> accounts for the two unmaps and translates it back into an unmap all? How would
> that work?
OK. Let's keep it that way. It's not too invasive a change.
Thanks,
C.
>
> Surely it's very ugly indeed to embed a foible of the (old) vfio kernel
> interface into every backend.
>
> regards
> john
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (6 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 07/14] vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-24 15:17 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 09/14] vfio: add vfio_device_get_irq_info() helper John Levon
` (6 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon, John Johnson,
Elena Ufimtseva, Jagannathan Raman
Split out parts of TYPE_VFIO_PCI into a base TYPE_VFIO_PCI_BASE. The
base type contains properties generic to all vfio-pci implementations
(although we have not yet introduced another subclass).
Note that currently there is no need for additional data for
TYPE_VFIO_PCI, so it shares the same C struct type as
TYPE_VFIO_PCI_BASE, VFIOPCIDevice.
Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/device.c | 2 +-
hw/vfio/pci.c | 266 ++++++++++++++++++++++++++---------------------
hw/vfio/pci.h | 12 ++-
3 files changed, 156 insertions(+), 124 deletions(-)
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index f74b9c25ea..b9473878fc 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -361,7 +361,7 @@ bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp)
VFIODevice *vfio_get_vfio_device(Object *obj)
{
if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
- return &VFIO_PCI(obj)->vbasedev;
+ return &VFIO_PCI_BASE(obj)->vbasedev;
} else {
return NULL;
}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 81bf0dab28..090b2f2ef0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -241,7 +241,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
static void vfio_intx_routing_notifier(PCIDevice *pdev)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
PCIINTxRoute route;
if (vdev->interrupt != VFIO_INT_INTx) {
@@ -516,7 +516,7 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
MSIMessage *msg, IOHandler *handler)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIOMSIVector *vector;
int ret;
bool resizing = !!(vdev->nr_vectors < nr + 1);
@@ -621,7 +621,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIOMSIVector *vector = &vdev->msi_vectors[nr];
trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
@@ -1169,7 +1169,7 @@ static const MemoryRegionOps vfio_vga_ops = {
*/
static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIORegion *region = &vdev->bars[bar].region;
MemoryRegion *mmap_mr, *region_mr, *base_mr;
PCIIORegion *r;
@@ -1215,7 +1215,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
*/
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
@@ -1248,7 +1248,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
void vfio_pci_write_config(PCIDevice *pdev,
uint32_t addr, uint32_t val, int len)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
uint32_t val_le = cpu_to_le32(val);
trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
@@ -3091,7 +3091,7 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
static void vfio_realize(PCIDevice *pdev, Error **errp)
{
ERRP_GUARD();
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIODevice *vbasedev = &vdev->vbasedev;
int i, ret;
char uuid[UUID_STR_LEN];
@@ -3260,7 +3260,7 @@ error:
static void vfio_instance_finalize(Object *obj)
{
- VFIOPCIDevice *vdev = VFIO_PCI(obj);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
vfio_display_finalize(vdev);
vfio_bars_finalize(vdev);
@@ -3278,7 +3278,7 @@ static void vfio_instance_finalize(Object *obj)
static void vfio_exitfn(PCIDevice *pdev)
{
- VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
VFIODevice *vbasedev = &vdev->vbasedev;
vfio_unregister_req_notifier(vdev);
@@ -3302,7 +3302,7 @@ static void vfio_exitfn(PCIDevice *pdev)
static void vfio_pci_reset(DeviceState *dev)
{
- VFIOPCIDevice *vdev = VFIO_PCI(dev);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
trace_vfio_pci_reset(vdev->vbasedev.name);
@@ -3342,7 +3342,7 @@ post_reset:
static void vfio_instance_init(Object *obj)
{
PCIDevice *pci_dev = PCI_DEVICE(obj);
- VFIOPCIDevice *vdev = VFIO_PCI(obj);
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
VFIODevice *vbasedev = &vdev->vbasedev;
device_add_bootindex_property(obj, &vdev->bootindex,
@@ -3365,32 +3365,15 @@ static void vfio_instance_init(Object *obj)
static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
-static const Property vfio_pci_dev_properties[] = {
- DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
- DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
- DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
+static const Property vfio_pci_base_dev_properties[] = {
DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
vbasedev.pre_copy_dirty_page_tracking,
ON_OFF_AUTO_ON),
DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
vbasedev.device_dirty_page_tracking,
ON_OFF_AUTO_ON),
- DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
- display, ON_OFF_AUTO_OFF),
- DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
- DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
intx.mmap_timeout, 1100),
- DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
- VFIO_FEATURE_ENABLE_VGA_BIT, false),
- DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
- VFIO_FEATURE_ENABLE_REQ_BIT, true),
- DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
- VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
- DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
- VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
- DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
- igd_legacy_mode, ON_OFF_AUTO_AUTO),
DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
@@ -3405,8 +3388,6 @@ static const Property vfio_pci_dev_properties[] = {
DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
- DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
- no_geforce_quirks, false),
DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
false),
DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
@@ -3417,61 +3398,55 @@ static const Property vfio_pci_dev_properties[] = {
sub_vendor_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
sub_device_id, PCI_ANY_ID),
- DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
- DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
- nv_gpudirect_clique,
- qdev_prop_nv_gpudirect_clique, uint8_t),
DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
OFF_AUTO_PCIBAR_OFF),
-#ifdef CONFIG_IOMMUFD
- DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
- TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
-#endif
- DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
};
-#ifdef CONFIG_IOMMUFD
-static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
-{
- vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp);
-}
-#endif
-static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
+static void vfio_pci_base_dev_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
- device_class_set_legacy_reset(dc, vfio_pci_reset);
- device_class_set_props(dc, vfio_pci_dev_properties);
-#ifdef CONFIG_IOMMUFD
- object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
-#endif
- dc->desc = "VFIO-based PCI device assignment";
+ device_class_set_props(dc, vfio_pci_base_dev_properties);
+ dc->desc = "VFIO PCI base device";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
- pdc->realize = vfio_realize;
pdc->exit = vfio_exitfn;
pdc->config_read = vfio_pci_read_config;
pdc->config_write = vfio_pci_write_config;
- object_class_property_set_description(klass, /* 1.3 */
- "host",
- "Host PCI address [domain:]<bus:slot.function> of assigned device");
+ object_class_property_set_description(klass, /* 5.2 */
+ "x-pre-copy-dirty-page-tracking",
+ "Disable dirty pages tracking during iterative phase "
+ "(DEBUG)");
+ object_class_property_set_description(klass, /* 9.1 */
+ "x-device-dirty-page-tracking",
+ "Disable device dirty page tracking and use "
+ "container-based dirty page tracking");
object_class_property_set_description(klass, /* 1.3 */
"x-intx-mmap-timeout-ms",
"When EOI is not provided by KVM/QEMU, wait time "
"(milliseconds) to re-enable device direct access "
"after INTx (DEBUG)");
- object_class_property_set_description(klass, /* 1.5 */
- "x-vga",
- "Expose VGA address spaces for device");
- object_class_property_set_description(klass, /* 2.3 */
- "x-req",
- "Disable device request notification support (DEBUG)");
+ object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
+ "enable-migration",
+ "Enale device migration. Also requires a host VFIO PCI "
+ "variant or mdev driver with migration support enabled");
+ object_class_property_set_description(klass, /* 10.0 */
+ "x-migration-multifd-transfer",
+ "Transfer this device state via "
+ "multifd channels when live migrating it");
+ object_class_property_set_description(klass, /* 9.1 */
+ "migration-events",
+ "Emit VFIO migration QAPI event when a VFIO device "
+ "changes its migration state. For management applications");
object_class_property_set_description(klass, /* 2.4 and 2.5 */
"x-no-mmap",
"Disable MMAP for device. Allows to trace MMIO "
"accesses (DEBUG)");
+ object_class_property_set_description(klass, /* 3.1 */
+ "x-balloon-allowed",
+ "Override allowing ballooning with device (DEBUG, DANGER)");
object_class_property_set_description(klass, /* 2.5 */
"x-no-kvm-intx",
"Disable direct VFIO->KVM INTx injection. Allows to "
@@ -3484,6 +3459,13 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
"x-no-kvm-msix",
"Disable direct VFIO->KVM MSIx injection. Allows to "
"trace MSIx interrupts (DEBUG)");
+ object_class_property_set_description(klass, /* 3.0 */
+ "x-no-kvm-ioeventfd",
+ "Disable registration of ioeventfds with KVM (DEBUG)");
+ object_class_property_set_description(klass, /* 3.0 */
+ "x-no-vfio-ioeventfd",
+ "Disable linking of KVM ioeventfds to VFIO ioeventfds "
+ "(DEBUG)");
object_class_property_set_description(klass, /* 2.5 */
"x-pci-vendor-id",
"Override PCI Vendor ID with provided value (DEBUG)");
@@ -3498,95 +3480,136 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
"x-pci-sub-device-id",
"Override PCI Subsystem Device ID with provided value "
"(DEBUG)");
+ object_class_property_set_description(klass, /* 2.12 */
+ "x-msix-relocation",
+ "Specify MSI-X MMIO relocation to the end of specified "
+ "existing BAR or new BAR to avoid virtualization overhead "
+ "due to adjacent device registers");
+}
+
+static const TypeInfo vfio_pci_base_dev_info = {
+ .name = TYPE_VFIO_PCI_BASE,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = 0,
+ .abstract = true,
+ .class_init = vfio_pci_base_dev_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_PCIE_DEVICE },
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { }
+ },
+};
+
+static const Property vfio_pci_dev_properties[] = {
+ DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
+ DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
+ DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
+ DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
+ display, ON_OFF_AUTO_OFF),
+ DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
+ DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
+ DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
+ VFIO_FEATURE_ENABLE_VGA_BIT, false),
+ DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
+ VFIO_FEATURE_ENABLE_REQ_BIT, true),
+ DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
+ VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
+ DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
+ DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
+ VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
+ DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
+ igd_legacy_mode, ON_OFF_AUTO_AUTO),
+ DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
+ no_geforce_quirks, false),
+ DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
+ nv_gpudirect_clique,
+ qdev_prop_nv_gpudirect_clique, uint8_t),
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
+ DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
+};
+
+#ifdef CONFIG_IOMMUFD
+static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
+{
+ VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
+ vfio_device_set_fd(&vdev->vbasedev, str, errp);
+}
+#endif
+
+static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
+
+ device_class_set_legacy_reset(dc, vfio_pci_reset);
+ device_class_set_props(dc, vfio_pci_dev_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
+#endif
+ dc->desc = "VFIO-based PCI device assignment";
+ pdc->realize = vfio_realize;
+
+ object_class_property_set_description(klass, /* 1.3 */
+ "host",
+ "Host PCI address [domain:]<bus:slot.function> of assigned device");
+ object_class_property_set_description(klass, /* 8.1 */
+ "vf-token",
+ "Specify UUID VF token. Required for VF when PF is owned "
+ "by another VFIO driver");
object_class_property_set_description(klass, /* 2.6 */
"sysfsdev",
"Host sysfs path of assigned device");
+ object_class_property_set_description(klass, /* 2.12 */
+ "display",
+ "Enable display support for device, ex. vGPU");
+ object_class_property_set_description(klass, /* 3.2 */
+ "xres",
+ "Set X display resolution the vGPU should use");
+ object_class_property_set_description(klass, /* 3.2 */
+ "yres",
+ "Set Y display resolution the vGPU should use");
+ object_class_property_set_description(klass, /* 1.5 */
+ "x-vga",
+ "Expose VGA address spaces for device");
+ object_class_property_set_description(klass, /* 2.3 */
+ "x-req",
+ "Disable device request notification support (DEBUG)");
object_class_property_set_description(klass, /* 2.7 */
"x-igd-opregion",
"Expose host IGD OpRegion to guest");
object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
"x-igd-gms",
"Override IGD data stolen memory size (32MiB units)");
- object_class_property_set_description(klass, /* 2.11 */
- "x-nv-gpudirect-clique",
- "Add NVIDIA GPUDirect capability indicating P2P DMA "
- "clique for device [0-15]");
object_class_property_set_description(klass, /* 2.12 */
"x-no-geforce-quirks",
"Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
"Improves performance");
- object_class_property_set_description(klass, /* 2.12 */
- "display",
- "Enable display support for device, ex. vGPU");
- object_class_property_set_description(klass, /* 2.12 */
- "x-msix-relocation",
- "Specify MSI-X MMIO relocation to the end of specified "
- "existing BAR or new BAR to avoid virtualization overhead "
- "due to adjacent device registers");
- object_class_property_set_description(klass, /* 3.0 */
- "x-no-kvm-ioeventfd",
- "Disable registration of ioeventfds with KVM (DEBUG)");
- object_class_property_set_description(klass, /* 3.0 */
- "x-no-vfio-ioeventfd",
- "Disable linking of KVM ioeventfds to VFIO ioeventfds "
- "(DEBUG)");
- object_class_property_set_description(klass, /* 3.1 */
- "x-balloon-allowed",
- "Override allowing ballooning with device (DEBUG, DANGER)");
- object_class_property_set_description(klass, /* 3.2 */
- "xres",
- "Set X display resolution the vGPU should use");
- object_class_property_set_description(klass, /* 3.2 */
- "yres",
- "Set Y display resolution the vGPU should use");
- object_class_property_set_description(klass, /* 5.2 */
- "x-pre-copy-dirty-page-tracking",
- "Disable dirty pages tracking during iterative phase "
- "(DEBUG)");
- object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
- "enable-migration",
- "Enale device migration. Also requires a host VFIO PCI "
- "variant or mdev driver with migration support enabled");
- object_class_property_set_description(klass, /* 8.1 */
- "vf-token",
- "Specify UUID VF token. Required for VF when PF is owned "
- "by another VFIO driver");
+ object_class_property_set_description(klass, /* 2.11 */
+ "x-nv-gpudirect-clique",
+ "Add NVIDIA GPUDirect capability indicating P2P DMA "
+ "clique for device [0-15]");
#ifdef CONFIG_IOMMUFD
object_class_property_set_description(klass, /* 9.0 */
"iommufd",
"Set host IOMMUFD backend device");
#endif
- object_class_property_set_description(klass, /* 9.1 */
- "x-device-dirty-page-tracking",
- "Disable device dirty page tracking and use "
- "container-based dirty page tracking");
- object_class_property_set_description(klass, /* 9.1 */
- "migration-events",
- "Emit VFIO migration QAPI event when a VFIO device "
- "changes its migration state. For management applications");
object_class_property_set_description(klass, /* 9.1 */
"skip-vsc-check",
"Skip config space check for Vendor Specific Capability. "
"Setting to false will enforce strict checking of VSC content "
"(DEBUG)");
- object_class_property_set_description(klass, /* 10.0 */
- "x-migration-multifd-transfer",
- "Transfer this device state via "
- "multifd channels when live migrating it");
}
static const TypeInfo vfio_pci_dev_info = {
.name = TYPE_VFIO_PCI,
- .parent = TYPE_PCI_DEVICE,
+ .parent = TYPE_VFIO_PCI_BASE,
.instance_size = sizeof(VFIOPCIDevice),
.class_init = vfio_pci_dev_class_init,
.instance_init = vfio_instance_init,
.instance_finalize = vfio_instance_finalize,
- .interfaces = (InterfaceInfo[]) {
- { INTERFACE_PCIE_DEVICE },
- { INTERFACE_CONVENTIONAL_PCI_DEVICE },
- { }
- },
};
static const Property vfio_pci_dev_nohotplug_properties[] = {
@@ -3632,6 +3655,7 @@ static void register_vfio_pci_dev_type(void)
vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
+ type_register_static(&vfio_pci_base_dev_info);
type_register_static(&vfio_pci_dev_info);
type_register_static(&vfio_pci_nohotplug_dev_info);
}
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index cbea3be029..4000ba804c 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -118,8 +118,13 @@ typedef struct VFIOMSIXInfo {
bool noresize;
} VFIOMSIXInfo;
-#define TYPE_VFIO_PCI "vfio-pci"
-OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI)
+/*
+ * TYPE_VFIO_PCI_BASE is an abstract type used to share code
+ * between VFIO implementations that use a kernel driver
+ * with those that use user sockets.
+ */
+#define TYPE_VFIO_PCI_BASE "vfio-pci-base"
+OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE)
struct VFIOPCIDevice {
PCIDevice pdev;
@@ -187,6 +192,9 @@ struct VFIOPCIDevice {
Notifier irqchip_change_notifier;
};
+#define TYPE_VFIO_PCI "vfio-pci"
+/* TYPE_VFIO_PCI shares struct VFIOPCIDevice. */
+
/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
static inline bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-09 13:48 ` [PATCH 08/14] vfio: add vfio-pci-base class John Levon
@ 2025-04-24 15:17 ` Cédric Le Goater
2025-04-24 21:52 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 15:17 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/9/25 15:48, John Levon wrote:
> Split out parts of TYPE_VFIO_PCI into a base TYPE_VFIO_PCI_BASE. The
> base type contains properties generic to all vfio-pci implementations
> (although we have not yet introduced another subclass).
>
> Note that currently there is no need for additional data for
> TYPE_VFIO_PCI, so it shares the same C struct type as
> TYPE_VFIO_PCI_BASE, VFIOPCIDevice.
I don't understand how the properties are distributed between the
abstract vfio-pci base class and the vfio-pci class. What's the
rationale?
Can you remind me why the vfio-pci class for vfio-user cannot
inherit directly from vfio-pci?
Thanks,
C.
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/device.c | 2 +-
> hw/vfio/pci.c | 266 ++++++++++++++++++++++++++---------------------
> hw/vfio/pci.h | 12 ++-
> 3 files changed, 156 insertions(+), 124 deletions(-)
>
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index f74b9c25ea..b9473878fc 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -361,7 +361,7 @@ bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp)
> VFIODevice *vfio_get_vfio_device(Object *obj)
> {
> if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
> - return &VFIO_PCI(obj)->vbasedev;
> + return &VFIO_PCI_BASE(obj)->vbasedev;
> } else {
> return NULL;
> }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 81bf0dab28..090b2f2ef0 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -241,7 +241,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
>
> static void vfio_intx_routing_notifier(PCIDevice *pdev)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> PCIINTxRoute route;
>
> if (vdev->interrupt != VFIO_INT_INTx) {
> @@ -516,7 +516,7 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
> static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> MSIMessage *msg, IOHandler *handler)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIOMSIVector *vector;
> int ret;
> bool resizing = !!(vdev->nr_vectors < nr + 1);
> @@ -621,7 +621,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>
> static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIOMSIVector *vector = &vdev->msi_vectors[nr];
>
> trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
> @@ -1169,7 +1169,7 @@ static const MemoryRegionOps vfio_vga_ops = {
> */
> static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIORegion *region = &vdev->bars[bar].region;
> MemoryRegion *mmap_mr, *region_mr, *base_mr;
> PCIIORegion *r;
> @@ -1215,7 +1215,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
> */
> uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
>
> memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> @@ -1248,7 +1248,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr, uint32_t val, int len)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> uint32_t val_le = cpu_to_le32(val);
>
> trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> @@ -3091,7 +3091,7 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
> static void vfio_realize(PCIDevice *pdev, Error **errp)
> {
> ERRP_GUARD();
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIODevice *vbasedev = &vdev->vbasedev;
> int i, ret;
> char uuid[UUID_STR_LEN];
> @@ -3260,7 +3260,7 @@ error:
>
> static void vfio_instance_finalize(Object *obj)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(obj);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
>
> vfio_display_finalize(vdev);
> vfio_bars_finalize(vdev);
> @@ -3278,7 +3278,7 @@ static void vfio_instance_finalize(Object *obj)
>
> static void vfio_exitfn(PCIDevice *pdev)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> VFIODevice *vbasedev = &vdev->vbasedev;
>
> vfio_unregister_req_notifier(vdev);
> @@ -3302,7 +3302,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>
> static void vfio_pci_reset(DeviceState *dev)
> {
> - VFIOPCIDevice *vdev = VFIO_PCI(dev);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
>
> trace_vfio_pci_reset(vdev->vbasedev.name);
>
> @@ -3342,7 +3342,7 @@ post_reset:
> static void vfio_instance_init(Object *obj)
> {
> PCIDevice *pci_dev = PCI_DEVICE(obj);
> - VFIOPCIDevice *vdev = VFIO_PCI(obj);
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
> VFIODevice *vbasedev = &vdev->vbasedev;
>
> device_add_bootindex_property(obj, &vdev->bootindex,
> @@ -3365,32 +3365,15 @@ static void vfio_instance_init(Object *obj)
>
> static PropertyInfo vfio_pci_migration_multifd_transfer_prop;
>
> -static const Property vfio_pci_dev_properties[] = {
> - DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
> - DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
> - DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
> +static const Property vfio_pci_base_dev_properties[] = {
> DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
> vbasedev.pre_copy_dirty_page_tracking,
> ON_OFF_AUTO_ON),
> DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
> vbasedev.device_dirty_page_tracking,
> ON_OFF_AUTO_ON),
> - DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
> - display, ON_OFF_AUTO_OFF),
> - DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
> - DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
> DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
> intx.mmap_timeout, 1100),
> - DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
> - VFIO_FEATURE_ENABLE_VGA_BIT, false),
> - DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
> - VFIO_FEATURE_ENABLE_REQ_BIT, true),
> - DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
> - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
> - DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
> - VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
> - DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
> - igd_legacy_mode, ON_OFF_AUTO_AUTO),
> DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
> vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
> DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
> @@ -3405,8 +3388,6 @@ static const Property vfio_pci_dev_properties[] = {
> DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
> DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
> DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
> - DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
> - no_geforce_quirks, false),
> DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> false),
> DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> @@ -3417,61 +3398,55 @@ static const Property vfio_pci_dev_properties[] = {
> sub_vendor_id, PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
> sub_device_id, PCI_ANY_ID),
> - DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> - DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> - nv_gpudirect_clique,
> - qdev_prop_nv_gpudirect_clique, uint8_t),
> DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
> OFF_AUTO_PCIBAR_OFF),
> -#ifdef CONFIG_IOMMUFD
> - DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
> - TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
> -#endif
> - DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
> };
>
> -#ifdef CONFIG_IOMMUFD
> -static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
> -{
> - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp);
> -}
> -#endif
>
> -static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> +static void vfio_pci_base_dev_class_init(ObjectClass *klass, void *data)
> {
> DeviceClass *dc = DEVICE_CLASS(klass);
> PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
>
> - device_class_set_legacy_reset(dc, vfio_pci_reset);
> - device_class_set_props(dc, vfio_pci_dev_properties);
> -#ifdef CONFIG_IOMMUFD
> - object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
> -#endif
> - dc->desc = "VFIO-based PCI device assignment";
> + device_class_set_props(dc, vfio_pci_base_dev_properties);
> + dc->desc = "VFIO PCI base device";
> set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> - pdc->realize = vfio_realize;
> pdc->exit = vfio_exitfn;
> pdc->config_read = vfio_pci_read_config;
> pdc->config_write = vfio_pci_write_config;
>
> - object_class_property_set_description(klass, /* 1.3 */
> - "host",
> - "Host PCI address [domain:]<bus:slot.function> of assigned device");
> + object_class_property_set_description(klass, /* 5.2 */
> + "x-pre-copy-dirty-page-tracking",
> + "Disable dirty pages tracking during iterative phase "
> + "(DEBUG)");
> + object_class_property_set_description(klass, /* 9.1 */
> + "x-device-dirty-page-tracking",
> + "Disable device dirty page tracking and use "
> + "container-based dirty page tracking");
> object_class_property_set_description(klass, /* 1.3 */
> "x-intx-mmap-timeout-ms",
> "When EOI is not provided by KVM/QEMU, wait time "
> "(milliseconds) to re-enable device direct access "
> "after INTx (DEBUG)");
> - object_class_property_set_description(klass, /* 1.5 */
> - "x-vga",
> - "Expose VGA address spaces for device");
> - object_class_property_set_description(klass, /* 2.3 */
> - "x-req",
> - "Disable device request notification support (DEBUG)");
> + object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
> + "enable-migration",
> + "Enale device migration. Also requires a host VFIO PCI "
> + "variant or mdev driver with migration support enabled");
> + object_class_property_set_description(klass, /* 10.0 */
> + "x-migration-multifd-transfer",
> + "Transfer this device state via "
> + "multifd channels when live migrating it");
> + object_class_property_set_description(klass, /* 9.1 */
> + "migration-events",
> + "Emit VFIO migration QAPI event when a VFIO device "
> + "changes its migration state. For management applications");
> object_class_property_set_description(klass, /* 2.4 and 2.5 */
> "x-no-mmap",
> "Disable MMAP for device. Allows to trace MMIO "
> "accesses (DEBUG)");
> + object_class_property_set_description(klass, /* 3.1 */
> + "x-balloon-allowed",
> + "Override allowing ballooning with device (DEBUG, DANGER)");
> object_class_property_set_description(klass, /* 2.5 */
> "x-no-kvm-intx",
> "Disable direct VFIO->KVM INTx injection. Allows to "
> @@ -3484,6 +3459,13 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> "x-no-kvm-msix",
> "Disable direct VFIO->KVM MSIx injection. Allows to "
> "trace MSIx interrupts (DEBUG)");
> + object_class_property_set_description(klass, /* 3.0 */
> + "x-no-kvm-ioeventfd",
> + "Disable registration of ioeventfds with KVM (DEBUG)");
> + object_class_property_set_description(klass, /* 3.0 */
> + "x-no-vfio-ioeventfd",
> + "Disable linking of KVM ioeventfds to VFIO ioeventfds "
> + "(DEBUG)");
> object_class_property_set_description(klass, /* 2.5 */
> "x-pci-vendor-id",
> "Override PCI Vendor ID with provided value (DEBUG)");
> @@ -3498,95 +3480,136 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> "x-pci-sub-device-id",
> "Override PCI Subsystem Device ID with provided value "
> "(DEBUG)");
> + object_class_property_set_description(klass, /* 2.12 */
> + "x-msix-relocation",
> + "Specify MSI-X MMIO relocation to the end of specified "
> + "existing BAR or new BAR to avoid virtualization overhead "
> + "due to adjacent device registers");
> +}
> +
> +static const TypeInfo vfio_pci_base_dev_info = {
> + .name = TYPE_VFIO_PCI_BASE,
> + .parent = TYPE_PCI_DEVICE,
> + .instance_size = 0,
> + .abstract = true,
> + .class_init = vfio_pci_base_dev_class_init,
> + .interfaces = (InterfaceInfo[]) {
> + { INTERFACE_PCIE_DEVICE },
> + { INTERFACE_CONVENTIONAL_PCI_DEVICE },
> + { }
> + },
> +};
> +
> +static const Property vfio_pci_dev_properties[] = {
> + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
> + DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
> + DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
> + DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
> + display, ON_OFF_AUTO_OFF),
> + DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
> + DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
> + DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
> + VFIO_FEATURE_ENABLE_VGA_BIT, false),
> + DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
> + VFIO_FEATURE_ENABLE_REQ_BIT, true),
> + DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
> + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
> + DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> + DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
> + VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
> + DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
> + igd_legacy_mode, ON_OFF_AUTO_AUTO),
> + DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
> + no_geforce_quirks, false),
> + DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> + nv_gpudirect_clique,
> + qdev_prop_nv_gpudirect_clique, uint8_t),
> +#ifdef CONFIG_IOMMUFD
> + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
> + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
> +#endif
> + DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
> +};
> +
> +#ifdef CONFIG_IOMMUFD
> +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
> +{
> + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
> + vfio_device_set_fd(&vdev->vbasedev, str, errp);
> +}
> +#endif
> +
> +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> +{
> + DeviceClass *dc = DEVICE_CLASS(klass);
> + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
> +
> + device_class_set_legacy_reset(dc, vfio_pci_reset);
> + device_class_set_props(dc, vfio_pci_dev_properties);
> +#ifdef CONFIG_IOMMUFD
> + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
> +#endif
> + dc->desc = "VFIO-based PCI device assignment";
> + pdc->realize = vfio_realize;
> +
> + object_class_property_set_description(klass, /* 1.3 */
> + "host",
> + "Host PCI address [domain:]<bus:slot.function> of assigned device");
> + object_class_property_set_description(klass, /* 8.1 */
> + "vf-token",
> + "Specify UUID VF token. Required for VF when PF is owned "
> + "by another VFIO driver");
> object_class_property_set_description(klass, /* 2.6 */
> "sysfsdev",
> "Host sysfs path of assigned device");
> + object_class_property_set_description(klass, /* 2.12 */
> + "display",
> + "Enable display support for device, ex. vGPU");
> + object_class_property_set_description(klass, /* 3.2 */
> + "xres",
> + "Set X display resolution the vGPU should use");
> + object_class_property_set_description(klass, /* 3.2 */
> + "yres",
> + "Set Y display resolution the vGPU should use");
> + object_class_property_set_description(klass, /* 1.5 */
> + "x-vga",
> + "Expose VGA address spaces for device");
> + object_class_property_set_description(klass, /* 2.3 */
> + "x-req",
> + "Disable device request notification support (DEBUG)");
> object_class_property_set_description(klass, /* 2.7 */
> "x-igd-opregion",
> "Expose host IGD OpRegion to guest");
> object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */
> "x-igd-gms",
> "Override IGD data stolen memory size (32MiB units)");
> - object_class_property_set_description(klass, /* 2.11 */
> - "x-nv-gpudirect-clique",
> - "Add NVIDIA GPUDirect capability indicating P2P DMA "
> - "clique for device [0-15]");
> object_class_property_set_description(klass, /* 2.12 */
> "x-no-geforce-quirks",
> "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). "
> "Improves performance");
> - object_class_property_set_description(klass, /* 2.12 */
> - "display",
> - "Enable display support for device, ex. vGPU");
> - object_class_property_set_description(klass, /* 2.12 */
> - "x-msix-relocation",
> - "Specify MSI-X MMIO relocation to the end of specified "
> - "existing BAR or new BAR to avoid virtualization overhead "
> - "due to adjacent device registers");
> - object_class_property_set_description(klass, /* 3.0 */
> - "x-no-kvm-ioeventfd",
> - "Disable registration of ioeventfds with KVM (DEBUG)");
> - object_class_property_set_description(klass, /* 3.0 */
> - "x-no-vfio-ioeventfd",
> - "Disable linking of KVM ioeventfds to VFIO ioeventfds "
> - "(DEBUG)");
> - object_class_property_set_description(klass, /* 3.1 */
> - "x-balloon-allowed",
> - "Override allowing ballooning with device (DEBUG, DANGER)");
> - object_class_property_set_description(klass, /* 3.2 */
> - "xres",
> - "Set X display resolution the vGPU should use");
> - object_class_property_set_description(klass, /* 3.2 */
> - "yres",
> - "Set Y display resolution the vGPU should use");
> - object_class_property_set_description(klass, /* 5.2 */
> - "x-pre-copy-dirty-page-tracking",
> - "Disable dirty pages tracking during iterative phase "
> - "(DEBUG)");
> - object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */
> - "enable-migration",
> - "Enale device migration. Also requires a host VFIO PCI "
> - "variant or mdev driver with migration support enabled");
> - object_class_property_set_description(klass, /* 8.1 */
> - "vf-token",
> - "Specify UUID VF token. Required for VF when PF is owned "
> - "by another VFIO driver");
> + object_class_property_set_description(klass, /* 2.11 */
> + "x-nv-gpudirect-clique",
> + "Add NVIDIA GPUDirect capability indicating P2P DMA "
> + "clique for device [0-15]");
> #ifdef CONFIG_IOMMUFD
> object_class_property_set_description(klass, /* 9.0 */
> "iommufd",
> "Set host IOMMUFD backend device");
> #endif
> - object_class_property_set_description(klass, /* 9.1 */
> - "x-device-dirty-page-tracking",
> - "Disable device dirty page tracking and use "
> - "container-based dirty page tracking");
> - object_class_property_set_description(klass, /* 9.1 */
> - "migration-events",
> - "Emit VFIO migration QAPI event when a VFIO device "
> - "changes its migration state. For management applications");
> object_class_property_set_description(klass, /* 9.1 */
> "skip-vsc-check",
> "Skip config space check for Vendor Specific Capability. "
> "Setting to false will enforce strict checking of VSC content "
> "(DEBUG)");
> - object_class_property_set_description(klass, /* 10.0 */
> - "x-migration-multifd-transfer",
> - "Transfer this device state via "
> - "multifd channels when live migrating it");
> }
>
> static const TypeInfo vfio_pci_dev_info = {
> .name = TYPE_VFIO_PCI,
> - .parent = TYPE_PCI_DEVICE,
> + .parent = TYPE_VFIO_PCI_BASE,
> .instance_size = sizeof(VFIOPCIDevice),
> .class_init = vfio_pci_dev_class_init,
> .instance_init = vfio_instance_init,
> .instance_finalize = vfio_instance_finalize,
> - .interfaces = (InterfaceInfo[]) {
> - { INTERFACE_PCIE_DEVICE },
> - { INTERFACE_CONVENTIONAL_PCI_DEVICE },
> - { }
> - },
> };
>
> static const Property vfio_pci_dev_nohotplug_properties[] = {
> @@ -3632,6 +3655,7 @@ static void register_vfio_pci_dev_type(void)
> vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto;
> vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true;
>
> + type_register_static(&vfio_pci_base_dev_info);
> type_register_static(&vfio_pci_dev_info);
> type_register_static(&vfio_pci_nohotplug_dev_info);
> }
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index cbea3be029..4000ba804c 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -118,8 +118,13 @@ typedef struct VFIOMSIXInfo {
> bool noresize;
> } VFIOMSIXInfo;
>
> -#define TYPE_VFIO_PCI "vfio-pci"
> -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI)
> +/*
> + * TYPE_VFIO_PCI_BASE is an abstract type used to share code
> + * between VFIO implementations that use a kernel driver
> + * with those that use user sockets.
> + */
> +#define TYPE_VFIO_PCI_BASE "vfio-pci-base"
> +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE)
>
> struct VFIOPCIDevice {
> PCIDevice pdev;
> @@ -187,6 +192,9 @@ struct VFIOPCIDevice {
> Notifier irqchip_change_notifier;
> };
>
> +#define TYPE_VFIO_PCI "vfio-pci"
> +/* TYPE_VFIO_PCI shares struct VFIOPCIDevice. */
> +
> /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
> static inline bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
> {
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-24 15:17 ` Cédric Le Goater
@ 2025-04-24 21:52 ` John Levon
2025-04-25 12:46 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-24 21:52 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On Thu, Apr 24, 2025 at 05:17:28PM +0200, Cédric Le Goater wrote:
> !-------------------------------------------------------------------|
> CAUTION: External Email
>
> |-------------------------------------------------------------------!
>
> On 4/9/25 15:48, John Levon wrote:
> > Split out parts of TYPE_VFIO_PCI into a base TYPE_VFIO_PCI_BASE. The
> > base type contains properties generic to all vfio-pci implementations
> > (although we have not yet introduced another subclass).
> >
> > Note that currently there is no need for additional data for
> > TYPE_VFIO_PCI, so it shares the same C struct type as
> > TYPE_VFIO_PCI_BASE, VFIOPCIDevice.
>
> I don't understand how the properties are distributed between the
> abstract vfio-pci base class and the vfio-pci class. What's the
> rationale ?
The split is between properties that apply to all vfio-pci classes (these
go in the base class) and those that are specific to the kernel vfio-pci
implementation (these stay in vfio-pci).
> Can you remind me why the vfio-pci class for vfio-user can not
> inherit directly from vfio-pci ?
For the above reason: we'd inherit many properties that don't work for
vfio-user.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-24 21:52 ` John Levon
@ 2025-04-25 12:46 ` Cédric Le Goater
2025-04-25 13:01 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-25 12:46 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/24/25 23:52, John Levon wrote:
> On Thu, Apr 24, 2025 at 05:17:28PM +0200, Cédric Le Goater wrote:
>
>> !-------------------------------------------------------------------|
>> CAUTION: External Email
>>
>> |-------------------------------------------------------------------!
>>
>> On 4/9/25 15:48, John Levon wrote:
>>> Split out parts of TYPE_VFIO_PCI into a base TYPE_VFIO_PCI_BASE. The
>>> base type contains properties generic to all vfio-pci implementations
>>> (although we have not yet introduced another subclass).
>>>
>>> Note that currently there is no need for additional data for
>>> TYPE_VFIO_PCI, so it shares the same C struct type as
>>> TYPE_VFIO_PCI_BASE, VFIOPCIDevice.
>>
>> I don't understand how the properties are distributed between the
>> abstract vfio-pci base class and the vfio-pci class. What's the
>> rationale ?
>
> It's for properties that apply to all vfio pci classes, and those that are
> specific to the kernel vfio pci implementation.
It seems quite vague to me.
After this patch, here is what we have for the base class:
static const Property vfio_pci_base_dev_properties[] = {
DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
vbasedev.pre_copy_dirty_page_tracking,
ON_OFF_AUTO_ON),
DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
vbasedev.device_dirty_page_tracking,
ON_OFF_AUTO_ON),
DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
intx.mmap_timeout, 1100),
DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
vbasedev.migration_multifd_transfer,
vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
.set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
vbasedev.migration_events, false),
DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
vbasedev.ram_block_discard_allowed, false),
DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
false),
DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
false),
DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
sub_vendor_id, PCI_ANY_ID),
DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
sub_device_id, PCI_ANY_ID),
DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
OFF_AUTO_PCIBAR_OFF),
};
and for vfio-pci:
static const Property vfio_pci_dev_properties[] = {
DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
display, ON_OFF_AUTO_OFF),
DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
VFIO_FEATURE_ENABLE_VGA_BIT, false),
DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
VFIO_FEATURE_ENABLE_REQ_BIT, true),
DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
igd_legacy_mode, ON_OFF_AUTO_AUTO),
DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
no_geforce_quirks, false),
DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
nv_gpudirect_clique,
qdev_prop_nv_gpudirect_clique, uint8_t),
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
};
It seems that the graphics properties and the host device definitions are
excluded from the base class. This might fit vfio-user's needs, but it
looks like a quick hack from the vfio-pci side. It needs more work.
>> Can you remind me why the vfio-pci class for vfio-user can not
>> inherit directly from vfio-pci ?
>
> For the above reason: we'd inherit many properties that don't work for
> vfio-user.
What do you mean by "don't work"? Functionally irrelevant?
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-25 12:46 ` Cédric Le Goater
@ 2025-04-25 13:01 ` John Levon
2025-04-28 12:53 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-25 13:01 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On Fri, Apr 25, 2025 at 02:46:48PM +0200, Cédric Le Goater wrote:
> After this patch, here is what we have for the base class :
> static const Property vfio_pci_base_dev_properties[] = {
> DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
> vbasedev.pre_copy_dirty_page_tracking,
> ON_OFF_AUTO_ON),
> DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
> vbasedev.device_dirty_page_tracking,
> ON_OFF_AUTO_ON),
> DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
> intx.mmap_timeout, 1100),
> DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
> vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
> DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
> vbasedev.migration_multifd_transfer,
> vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
> .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
> DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
> vbasedev.migration_events, false),
> DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
> DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
> vbasedev.ram_block_discard_allowed, false),
> DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
> DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
> DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
> DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> false),
> DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> false),
> DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
> sub_vendor_id, PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
> sub_device_id, PCI_ANY_ID),
> DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
> OFF_AUTO_PCIBAR_OFF),
> };
> and for vfio-pci :
> static const Property vfio_pci_dev_properties[] = {
> DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
> DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
> DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
> DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
> display, ON_OFF_AUTO_OFF),
> DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
> DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
> DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
> VFIO_FEATURE_ENABLE_VGA_BIT, false),
> DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
> VFIO_FEATURE_ENABLE_REQ_BIT, true),
> DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
> VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
> DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
> DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
> VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
> DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
> igd_legacy_mode, ON_OFF_AUTO_AUTO),
> DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
> no_geforce_quirks, false),
> DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> nv_gpudirect_clique,
> qdev_prop_nv_gpudirect_clique, uint8_t),
> #ifdef CONFIG_IOMMUFD
> DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
> TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
> #endif
> DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
> };
> Graphic property and host device definitions are excluded from the
> base class it seems. This might fit vfio-user needs but it looks
> like a quick hack from the vfio-pci side. It needs more work.
Can you suggest a preferred approach? I'm definitely not wedded to the current
way (after all, I didn't write it !), but I'm not sure how else we could do it.
Perhaps if there's some way to deregister properties when vfio-user
instantiates?
> > > Can you remind me why the vfio-pci class for vfio-user can not
> > > inherit directly from vfio-pci ?
> >
> > For the above reason: we'd inherit many properties that don't work for
> > vfio-user.
>
> What do you mean by "don't work" ? functionally irrelevant ?
I don't know the answer to that in general. Certainly some are just irrelevant
(like sysfsdev), but it's entirely possible the other stuff actively breaks.
Presumably you agree it's not good to introduce potential footguns for users
here?
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/14] vfio: add vfio-pci-base class
2025-04-25 13:01 ` John Levon
@ 2025-04-28 12:53 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-28 12:53 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/25/25 15:01, John Levon wrote:
> On Fri, Apr 25, 2025 at 02:46:48PM +0200, Cédric Le Goater wrote:
>
>> After this patch, here is what we have for the base class :
>> static const Property vfio_pci_base_dev_properties[] = {
>> DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
>> vbasedev.pre_copy_dirty_page_tracking,
>> ON_OFF_AUTO_ON),
>> DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice,
>> vbasedev.device_dirty_page_tracking,
>> ON_OFF_AUTO_ON),
>> DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
>> intx.mmap_timeout, 1100),
>> DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
>> vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
>> DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice,
>> vbasedev.migration_multifd_transfer,
>> vfio_pci_migration_multifd_transfer_prop, OnOffAuto,
>> .set_default = true, .defval.i = ON_OFF_AUTO_AUTO),
>> DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice,
>> vbasedev.migration_events, false),
>> DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
>> DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
>> vbasedev.ram_block_discard_allowed, false),
>> DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
>> DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
>> DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
>> DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
>> false),
>> DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
>> false),
>> DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
>> DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
>> DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
>> sub_vendor_id, PCI_ANY_ID),
>> DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
>> sub_device_id, PCI_ANY_ID),
>> DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
>> OFF_AUTO_PCIBAR_OFF),
>> };
>> and for vfio-pci :
>> static const Property vfio_pci_dev_properties[] = {
>> DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
>> DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
>> DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
>> DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
>> display, ON_OFF_AUTO_OFF),
>> DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
>> DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
>> DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
>> VFIO_FEATURE_ENABLE_VGA_BIT, false),
>> DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
>> VFIO_FEATURE_ENABLE_REQ_BIT, true),
>> DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
>> VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
>> DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
>> DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features,
>> VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false),
>> DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice,
>> igd_legacy_mode, ON_OFF_AUTO_AUTO),
>> DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
>> no_geforce_quirks, false),
>> DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
>> nv_gpudirect_clique,
>> qdev_prop_nv_gpudirect_clique, uint8_t),
>> #ifdef CONFIG_IOMMUFD
>> DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
>> TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
>> #endif
>> DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
>> };
>> Graphic property and host device definitions are excluded from the
>> base class it seems. This might fit vfio-user needs but it looks
>> like a quick hack from the vfio-pci side. It needs more work.
>
> Can you suggest a preferred approach? I'm definitely not wedded to the current
> way (after all, I didn't write it !), but I'm not sure how else we could do it.
>
> Perhaps if there's some way to deregister properties when vfio-user
> instantiates?
>
>>>> Can you remind me why the vfio-pci class for vfio-user can not
>>>> inherit directly from vfio-pci ?
>>>
>>> For the above reason: we'd inherit many properties that don't work for
>>> vfio-user.
>>
>> What do you mean by "don't work" ? functionally irrelevant ?
>
> I don't know the answer to that in general. Certainly some are just irrelevant
> (like sysfsdev), but it's entirely possible the other stuff actively breaks.
> Presumably you agree it's not good to introduce potential footguns for users
> here?
Do we know which properties are required for the vfio-user variant of the
vfio-pci device ?
I'd be tempted to start with an empty abstract vfio-pci-base device class.
This wouldn't change the current vfio-pci device much, and the vfio-user
variant would duplicate the necessary properties.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 09/14] vfio: add vfio_device_get_irq_info() helper
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (7 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 08/14] vfio: add vfio-pci-base class John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-23 17:16 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 10/14] vfio: consistently handle return value for helpers John Levon
` (5 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Add a helper similar to vfio_device_get_region_info() and use it
everywhere.
Replace a couple of needless allocations with stack variables.
As a side-effect, this fixes a minor error reporting issue in the call
from vfio_msix_early_setup().
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/ap.c | 19 ++++++++++---------
hw/vfio/ccw.c | 20 +++++++++++---------
hw/vfio/device.c | 15 +++++++++++++++
hw/vfio/pci.c | 23 +++++++++++------------
hw/vfio/platform.c | 6 +++---
include/hw/vfio/vfio-device.h | 3 +++
6 files changed, 53 insertions(+), 33 deletions(-)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 4af7379d4f..f311bca5b6 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -74,10 +74,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
unsigned int irq, Error **errp)
{
int fd;
- size_t argsz;
+ int ret;
IOHandler *fd_read;
EventNotifier *notifier;
- g_autofree struct vfio_irq_info *irq_info = NULL;
+ struct vfio_irq_info irq_info;
VFIODevice *vdev = &vapdev->vdev;
switch (irq) {
@@ -96,14 +96,15 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
return false;
}
- argsz = sizeof(*irq_info);
- irq_info = g_malloc0(argsz);
- irq_info->index = irq;
- irq_info->argsz = argsz;
+ ret = vfio_device_get_irq_info(vdev, irq, &irq_info);
+
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vfio: Error getting irq info");
+ return false;
+ }
- if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO,
- irq_info) < 0 || irq_info->count < 1) {
- error_setg_errno(errp, errno, "vfio: Error getting irq info");
+ if (irq_info.count < 1) {
+ error_setg_errno(errp, EINVAL, "vfio: Error getting irq info, count=0");
return false;
}
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 98aa0000da..dac8769925 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -376,8 +376,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev,
Error **errp)
{
VFIODevice *vdev = &vcdev->vdev;
- g_autofree struct vfio_irq_info *irq_info = NULL;
- size_t argsz;
+ struct vfio_irq_info irq_info;
+ int ret;
int fd;
EventNotifier *notifier;
IOHandler *fd_read;
@@ -406,13 +406,15 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev,
return false;
}
- argsz = sizeof(*irq_info);
- irq_info = g_malloc0(argsz);
- irq_info->index = irq;
- irq_info->argsz = argsz;
- if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO,
- irq_info) < 0 || irq_info->count < 1) {
- error_setg_errno(errp, errno, "vfio: Error getting irq info");
+ ret = vfio_device_get_irq_info(vdev, irq, &irq_info);
+
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vfio: Error getting irq info");
+ return false;
+ }
+
+ if (irq_info.count < 1) {
+ error_setg_errno(errp, EINVAL, "vfio: Error getting irq info, count=0");
return false;
}
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index b9473878fc..2966171118 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -185,6 +185,21 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
return false;
}
+int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
+ struct vfio_irq_info *info)
+{
+ int ret;
+
+ memset(info, 0, sizeof(*info));
+
+ info->argsz = sizeof(*info);
+ info->index = index;
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
+
+ return ret < 0 ? -errno : ret;
+}
+
int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
struct vfio_region_info **info)
{
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 090b2f2ef0..ac53c43f2b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1555,8 +1555,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
uint16_t ctrl;
uint32_t table, pba;
int ret, fd = vdev->vbasedev.fd;
- struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
- .index = VFIO_PCI_MSIX_IRQ_INDEX };
+ struct vfio_irq_info irq_info;
VFIOMSIXInfo *msix;
pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
@@ -1593,7 +1592,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+ ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
+ &irq_info);
if (ret < 0) {
error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
g_free(msix);
@@ -2737,7 +2737,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
g_autofree struct vfio_region_info *reg_info = NULL;
- struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+ struct vfio_irq_info irq_info;
int i, ret = -1;
/* Sanity check device */
@@ -2798,12 +2798,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
}
}
- irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
-
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+ ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
if (ret) {
/* This can fail for an old kernel or legacy PCI dev */
- trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
+ trace_vfio_populate_device_get_irq_info_failure(strerror(-ret));
} else if (irq_info.count == 1) {
vdev->pci_aer = true;
} else {
@@ -2912,17 +2910,18 @@ static void vfio_req_notifier_handler(void *opaque)
static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
{
- struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
- .index = VFIO_PCI_REQ_IRQ_INDEX };
+ struct vfio_irq_info irq_info;
Error *err = NULL;
int32_t fd;
+ int ret;
if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
return;
}
- if (ioctl(vdev->vbasedev.fd,
- VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
+ ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
+ &irq_info);
+ if (ret < 0 || irq_info.count < 1) {
return;
}
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 877d69b7aa..fd176c18a4 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -475,10 +475,10 @@ static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp)
QSIMPLEQ_INIT(&vdev->pending_intp_queue);
for (i = 0; i < vbasedev->num_irqs; i++) {
- struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+ struct vfio_irq_info irq;
+
+ ret = vfio_device_get_irq_info(vbasedev, i, &irq);
- irq.index = i;
- ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
if (ret) {
error_setg_errno(errp, -ret, "failed to get device irq info");
goto irq_err;
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 3563a82ede..9522a09c48 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -144,6 +144,9 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
uint32_t subtype, struct vfio_region_info **info);
bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type);
+
+int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
+ struct vfio_irq_info *info);
#endif
/* Returns 0 on success, or a negative errno. */
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 09/14] vfio: add vfio_device_get_irq_info() helper
2025-04-09 13:48 ` [PATCH 09/14] vfio: add vfio_device_get_irq_info() helper John Levon
@ 2025-04-23 17:16 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-23 17:16 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Add a helper similar to vfio_device_get_region_info() and use it
> everywhere.
>
> Replace a couple of needless allocations with stack variables.
>
> As a side-effect, this fixes a minor error reporting issue in the call
> from vfio_msix_early_setup().
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Some comments below,
> ---
> hw/vfio/ap.c | 19 ++++++++++---------
> hw/vfio/ccw.c | 20 +++++++++++---------
> hw/vfio/device.c | 15 +++++++++++++++
> hw/vfio/pci.c | 23 +++++++++++------------
> hw/vfio/platform.c | 6 +++---
> include/hw/vfio/vfio-device.h | 3 +++
> 6 files changed, 53 insertions(+), 33 deletions(-)
>
> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
> index 4af7379d4f..f311bca5b6 100644
> --- a/hw/vfio/ap.c
> +++ b/hw/vfio/ap.c
> @@ -74,10 +74,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
> unsigned int irq, Error **errp)
> {
> int fd;
> - size_t argsz;
> + int ret;
> IOHandler *fd_read;
> EventNotifier *notifier;
> - g_autofree struct vfio_irq_info *irq_info = NULL;
> + struct vfio_irq_info irq_info;
> VFIODevice *vdev = &vapdev->vdev;
>
> switch (irq) {
> @@ -96,14 +96,15 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev,
> return false;
> }
>
> - argsz = sizeof(*irq_info);
> - irq_info = g_malloc0(argsz);
> - irq_info->index = irq;
> - irq_info->argsz = argsz;
> + ret = vfio_device_get_irq_info(vdev, irq, &irq_info);
> +
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "vfio: Error getting irq info");
> + return false;
> + }
>
> - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO,
> - irq_info) < 0 || irq_info->count < 1) {
> - error_setg_errno(errp, errno, "vfio: Error getting irq info");
> + if (irq_info.count < 1) {
> + error_setg_errno(errp, EINVAL, "vfio: Error getting irq info, count=0");
I am not sure using error_setg_errno() is interesting in that case. Maybe simply
use error_setg(). Same below.
Thanks,
C.
> return false;
> }
>
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index 98aa0000da..dac8769925 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -376,8 +376,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev,
> Error **errp)
> {
> VFIODevice *vdev = &vcdev->vdev;
> - g_autofree struct vfio_irq_info *irq_info = NULL;
> - size_t argsz;
> + struct vfio_irq_info irq_info;
> + int ret;
> int fd;
> EventNotifier *notifier;
> IOHandler *fd_read;
> @@ -406,13 +406,15 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev,
> return false;
> }
>
> - argsz = sizeof(*irq_info);
> - irq_info = g_malloc0(argsz);
> - irq_info->index = irq;
> - irq_info->argsz = argsz;
> - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO,
> - irq_info) < 0 || irq_info->count < 1) {
> - error_setg_errno(errp, errno, "vfio: Error getting irq info");
> + ret = vfio_device_get_irq_info(vdev, irq, &irq_info);
> +
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "vfio: Error getting irq info");
> + return false;
> + }
> +
> + if (irq_info.count < 1) {
> + error_setg_errno(errp, EINVAL, "vfio: Error getting irq info, count=0");
> return false;
> }
>
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index b9473878fc..2966171118 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -185,6 +185,21 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
> return false;
> }
>
> +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
> + struct vfio_irq_info *info)
> +{
> + int ret;
> +
> + memset(info, 0, sizeof(*info));
> +
> + info->argsz = sizeof(*info);
> + info->index = index;
> +
> + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> struct vfio_region_info **info)
> {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 090b2f2ef0..ac53c43f2b 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1555,8 +1555,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> uint16_t ctrl;
> uint32_t table, pba;
> int ret, fd = vdev->vbasedev.fd;
> - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
> - .index = VFIO_PCI_MSIX_IRQ_INDEX };
> + struct vfio_irq_info irq_info;
> VFIOMSIXInfo *msix;
>
> pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> @@ -1593,7 +1592,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
>
> - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX,
> + &irq_info);
> if (ret < 0) {
> error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
> g_free(msix);
> @@ -2737,7 +2737,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
> {
> VFIODevice *vbasedev = &vdev->vbasedev;
> g_autofree struct vfio_region_info *reg_info = NULL;
> - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
> + struct vfio_irq_info irq_info;
> int i, ret = -1;
>
> /* Sanity check device */
> @@ -2798,12 +2798,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
> }
> }
>
> - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
> -
> - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info);
> if (ret) {
> /* This can fail for an old kernel or legacy PCI dev */
> - trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
> + trace_vfio_populate_device_get_irq_info_failure(strerror(-ret));
> } else if (irq_info.count == 1) {
> vdev->pci_aer = true;
> } else {
> @@ -2912,17 +2910,18 @@ static void vfio_req_notifier_handler(void *opaque)
>
> static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
> {
> - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
> - .index = VFIO_PCI_REQ_IRQ_INDEX };
> + struct vfio_irq_info irq_info;
> Error *err = NULL;
> int32_t fd;
> + int ret;
>
> if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
> return;
> }
>
> - if (ioctl(vdev->vbasedev.fd,
> - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
> + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX,
> + &irq_info);
> + if (ret < 0 || irq_info.count < 1) {
> return;
> }
>
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> index 877d69b7aa..fd176c18a4 100644
> --- a/hw/vfio/platform.c
> +++ b/hw/vfio/platform.c
> @@ -475,10 +475,10 @@ static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp)
> QSIMPLEQ_INIT(&vdev->pending_intp_queue);
>
> for (i = 0; i < vbasedev->num_irqs; i++) {
> - struct vfio_irq_info irq = { .argsz = sizeof(irq) };
> + struct vfio_irq_info irq;
> +
> + ret = vfio_device_get_irq_info(vbasedev, i, &irq);
>
> - irq.index = i;
> - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
> if (ret) {
> error_setg_errno(errp, -ret, "failed to get device irq info");
> goto irq_err;
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 3563a82ede..9522a09c48 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -144,6 +144,9 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
> uint32_t subtype, struct vfio_region_info **info);
> bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type);
> +
> +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
> + struct vfio_irq_info *info);
> #endif
>
> /* Returns 0 on success, or a negative errno. */
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 10/14] vfio: consistently handle return value for helpers
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (8 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 09/14] vfio: add vfio_device_get_irq_info() helper John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-24 15:19 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 11/14] vfio: add vfio_pci_config_space_read/write() John Levon
` (4 subsequent siblings)
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Various bits of code that call vfio device APIs should consistently use
the "return -errno" approach for passing errors back, rather than
presuming errno is (still) set correctly.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/pci.c | 30 +++++++++++++++++-------------
1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ac53c43f2b..ddeee33aa9 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -398,7 +398,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
- return ret;
+ return ret < 0 ? -errno : ret;
}
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
@@ -459,7 +459,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
g_free(irq_set);
- return ret;
+ return ret < 0 ? -errno : ret;
}
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
@@ -581,7 +581,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
ret = vfio_enable_vectors(vdev, true);
if (ret) {
- error_report("vfio: failed to enable vectors, %d", ret);
+ error_report("vfio: failed to enable vectors, %d", -ret);
}
} else {
Error *err = NULL;
@@ -695,7 +695,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
if (vdev->nr_vectors) {
ret = vfio_enable_vectors(vdev, true);
if (ret) {
- error_report("vfio: failed to enable vectors, %d", ret);
+ error_report("vfio: failed to enable vectors, %d", -ret);
}
} else {
/*
@@ -712,7 +712,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
*/
ret = vfio_enable_msix_no_vec(vdev);
if (ret) {
- error_report("vfio: failed to enable MSI-X, %d", ret);
+ error_report("vfio: failed to enable MSI-X, %d", -ret);
}
}
@@ -765,7 +765,8 @@ retry:
ret = vfio_enable_vectors(vdev, false);
if (ret) {
if (ret < 0) {
- error_report("vfio: Error: Failed to setup MSI fds: %m");
+ error_report("vfio: Error: Failed to setup MSI fds: %s",
+ strerror(-ret));
} else {
error_report("vfio: Error: Failed to enable %d "
"MSI vectors, retry with %d", vdev->nr_vectors, ret);
@@ -882,17 +883,21 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
g_autofree struct vfio_region_info *reg_info = NULL;
+ VFIODevice *vbasedev = &vdev->vbasedev;
uint64_t size;
off_t off = 0;
ssize_t bytes;
+ int ret;
+
+ ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
+ &reg_info);
- if (vfio_device_get_region_info(&vdev->vbasedev,
- VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
- error_report("vfio: Error getting ROM info: %m");
+ if (ret != 0) {
+ error_report("vfio: Error getting ROM info: %s", strerror(-ret));
return;
}
- trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
+ trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
(unsigned long)reg_info->offset,
(unsigned long)reg_info->flags);
@@ -901,8 +906,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
if (!vdev->rom_size) {
vdev->rom_read_failed = true;
- error_report("vfio-pci: Cannot read device rom at "
- "%s", vdev->vbasedev.name);
+ error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
error_printf("Device option ROM contents are probably invalid "
"(check dmesg).\nSkip option ROM probe with rombar=0, "
"or load from file with romfile=\n");
@@ -913,7 +917,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
memset(vdev->rom, 0xff, size);
while (size) {
- bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
+ bytes = pread(vbasedev->fd, vdev->rom + off,
size, vdev->rom_offset + off);
if (bytes == 0) {
break;
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 10/14] vfio: consistently handle return value for helpers
2025-04-09 13:48 ` [PATCH 10/14] vfio: consistently handle return value for helpers John Levon
@ 2025-04-24 15:19 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 15:19 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Various bits of code that call vfio device APIs should consistently use
> the "return -errno" approach for passing errors back, rather than
> presuming errno is (still) set correctly.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/pci.c | 30 +++++++++++++++++-------------
> 1 file changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ac53c43f2b..ddeee33aa9 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -398,7 +398,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
>
> ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
>
> - return ret;
> + return ret < 0 ? -errno : ret;
> }
>
> static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
> @@ -459,7 +459,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>
> g_free(irq_set);
>
> - return ret;
> + return ret < 0 ? -errno : ret;
> }
>
> static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
> @@ -581,7 +581,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> ret = vfio_enable_vectors(vdev, true);
> if (ret) {
> - error_report("vfio: failed to enable vectors, %d", ret);
> + error_report("vfio: failed to enable vectors, %d", -ret);
While you are changing these error reports, could you please also print the
error string using strerror() here and below?
Thanks,
C.
> }
> } else {
> Error *err = NULL;
> @@ -695,7 +695,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
> if (vdev->nr_vectors) {
> ret = vfio_enable_vectors(vdev, true);
> if (ret) {
> - error_report("vfio: failed to enable vectors, %d", ret);
> + error_report("vfio: failed to enable vectors, %d", -ret);
> }
> } else {
> /*
> @@ -712,7 +712,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
> */
> ret = vfio_enable_msix_no_vec(vdev);
> if (ret) {
> - error_report("vfio: failed to enable MSI-X, %d", ret);
> + error_report("vfio: failed to enable MSI-X, %d", -ret);
> }
> }
>
> @@ -765,7 +765,8 @@ retry:
> ret = vfio_enable_vectors(vdev, false);
> if (ret) {
> if (ret < 0) {
> - error_report("vfio: Error: Failed to setup MSI fds: %m");
> + error_report("vfio: Error: Failed to setup MSI fds: %s",
> + strerror(-ret));
> } else {
> error_report("vfio: Error: Failed to enable %d "
> "MSI vectors, retry with %d", vdev->nr_vectors, ret);
> @@ -882,17 +883,21 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
> static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> {
> g_autofree struct vfio_region_info *reg_info = NULL;
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint64_t size;
> off_t off = 0;
> ssize_t bytes;
> + int ret;
> +
> + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX,
> + &reg_info);
>
> - if (vfio_device_get_region_info(&vdev->vbasedev,
> - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
> - error_report("vfio: Error getting ROM info: %m");
> + if (ret != 0) {
> + error_report("vfio: Error getting ROM info: %s", strerror(-ret));
> return;
> }
>
> - trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
> + trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size,
> (unsigned long)reg_info->offset,
> (unsigned long)reg_info->flags);
>
> @@ -901,8 +906,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>
> if (!vdev->rom_size) {
> vdev->rom_read_failed = true;
> - error_report("vfio-pci: Cannot read device rom at "
> - "%s", vdev->vbasedev.name);
> + error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name);
> error_printf("Device option ROM contents are probably invalid "
> "(check dmesg).\nSkip option ROM probe with rombar=0, "
> "or load from file with romfile=\n");
> @@ -913,7 +917,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> memset(vdev->rom, 0xff, size);
>
> while (size) {
> - bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
> + bytes = pread(vbasedev->fd, vdev->rom + off,
> size, vdev->rom_offset + off);
> if (bytes == 0) {
> break;
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (9 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 10/14] vfio: consistently handle return value for helpers John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-09 15:51 ` Tomita Moeko
2025-04-24 16:06 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 12/14] vfio: add region info cache John Levon
` (3 subsequent siblings)
14 siblings, 2 replies; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon
Add helpers that access config space and return an -errno style
value.
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
1 file changed, 91 insertions(+), 43 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ddeee33aa9..c3842d2f8d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
}
}
+/* "Raw" read of underlying config space. */
+static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
+ uint32_t size, void *data)
+{
+ ssize_t ret;
+
+ ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
+
+ return ret < 0 ? -errno : (int)ret;
+}
+
+/* "Raw" write of underlying config space. */
+static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
+ uint32_t size, void *data)
+{
+ ssize_t ret;
+
+ ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
+
+ return ret < 0 ? -errno : (int)ret;
+}
+
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
VFIOPCIDevice *vdev = opaque;
@@ -1016,10 +1038,9 @@ static const MemoryRegionOps vfio_rom_ops = {
static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
+ VFIODevice *vbasedev = &vdev->vbasedev;
uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
- off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
char *name;
- int fd = vdev->vbasedev.fd;
if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
/* Since pci handles romfile, just print a message and return */
@@ -1036,11 +1057,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
* Use the same size ROM BAR as the physical device. The contents
* will get filled in later when the guest tries to read it.
*/
- if (pread(fd, &orig, 4, offset) != 4 ||
- pwrite(fd, &size, 4, offset) != 4 ||
- pread(fd, &size, 4, offset) != 4 ||
- pwrite(fd, &orig, 4, offset) != 4) {
- error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
+ if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
+ vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
+ vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
+ vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
+
+ error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
return;
}
@@ -1220,6 +1242,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
{
VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
+ VFIODevice *vbasedev = &vdev->vbasedev;
uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
@@ -1232,12 +1255,13 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
ssize_t ret;
- ret = pread(vdev->vbasedev.fd, &phys_val, len,
- vdev->config_offset + addr);
+ ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
if (ret != len) {
- error_report("%s(%s, 0x%x, 0x%x) failed: %m",
- __func__, vdev->vbasedev.name, addr, len);
- return -errno;
+ const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+ error_report("%s(%s, 0x%x, 0x%x) failed: %s",
+ __func__, vbasedev->name, addr, len, err);
+ return -1;
}
phys_val = le32_to_cpu(phys_val);
}
@@ -1253,15 +1277,19 @@ void vfio_pci_write_config(PCIDevice *pdev,
uint32_t addr, uint32_t val, int len)
{
VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
+ VFIODevice *vbasedev = &vdev->vbasedev;
uint32_t val_le = cpu_to_le32(val);
+ int ret;
trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
/* Write everything to VFIO, let it filter out what we can't write */
- if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
- != len) {
- error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
- __func__, vdev->vbasedev.name, addr, val, len);
+ ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
+ if (ret != len) {
+ const char *err = ret < 0 ? strerror(-ret) : "short write";
+
+ error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
+ __func__, vbasedev->name, addr, val, len, err);
}
/* MSI/MSI-X Enabling/Disabling */
@@ -1349,9 +1377,12 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
int ret, entries;
Error *err = NULL;
- if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
- vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
- error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
+ ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
+ sizeof(ctrl), &ctrl);
+ if (ret != sizeof(ctrl)) {
+ const char *errmsg = ret < 0 ? strerror(-ret) : "short read";
+
+ error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", errmsg);
return false;
}
ctrl = le16_to_cpu(ctrl);
@@ -1558,30 +1589,39 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
uint8_t pos;
uint16_t ctrl;
uint32_t table, pba;
- int ret, fd = vdev->vbasedev.fd;
struct vfio_irq_info irq_info;
VFIOMSIXInfo *msix;
+ int ret;
pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
if (!pos) {
return true;
}
- if (pread(fd, &ctrl, sizeof(ctrl),
- vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
- error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
+ ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
+ sizeof(ctrl), &ctrl);
+ if (ret != sizeof(ctrl)) {
+ const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+ error_setg(errp, "failed to read PCI MSIX FLAGS: %s", err);
return false;
}
- if (pread(fd, &table, sizeof(table),
- vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
- error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
+ ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
+ sizeof(table), &table);
+ if (ret != sizeof(table)) {
+ const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+ error_setg(errp, "failed to read PCI MSIX TABLE: %s", err);
return false;
}
- if (pread(fd, &pba, sizeof(pba),
- vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
- error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
+ ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
+ sizeof(pba), &pba);
+ if (ret != sizeof(pba)) {
+ const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+ error_setg(errp, "failed to read PCI MSIX PBA: %s", err);
return false;
}
@@ -1741,10 +1781,12 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
}
/* Determine what type of BAR this is for registration */
- ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
- vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
+ ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
+ sizeof(pci_bar), &pci_bar);
if (ret != sizeof(pci_bar)) {
- error_report("vfio: Failed to read BAR %d (%m)", nr);
+ const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+ error_report("vfio: Failed to read BAR %d: %s", nr, err);
return;
}
@@ -2448,21 +2490,25 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
+ VFIODevice *vbasedev = &vdev->vbasedev;
Error *err = NULL;
- int nr;
+ int ret, nr;
if (!vfio_intx_enable(vdev, &err)) {
error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
}
for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
- off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
+ off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
uint32_t val = 0;
uint32_t len = sizeof(val);
- if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
- error_report("%s(%s) reset bar %d failed: %m", __func__,
- vdev->vbasedev.name, nr);
+ ret = vfio_pci_config_space_write(vdev, addr, len, &val);
+ if (ret != len) {
+ const char *errmsg = ret < 0 ? strerror(-ret) : "short write";
+
+ error_report("%s(%s) reset bar %d failed: %s", __func__,
+ vbasedev->name, nr, errmsg);
}
}
@@ -3099,6 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
int i, ret;
char uuid[UUID_STR_LEN];
g_autofree char *name = NULL;
+ size_t config_space_size;
if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -3153,13 +3200,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
+ config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
+
/* Get a copy of config space */
- ret = pread(vbasedev->fd, vdev->pdev.config,
- MIN(pci_config_size(&vdev->pdev), vdev->config_size),
- vdev->config_offset);
- if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
- ret = ret < 0 ? -errno : -EFAULT;
- error_setg_errno(errp, -ret, "failed to read device config space");
+ ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
+ vdev->pdev.config);
+ if (ret < (int)config_space_size) {
+ ret = ret < 0 ? -ret : EFAULT;
+ error_setg_errno(errp, ret, "failed to read device config space");
goto error;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 13:48 ` [PATCH 11/14] vfio: add vfio_pci_config_space_read/write() John Levon
@ 2025-04-09 15:51 ` Tomita Moeko
2025-04-09 15:54 ` John Levon
2025-04-24 16:06 ` Cédric Le Goater
1 sibling, 1 reply; 53+ messages in thread
From: Tomita Moeko @ 2025-04-09 15:51 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 21:48, John Levon wrote:
> Add these helpers that access config space and return an -errno style
> return.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 91 insertions(+), 43 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ddeee33aa9..c3842d2f8d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> }
> }
>
> +/* "Raw" read of underlying config space. */
> +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
> + uint32_t size, void *data)
Returning ssize_t here might be better to avoid casting issues,
though we would never read/write anything that exceeds INT32_MAX.
Thanks,
Moeko
> +{
> + ssize_t ret;
> +
> + ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> +
> + return ret < 0 ? -errno : (int)ret;
> +}
> +
> +/* "Raw" write of underlying config space. */
> +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
> + uint32_t size, void *data)
> +{
> + ssize_t ret;
> +
> + ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> +
> + return ret < 0 ? -errno : (int)ret;
> +}
> +
> static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
> {
> VFIOPCIDevice *vdev = opaque;
> @@ -1016,10 +1038,9 @@ static const MemoryRegionOps vfio_rom_ops = {
>
> static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
> {
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
> - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
> char *name;
> - int fd = vdev->vbasedev.fd;
>
> if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
> /* Since pci handles romfile, just print a message and return */
> @@ -1036,11 +1057,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
> * Use the same size ROM BAR as the physical device. The contents
> * will get filled in later when the guest tries to read it.
> */
> - if (pread(fd, &orig, 4, offset) != 4 ||
> - pwrite(fd, &size, 4, offset) != 4 ||
> - pread(fd, &size, 4, offset) != 4 ||
> - pwrite(fd, &orig, 4, offset) != 4) {
> - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
> + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
> + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
> + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
> + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
> +
> + error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
> return;
> }
>
> @@ -1220,6 +1242,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
> uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> {
> VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
>
> memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> @@ -1232,12 +1255,13 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> ssize_t ret;
>
> - ret = pread(vdev->vbasedev.fd, &phys_val, len,
> - vdev->config_offset + addr);
> + ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
> if (ret != len) {
> - error_report("%s(%s, 0x%x, 0x%x) failed: %m",
> - __func__, vdev->vbasedev.name, addr, len);
> - return -errno;
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_report("%s(%s, 0x%x, 0x%x) failed: %s",
> + __func__, vbasedev->name, addr, len, err);
> + return -1;
> }
> phys_val = le32_to_cpu(phys_val);
> }
> @@ -1253,15 +1277,19 @@ void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr, uint32_t val, int len)
> {
> VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t val_le = cpu_to_le32(val);
> + int ret;
>
> trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
>
> /* Write everything to VFIO, let it filter out what we can't write */
> - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
> - != len) {
> - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
> - __func__, vdev->vbasedev.name, addr, val, len);
> + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
> + if (ret != len) {
> + const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
> + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
> + __func__, vbasedev->name, addr, val, len, err);
> }
>
> /* MSI/MSI-X Enabling/Disabling */
> @@ -1349,9 +1377,12 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> int ret, entries;
> Error *err = NULL;
>
> - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
> - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
> + sizeof(ctrl), &ctrl);
> + if (ret != sizeof(ctrl)) {
> + const char *errmsg = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", errmsg);
> return false;
> }
> ctrl = le16_to_cpu(ctrl);
> @@ -1558,30 +1589,39 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> uint8_t pos;
> uint16_t ctrl;
> uint32_t table, pba;
> - int ret, fd = vdev->vbasedev.fd;
> struct vfio_irq_info irq_info;
> VFIOMSIXInfo *msix;
> + int ret;
>
> pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> if (!pos) {
> return true;
> }
>
> - if (pread(fd, &ctrl, sizeof(ctrl),
> - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
> + sizeof(ctrl), &ctrl);
> + if (ret != sizeof(ctrl)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX FLAGS: %s", err);
> return false;
> }
>
> - if (pread(fd, &table, sizeof(table),
> - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
> + sizeof(table), &table);
> + if (ret != sizeof(table)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX TABLE: %s", err);
> return false;
> }
>
> - if (pread(fd, &pba, sizeof(pba),
> - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
> + sizeof(pba), &pba);
> + if (ret != sizeof(pba)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX PBA: %s", err);
> return false;
> }
>
> @@ -1741,10 +1781,12 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
> }
>
> /* Determine what type of BAR this is for registration */
> - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
> - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
> + sizeof(pci_bar), &pci_bar);
> if (ret != sizeof(pci_bar)) {
> - error_report("vfio: Failed to read BAR %d (%m)", nr);
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_report("vfio: Failed to read BAR %d: %s", nr, err);
> return;
> }
>
> @@ -2448,21 +2490,25 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
>
> void vfio_pci_post_reset(VFIOPCIDevice *vdev)
> {
> + VFIODevice *vbasedev = &vdev->vbasedev;
> Error *err = NULL;
> - int nr;
> + int ret, nr;
>
> if (!vfio_intx_enable(vdev, &err)) {
> error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> }
>
> for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
> - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
> + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
> uint32_t val = 0;
> uint32_t len = sizeof(val);
>
> - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
> - error_report("%s(%s) reset bar %d failed: %m", __func__,
> - vdev->vbasedev.name, nr);
> + ret = vfio_pci_config_space_write(vdev, addr, len, &val);
> + if (ret != len) {
> + const char *errmsg = ret < 0 ? strerror(-ret) : "short write";
> +
> + error_report("%s(%s) reset bar %d failed: %s", __func__,
> + vbasedev->name, nr, errmsg);
> }
> }
>
> @@ -3099,6 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> int i, ret;
> char uuid[UUID_STR_LEN];
> g_autofree char *name = NULL;
> + size_t config_space_size;
>
> if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
> if (!(~vdev->host.domain || ~vdev->host.bus ||
> @@ -3153,13 +3200,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> goto error;
> }
>
> + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
> +
> /* Get a copy of config space */
> - ret = pread(vbasedev->fd, vdev->pdev.config,
> - MIN(pci_config_size(&vdev->pdev), vdev->config_size),
> - vdev->config_offset);
> - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
> - ret = ret < 0 ? -errno : -EFAULT;
> - error_setg_errno(errp, -ret, "failed to read device config space");
> + ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
> + vdev->pdev.config);
> + if (ret < (int)config_space_size) {
> + ret = ret < 0 ? -ret : EFAULT;
> + error_setg_errno(errp, ret, "failed to read device config space");
> goto error;
> }
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 15:51 ` Tomita Moeko
@ 2025-04-09 15:54 ` John Levon
2025-04-09 16:30 ` Tomita Moeko
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 15:54 UTC (permalink / raw)
To: Tomita Moeko
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella,
Cédric Le Goater, Peter Xu, Thomas Huth, Matthew Rosato,
David Hildenbrand, Michael S. Tsirkin, Alex Williamson,
qemu-s390x, Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On Wed, Apr 09, 2025 at 11:51:09PM +0800, Tomita Moeko wrote:
> On 4/9/25 21:48, John Levon wrote:
> > Add these helpers that access config space and return an -errno style
> > return.
> >
> > Signed-off-by: John Levon <john.levon@nutanix.com>
> > ---
> > hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
> > 1 file changed, 91 insertions(+), 43 deletions(-)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index ddeee33aa9..c3842d2f8d 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> > }
> > }
> >
> > +/* "Raw" read of underlying config space. */
> > +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
> > + uint32_t size, void *data)
>
> Returning ssize_t here might be better here to avoid casting issues,
> though we would never read/write something exceeds INT32_MAX.
I considered this (and the later helpers in the patch), but most of the existing
code already uses int. Happy to look at fixing the callers too (e.g.
vfio_msi_setup()) if that's everyone's preference.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 15:54 ` John Levon
@ 2025-04-09 16:30 ` Tomita Moeko
2025-04-24 15:59 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: Tomita Moeko @ 2025-04-09 16:30 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella,
Cédric Le Goater, Peter Xu, Thomas Huth, Matthew Rosato,
David Hildenbrand, Michael S. Tsirkin, Alex Williamson,
qemu-s390x, Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 23:54, John Levon wrote:
> On Wed, Apr 09, 2025 at 11:51:09PM +0800, Tomita Moeko wrote:
>
>> On 4/9/25 21:48, John Levon wrote:
>>> Add these helpers that access config space and return an -errno style
>>> return.
>>>
>>> Signed-off-by: John Levon <john.levon@nutanix.com>
>>> ---
>>> hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
>>> 1 file changed, 91 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index ddeee33aa9..c3842d2f8d 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>>> }
>>> }
>>>
>>> +/* "Raw" read of underlying config space. */
>>> +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
>>> + uint32_t size, void *data)
>>
>> Returning ssize_t here might be better here to avoid casting issues,
>> though we would never read/write something exceeds INT32_MAX.
>
> I considered this (and the later helpers in the patch), but most of the existing
> code already uses int. Happy to look at fixing the callers too (e.g.
> vfio_msi_setup()) if that's everyone's preference.
>
> regards
> john
I checked the code: the callers cast the return value of pread/pwrite to int
because the `count` argument they pass (the maximum number of bytes to
read/write) never exceeds INT_MAX. Given that a uint32_t size can exceed
INT_MAX, I believe returning ssize_t here and letting the callers decide
whether to cast is better.
Thanks,
Moeko
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 16:30 ` Tomita Moeko
@ 2025-04-24 15:59 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 15:59 UTC (permalink / raw)
To: Tomita Moeko, John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 18:30, Tomita Moeko wrote:
> On 4/9/25 23:54, John Levon wrote:
>> On Wed, Apr 09, 2025 at 11:51:09PM +0800, Tomita Moeko wrote:
>>
>>> On 4/9/25 21:48, John Levon wrote:
>>>> Add these helpers that access config space and return an -errno style
>>>> return.
>>>>
>>>> Signed-off-by: John Levon <john.levon@nutanix.com>
>>>> ---
>>>> hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
>>>> 1 file changed, 91 insertions(+), 43 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index ddeee33aa9..c3842d2f8d 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>>>> }
>>>> }
>>>>
>>>> +/* "Raw" read of underlying config space. */
>>>> +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
>>>> + uint32_t size, void *data)
>>>
>>> Returning ssize_t here might be better here to avoid casting issues,
>>> though we would never read/write something exceeds INT32_MAX.
>>
>> I considered this (and the later helpers in the patch), but most of the existing
>> code already uses int. Happy to look at fixing the callers too (e.g.
>> vfio_msi_setup()) if that's everyone's preference.
>>
>> regards
>> john
>
> I checked the code, caller casts return of pread/pwrite to int because
> the `count` argument, bytes read/write at most, they passed does not
> exceed int. Given that uint32_t can exceed int, returning ssize_t here
> and let callers to determine cast or not is better I believe.
I have no strong preference. The less change the better I would say.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 11/14] vfio: add vfio_pci_config_space_read/write()
2025-04-09 13:48 ` [PATCH 11/14] vfio: add vfio_pci_config_space_read/write() John Levon
2025-04-09 15:51 ` Tomita Moeko
@ 2025-04-24 16:06 ` Cédric Le Goater
1 sibling, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:06 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Add these helpers that access config space and return an -errno style
> return.
>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/pci.c | 134 ++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 91 insertions(+), 43 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ddeee33aa9..c3842d2f8d 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -964,6 +964,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> }
> }
>
> +/* "Raw" read of underlying config space. */
> +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
> + uint32_t size, void *data)
> +{
> + ssize_t ret;
> +
> + ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> +
> + return ret < 0 ? -errno : (int)ret;
> +}
> +
> +/* "Raw" write of underlying config space. */
> +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
> + uint32_t size, void *data)
> +{
> + ssize_t ret;
> +
> + ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> +
> + return ret < 0 ? -errno : (int)ret;
> +}
> +
> static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
> {
> VFIOPCIDevice *vdev = opaque;
> @@ -1016,10 +1038,9 @@ static const MemoryRegionOps vfio_rom_ops = {
>
> static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
> {
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
> - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
> char *name;
> - int fd = vdev->vbasedev.fd;
>
> if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
> /* Since pci handles romfile, just print a message and return */
> @@ -1036,11 +1057,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
> * Use the same size ROM BAR as the physical device. The contents
> * will get filled in later when the guest tries to read it.
> */
> - if (pread(fd, &orig, 4, offset) != 4 ||
> - pwrite(fd, &size, 4, offset) != 4 ||
> - pread(fd, &size, 4, offset) != 4 ||
> - pwrite(fd, &orig, 4, offset) != 4) {
> - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
> + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 ||
> + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
> + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 ||
> + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) {
> +
> + error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
> return;
> }
>
> @@ -1220,6 +1242,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
> uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> {
> VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
>
> memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> @@ -1232,12 +1255,13 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
> ssize_t ret;
>
> - ret = pread(vdev->vbasedev.fd, &phys_val, len,
> - vdev->config_offset + addr);
> + ret = vfio_pci_config_space_read(vdev, addr, len, &phys_val);
> if (ret != len) {
> - error_report("%s(%s, 0x%x, 0x%x) failed: %m",
> - __func__, vdev->vbasedev.name, addr, len);
> - return -errno;
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
These "short read/write" messages, here and below, are a bit invasive
in the code, but they are worth keeping. I wonder if we could
improve readability with a helper.
> +
> + error_report("%s(%s, 0x%x, 0x%x) failed: %s",
> + __func__, vbasedev->name, addr, len, err);
> + return -1;
-1 (all ones) seems more correct than returning -errno as before.
> }
> phys_val = le32_to_cpu(phys_val);
> }
> @@ -1253,15 +1277,19 @@ void vfio_pci_write_config(PCIDevice *pdev,
> uint32_t addr, uint32_t val, int len)
> {
> VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> + VFIODevice *vbasedev = &vdev->vbasedev;
> uint32_t val_le = cpu_to_le32(val);
> + int ret;
>
> trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
>
> /* Write everything to VFIO, let it filter out what we can't write */
> - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
> - != len) {
> - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
> - __func__, vdev->vbasedev.name, addr, val, len);
> + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le);
> + if (ret != len) {
> + const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
> + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
> + __func__, vbasedev->name, addr, val, len, err);
> }
>
> /* MSI/MSI-X Enabling/Disabling */
> @@ -1349,9 +1377,12 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> int ret, entries;
> Error *err = NULL;
>
> - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
> - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS,
> + sizeof(ctrl), &ctrl);
> + if (ret != sizeof(ctrl)) {
> + const char *errmsg = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", errmsg);
> return false;
> }
> ctrl = le16_to_cpu(ctrl);
> @@ -1558,30 +1589,39 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> uint8_t pos;
> uint16_t ctrl;
> uint32_t table, pba;
> - int ret, fd = vdev->vbasedev.fd;
> struct vfio_irq_info irq_info;
> VFIOMSIXInfo *msix;
> + int ret;
>
> pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> if (!pos) {
> return true;
> }
>
> - if (pread(fd, &ctrl, sizeof(ctrl),
> - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS,
> + sizeof(ctrl), &ctrl);
> + if (ret != sizeof(ctrl)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX FLAGS: %s", err);
> return false;
> }
>
> - if (pread(fd, &table, sizeof(table),
> - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE,
> + sizeof(table), &table);
> + if (ret != sizeof(table)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX TABLE: %s", err);
> return false;
> }
>
> - if (pread(fd, &pba, sizeof(pba),
> - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
> + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA,
> + sizeof(pba), &pba);
> + if (ret != sizeof(pba)) {
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_setg(errp, "failed to read PCI MSIX PBA: %s", err);
> return false;
> }
>
> @@ -1741,10 +1781,12 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
> }
>
> /* Determine what type of BAR this is for registration */
> - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
> - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr),
> + sizeof(pci_bar), &pci_bar);
> if (ret != sizeof(pci_bar)) {
> - error_report("vfio: Failed to read BAR %d (%m)", nr);
> + const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> + error_report("vfio: Failed to read BAR %d: %s", nr, err);
> return;
> }
>
> @@ -2448,21 +2490,25 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
>
> void vfio_pci_post_reset(VFIOPCIDevice *vdev)
> {
> + VFIODevice *vbasedev = &vdev->vbasedev;
> Error *err = NULL;
> - int nr;
> + int ret, nr;
>
> if (!vfio_intx_enable(vdev, &err)) {
> error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> }
>
> for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
> - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
> + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
> uint32_t val = 0;
> uint32_t len = sizeof(val);
>
> - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
> - error_report("%s(%s) reset bar %d failed: %m", __func__,
> - vdev->vbasedev.name, nr);
> + ret = vfio_pci_config_space_write(vdev, addr, len, &val);
> + if (ret != len) {
> + const char *errmsg = ret < 0 ? strerror(-ret) : "short write";
> +
> + error_report("%s(%s) reset bar %d failed: %s", __func__,
> + vbasedev->name, nr, errmsg);
> }
> }
>
> @@ -3099,6 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> int i, ret;
> char uuid[UUID_STR_LEN];
> g_autofree char *name = NULL;
> + size_t config_space_size;
Why not use uint32_t?
Thanks,
C.
>
> if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
> if (!(~vdev->host.domain || ~vdev->host.bus ||
> @@ -3153,13 +3200,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> goto error;
> }
>
> + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size);
> +
> /* Get a copy of config space */
> - ret = pread(vbasedev->fd, vdev->pdev.config,
> - MIN(pci_config_size(&vdev->pdev), vdev->config_size),
> - vdev->config_offset);
> - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
> - ret = ret < 0 ? -errno : -EFAULT;
> - error_setg_errno(errp, -ret, "failed to read device config space");
> + ret = vfio_pci_config_space_read(vdev, 0, config_space_size,
> + vdev->pdev.config);
> + if (ret < (int)config_space_size) {
> + ret = ret < 0 ? -ret : EFAULT;
> + error_setg_errno(errp, ret, "failed to read device config space");
> goto error;
> }
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 12/14] vfio: add region info cache
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (10 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 11/14] vfio: add vfio_pci_config_space_read/write() John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-24 16:08 ` Cédric Le Goater
2025-04-28 15:39 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 13/14] vfio: add device IO ops vector John Levon
` (2 subsequent siblings)
14 siblings, 2 replies; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon, John Johnson,
Elena Ufimtseva, Jagannathan Raman
Instead of requesting region information on demand with
VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
necessary for performance for vfio-user, where this call becomes a
message over the control socket, so is of higher overhead than the
traditional path.
We will also need it to generalize region accesses, as that means we
can't use ->config_offset for configuration space accesses, but must
look up the region offset (if relevant) each time.
Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/ccw.c | 5 -----
hw/vfio/container.c | 10 ++++++++++
hw/vfio/device.c | 31 +++++++++++++++++++++++++++----
hw/vfio/igd.c | 8 ++++----
hw/vfio/pci.c | 6 +++---
hw/vfio/region.c | 2 +-
include/hw/vfio/vfio-device.h | 1 +
7 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index dac8769925..14dee7cd19 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -504,7 +504,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
vcdev->io_region_offset = info->offset;
vcdev->io_region = g_malloc0(info->size);
- g_free(info);
/* check for the optional async command region */
ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
@@ -517,7 +516,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
}
vcdev->async_cmd_region_offset = info->offset;
vcdev->async_cmd_region = g_malloc0(info->size);
- g_free(info);
}
ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
@@ -530,7 +528,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
}
vcdev->schib_region_offset = info->offset;
vcdev->schib_region = g_malloc(info->size);
- g_free(info);
}
ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
@@ -544,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
}
vcdev->crw_region_offset = info->offset;
vcdev->crw_region = g_malloc(info->size);
- g_free(info);
}
return true;
@@ -554,7 +550,6 @@ out_err:
g_free(vcdev->schib_region);
g_free(vcdev->async_cmd_region);
g_free(vcdev->io_region);
- g_free(info);
return false;
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 37b1217fd8..61333d7fc4 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -857,6 +857,16 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
static void vfio_device_put(VFIODevice *vbasedev)
{
+ if (vbasedev->reginfo != NULL) {
+ int i;
+
+ for (i = 0; i < vbasedev->num_regions; i++) {
+ g_free(vbasedev->reginfo[i]);
+ }
+ g_free(vbasedev->reginfo);
+ vbasedev->reginfo = NULL;
+ }
+
if (!vbasedev->group) {
return;
}
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 2966171118..102fa5a9b4 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -205,6 +205,17 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
{
size_t argsz = sizeof(struct vfio_region_info);
+ /* create region info cache */
+ if (vbasedev->reginfo == NULL) {
+ vbasedev->reginfo = g_new0(struct vfio_region_info *,
+ vbasedev->num_regions);
+ }
+ /* check cache */
+ if (vbasedev->reginfo[index] != NULL) {
+ *info = vbasedev->reginfo[index];
+ return 0;
+ }
+
*info = g_malloc0(argsz);
(*info)->index = index;
@@ -224,6 +235,9 @@ retry:
goto retry;
}
+ /* fill cache */
+ vbasedev->reginfo[index] = *info;
+
return 0;
}
@@ -242,7 +256,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
if (!hdr) {
- g_free(*info);
continue;
}
@@ -254,8 +267,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
if (cap_type->type == type && cap_type->subtype == subtype) {
return 0;
}
-
- g_free(*info);
}
*info = NULL;
@@ -264,7 +275,7 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
- g_autofree struct vfio_region_info *info = NULL;
+ struct vfio_region_info *info = NULL;
bool ret = false;
if (!vfio_device_get_region_info(vbasedev, region, &info)) {
@@ -427,6 +438,16 @@ void vfio_device_detach(VFIODevice *vbasedev)
VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
}
+static void vfio_device_get_all_region_info(VFIODevice *vbasedev)
+{
+ struct vfio_region_info *info;
+ int i;
+
+ for (i = 0; i < vbasedev->num_regions; i++) {
+ vfio_device_get_region_info(vbasedev, i, &info);
+ }
+}
+
void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
struct vfio_device_info *info)
{
@@ -439,4 +460,6 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+
+ vfio_device_get_all_region_info(vbasedev);
}
diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
index e1cba16399..d70da1ce38 100644
--- a/hw/vfio/igd.c
+++ b/hw/vfio/igd.c
@@ -198,7 +198,7 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp)
{
- g_autofree struct vfio_region_info *opregion = NULL;
+ struct vfio_region_info *opregion = NULL;
int ret;
/* Hotplugging is not supported for opregion access */
@@ -361,8 +361,8 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp)
{
- g_autofree struct vfio_region_info *host = NULL;
- g_autofree struct vfio_region_info *lpc = NULL;
+ struct vfio_region_info *host = NULL;
+ struct vfio_region_info *lpc = NULL;
PCIDevice *lpc_bridge;
int ret;
@@ -526,7 +526,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
* - OpRegion
* - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host
*/
- g_autofree struct vfio_region_info *rom = NULL;
+ struct vfio_region_info *rom = NULL;
legacy_mode_enabled = true;
info_report("IGD legacy mode enabled, "
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c3842d2f8d..b40d5abdfd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -882,8 +882,8 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
- g_autofree struct vfio_region_info *reg_info = NULL;
VFIODevice *vbasedev = &vdev->vbasedev;
+ struct vfio_region_info *reg_info = NULL;
uint64_t size;
off_t off = 0;
ssize_t bytes;
@@ -2721,7 +2721,7 @@ static VFIODeviceOps vfio_pci_ops = {
bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
- g_autofree struct vfio_region_info *reg_info = NULL;
+ struct vfio_region_info *reg_info = NULL;
int ret;
ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
@@ -2786,7 +2786,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
{
VFIODevice *vbasedev = &vdev->vbasedev;
- g_autofree struct vfio_region_info *reg_info = NULL;
+ struct vfio_region_info *reg_info = NULL;
struct vfio_irq_info irq_info;
int i, ret = -1;
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index 04bf9eb098..ef2630cac3 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -182,7 +182,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
int index, const char *name)
{
- g_autofree struct vfio_region_info *info = NULL;
+ struct vfio_region_info *info = NULL;
int ret;
ret = vfio_device_get_region_info(vbasedev, index, &info);
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 9522a09c48..967b07cd89 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -81,6 +81,7 @@ typedef struct VFIODevice {
IOMMUFDBackend *iommufd;
VFIOIOASHwpt *hwpt;
QLIST_ENTRY(VFIODevice) hwpt_next;
+ struct vfio_region_info **reginfo;
} VFIODevice;
struct VFIODeviceOps {
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-09 13:48 ` [PATCH 12/14] vfio: add region info cache John Levon
@ 2025-04-24 16:08 ` Cédric Le Goater
2025-04-24 16:26 ` John Levon
2025-04-28 15:39 ` Cédric Le Goater
1 sibling, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:08 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/9/25 15:48, John Levon wrote:
> Instead of requesting region information on demand with
> VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
> necessary for performance for vfio-user, where this call becomes a
> message over the control socket, so is of higher overhead than the
> traditional path.
>
> We will also need it to generalize region accesses, as that means we
> can't use ->config_offset for configuration space accesses, but must
> look up the region offset (if relevant) each time.
This change is an optimization for vfio-user. I would prefer to defer it
until after vfio-user is enabled.
Thanks,
C.
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/ccw.c | 5 -----
> hw/vfio/container.c | 10 ++++++++++
> hw/vfio/device.c | 31 +++++++++++++++++++++++++++----
> hw/vfio/igd.c | 8 ++++----
> hw/vfio/pci.c | 6 +++---
> hw/vfio/region.c | 2 +-
> include/hw/vfio/vfio-device.h | 1 +
> 7 files changed, 46 insertions(+), 17 deletions(-)
>
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index dac8769925..14dee7cd19 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -504,7 +504,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
>
> vcdev->io_region_offset = info->offset;
> vcdev->io_region = g_malloc0(info->size);
> - g_free(info);
>
> /* check for the optional async command region */
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -517,7 +516,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->async_cmd_region_offset = info->offset;
> vcdev->async_cmd_region = g_malloc0(info->size);
> - g_free(info);
> }
>
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -530,7 +528,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->schib_region_offset = info->offset;
> vcdev->schib_region = g_malloc(info->size);
> - g_free(info);
> }
>
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -544,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->crw_region_offset = info->offset;
> vcdev->crw_region = g_malloc(info->size);
> - g_free(info);
> }
>
> return true;
> @@ -554,7 +550,6 @@ out_err:
> g_free(vcdev->schib_region);
> g_free(vcdev->async_cmd_region);
> g_free(vcdev->io_region);
> - g_free(info);
> return false;
> }
>
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 37b1217fd8..61333d7fc4 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -857,6 +857,16 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
>
> static void vfio_device_put(VFIODevice *vbasedev)
> {
> + if (vbasedev->reginfo != NULL) {
> + int i;
> +
> + for (i = 0; i < vbasedev->num_regions; i++) {
> + g_free(vbasedev->reginfo[i]);
> + }
> + g_free(vbasedev->reginfo);
> + vbasedev->reginfo = NULL;
> + }
> +
> if (!vbasedev->group) {
> return;
> }
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 2966171118..102fa5a9b4 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -205,6 +205,17 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> {
> size_t argsz = sizeof(struct vfio_region_info);
>
> + /* create region info cache */
> + if (vbasedev->reginfo == NULL) {
> + vbasedev->reginfo = g_new0(struct vfio_region_info *,
> + vbasedev->num_regions);
> + }
> + /* check cache */
> + if (vbasedev->reginfo[index] != NULL) {
> + *info = vbasedev->reginfo[index];
> + return 0;
> + }
> +
> *info = g_malloc0(argsz);
>
> (*info)->index = index;
> @@ -224,6 +235,9 @@ retry:
> goto retry;
> }
>
> + /* fill cache */
> + vbasedev->reginfo[index] = *info;
> +
> return 0;
> }
>
> @@ -242,7 +256,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
>
> hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
> if (!hdr) {
> - g_free(*info);
> continue;
> }
>
> @@ -254,8 +267,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
> if (cap_type->type == type && cap_type->subtype == subtype) {
> return 0;
> }
> -
> - g_free(*info);
> }
>
> *info = NULL;
> @@ -264,7 +275,7 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
>
> bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
> {
> - g_autofree struct vfio_region_info *info = NULL;
> + struct vfio_region_info *info = NULL;
> bool ret = false;
>
> if (!vfio_device_get_region_info(vbasedev, region, &info)) {
> @@ -427,6 +438,16 @@ void vfio_device_detach(VFIODevice *vbasedev)
> VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
> }
>
> +static void vfio_device_get_all_region_info(VFIODevice *vbasedev)
> +{
> + struct vfio_region_info *info;
> + int i;
> +
> + for (i = 0; i < vbasedev->num_regions; i++) {
> + vfio_device_get_region_info(vbasedev, i, &info);
> + }
> +}
> +
> void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> struct vfio_device_info *info)
> {
> @@ -439,4 +460,6 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
>
> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
> +
> + vfio_device_get_all_region_info(vbasedev);
> }
> diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
> index e1cba16399..d70da1ce38 100644
> --- a/hw/vfio/igd.c
> +++ b/hw/vfio/igd.c
> @@ -198,7 +198,7 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
>
> static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp)
> {
> - g_autofree struct vfio_region_info *opregion = NULL;
> + struct vfio_region_info *opregion = NULL;
> int ret;
>
> /* Hotplugging is not supported for opregion access */
> @@ -361,8 +361,8 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
>
> static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp)
> {
> - g_autofree struct vfio_region_info *host = NULL;
> - g_autofree struct vfio_region_info *lpc = NULL;
> + struct vfio_region_info *host = NULL;
> + struct vfio_region_info *lpc = NULL;
> PCIDevice *lpc_bridge;
> int ret;
>
> @@ -526,7 +526,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
> * - OpRegion
> * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host
> */
> - g_autofree struct vfio_region_info *rom = NULL;
> + struct vfio_region_info *rom = NULL;
>
> legacy_mode_enabled = true;
> info_report("IGD legacy mode enabled, "
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index c3842d2f8d..b40d5abdfd 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -882,8 +882,8 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
>
> static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> {
> - g_autofree struct vfio_region_info *reg_info = NULL;
> VFIODevice *vbasedev = &vdev->vbasedev;
> + struct vfio_region_info *reg_info = NULL;
> uint64_t size;
> off_t off = 0;
> ssize_t bytes;
> @@ -2721,7 +2721,7 @@ static VFIODeviceOps vfio_pci_ops = {
> bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> {
> VFIODevice *vbasedev = &vdev->vbasedev;
> - g_autofree struct vfio_region_info *reg_info = NULL;
> + struct vfio_region_info *reg_info = NULL;
> int ret;
>
> ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
> @@ -2786,7 +2786,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
> {
> VFIODevice *vbasedev = &vdev->vbasedev;
> - g_autofree struct vfio_region_info *reg_info = NULL;
> + struct vfio_region_info *reg_info = NULL;
> struct vfio_irq_info irq_info;
> int i, ret = -1;
>
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index 04bf9eb098..ef2630cac3 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -182,7 +182,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
> int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
> int index, const char *name)
> {
> - g_autofree struct vfio_region_info *info = NULL;
> + struct vfio_region_info *info = NULL;
> int ret;
>
> ret = vfio_device_get_region_info(vbasedev, index, &info);
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 9522a09c48..967b07cd89 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -81,6 +81,7 @@ typedef struct VFIODevice {
> IOMMUFDBackend *iommufd;
> VFIOIOASHwpt *hwpt;
> QLIST_ENTRY(VFIODevice) hwpt_next;
> + struct vfio_region_info **reginfo;
> } VFIODevice;
>
> struct VFIODeviceOps {
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-24 16:08 ` Cédric Le Goater
@ 2025-04-24 16:26 ` John Levon
2025-04-28 15:16 ` Cédric Le Goater
0 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-24 16:26 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On Thu, Apr 24, 2025 at 06:08:21PM +0200, Cédric Le Goater wrote:
> On 4/9/25 15:48, John Levon wrote:
> > Instead of requesting region information on demand with
> > VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
> > necessary for performance for vfio-user, where this call becomes a
> > message over the control socket, so is of higher overhead than the
> > traditional path.
> >
> > We will also need it to generalize region accesses, as that means we
> > can't use ->config_offset for configuration space accesses, but must
> > look up the region offset (if relevant) each time.
>
> This change is an optimization for vfio-user. I would prefer to keep it
> for after enabling vfio-user.
It's not vfio-user specific. Just to clarify, you want this code:
static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
uint32_t size, void *data, bool post)
{
struct vfio_region_info *info = vbasedev->regions[index];
int ret;
ret = pwrite(vbasedev->fd, data, size, info->offset + off);
return ret < 0 ? -errno : ret;
}
to become:
static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
uint32_t size, void *data, bool post)
{
struct vfio_region_info info;
ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &info);
struct vfio_region_info *info = vbasedev->regions[index];
int ret;
ret = pwrite(vbasedev->fd, data, size, info->offset + off);
return ret < 0 ? -errno : ret;
}
i.e. every region read/write needs to look up info each time?
If not, what are you suggesting?
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-24 16:26 ` John Levon
@ 2025-04-28 15:16 ` Cédric Le Goater
2025-04-28 15:26 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-28 15:16 UTC (permalink / raw)
To: John Levon
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/24/25 18:26, John Levon wrote:
> On Thu, Apr 24, 2025 at 06:08:21PM +0200, Cédric Le Goater wrote:
>
>> On 4/9/25 15:48, John Levon wrote:
>>> Instead of requesting region information on demand with
>>> VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
>>> necessary for performance for vfio-user, where this call becomes a
>>> message over the control socket, so is of higher overhead than the
>>> traditional path.
>>>
>>> We will also need it to generalize region accesses, as that means we
>>> can't use ->config_offset for configuration space accesses, but must
>>> look up the region offset (if relevant) each time.
>>
>> This change is an optimization for vfio-user. I would prefer to keep it
>> for after enabling vfio-user.
>
> It's not vfio-user specific. Just to clarify, you want this code:
>
> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
> uint32_t size, void *data, bool post)
> {
> struct vfio_region_info *info = vbasedev->regions[index];
> int ret;
>
> ret = pwrite(vbasedev->fd, data, size, info->offset + off);
>
> return ret < 0 ? -errno : ret;
> }
>
> to become:
>
> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
> uint32_t size, void *data, bool post)
> {
> struct vfio_region_info info;
>
> ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &info);
> > struct vfio_region_info *info = vbasedev->regions[index];
> int ret;
>
> ret = pwrite(vbasedev->fd, data, size, info->offset + off);
>
> return ret < 0 ? -errno : ret;
> }
>
>
> i.e. every region read/write needs to look up info each time?
Oh, I didn't see this. So the introduction of VFIODeviceIOOps is not seamless.
> If not, what are you suggesting?
vfio_device_io_region_read and vfio_device_io_region_write should come
separately in patch 13.
Let me comment more on this patch.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-28 15:16 ` Cédric Le Goater
@ 2025-04-28 15:26 ` John Levon
0 siblings, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-28 15:26 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On Mon, Apr 28, 2025 at 05:16:50PM +0200, Cédric Le Goater wrote:
> > i.e. every region read/write needs to look up info each time?
>
> Oh, I didn't see this. So the introduction of VFIODeviceIOOps is not seamless.
Correct.
> > If not, what are you suggesting?
>
> vfio_device_io_region_read and vfio_device_io_region_write should come
> separately in patch 13.
OK, can do.
thanks
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-09 13:48 ` [PATCH 12/14] vfio: add region info cache John Levon
2025-04-24 16:08 ` Cédric Le Goater
@ 2025-04-28 15:39 ` Cédric Le Goater
2025-04-28 16:09 ` John Levon
2025-04-29 22:41 ` John Levon
1 sibling, 2 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-28 15:39 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/9/25 15:48, John Levon wrote:
> Instead of requesting region information on demand with
> VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
> necessary for performance for vfio-user, where this call becomes a
> message over the control socket, so is of higher overhead than the
> traditional path.
>
> We will also need it to generalize region accesses, as that means we
> can't use ->config_offset for configuration space accesses, but must
> look up the region offset (if relevant) each time.
>
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/ccw.c | 5 -----
> hw/vfio/container.c | 10 ++++++++++
> hw/vfio/device.c | 31 +++++++++++++++++++++++++++----
> hw/vfio/igd.c | 8 ++++----
> hw/vfio/pci.c | 6 +++---
> hw/vfio/region.c | 2 +-
> include/hw/vfio/vfio-device.h | 1 +
> 7 files changed, 46 insertions(+), 17 deletions(-)
>
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index dac8769925..14dee7cd19 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -504,7 +504,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
>
> vcdev->io_region_offset = info->offset;
> vcdev->io_region = g_malloc0(info->size);
> - g_free(info);
>
> /* check for the optional async command region */
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -517,7 +516,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->async_cmd_region_offset = info->offset;
> vcdev->async_cmd_region = g_malloc0(info->size);
> - g_free(info);
> }
>
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -530,7 +528,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->schib_region_offset = info->offset;
> vcdev->schib_region = g_malloc(info->size);
> - g_free(info);
> }
>
> ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW,
> @@ -544,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp)
> }
> vcdev->crw_region_offset = info->offset;
> vcdev->crw_region = g_malloc(info->size);
> - g_free(info);
> }
>
> return true;
> @@ -554,7 +550,6 @@ out_err:
> g_free(vcdev->schib_region);
> g_free(vcdev->async_cmd_region);
> g_free(vcdev->io_region);
> - g_free(info);
> return false;
> }
>
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 37b1217fd8..61333d7fc4 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -857,6 +857,16 @@ static bool vfio_device_get(VFIOGroup *group, const char *name,
>
> static void vfio_device_put(VFIODevice *vbasedev)
> {
> + if (vbasedev->reginfo != NULL) {
> + int i;
> +
> + for (i = 0; i < vbasedev->num_regions; i++) {
> + g_free(vbasedev->reginfo[i]);
> + }
> + g_free(vbasedev->reginfo);
> + vbasedev->reginfo = NULL;
> + }
> +
Can we have a vfio_device_unprepare() routine for symmetry with
routine vfio_device_get_all_region_info() ? Naming should be
improved too.
> if (!vbasedev->group) {
> return;
> }
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 2966171118..102fa5a9b4 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -205,6 +205,17 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> {
> size_t argsz = sizeof(struct vfio_region_info);
>
> + /* create region info cache */
> + if (vbasedev->reginfo == NULL) {
> + vbasedev->reginfo = g_new0(struct vfio_region_info *,
> + vbasedev->num_regions);
> + }
I guess we could allocate the ->reginfo[] array sooner in the VFIODevice
object life cycle. Since we lack a realize handler, maybe in
vfio_device_prepare() ?
> + /* check cache */
> + if (vbasedev->reginfo[index] != NULL) {
> + *info = vbasedev->reginfo[index];
> + return 0;
> + }
> +
> *info = g_malloc0(argsz);
>
> (*info)->index = index;
> @@ -224,6 +235,9 @@ retry:
> goto retry;
> }
>
> + /* fill cache */
> + vbasedev->reginfo[index] = *info;
> +
> return 0;
> }
>
> @@ -242,7 +256,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
>
> hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
> if (!hdr) {
> - g_free(*info);
> continue;
> }
>
> @@ -254,8 +267,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
> if (cap_type->type == type && cap_type->subtype == subtype) {
> return 0;
> }
> -
> - g_free(*info);
> }
>
> *info = NULL;
> @@ -264,7 +275,7 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
>
> bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
> {
> - g_autofree struct vfio_region_info *info = NULL;
> + struct vfio_region_info *info = NULL;
> bool ret = false;
>
> if (!vfio_device_get_region_info(vbasedev, region, &info)) {
> @@ -427,6 +438,16 @@ void vfio_device_detach(VFIODevice *vbasedev)
> VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
> }
>
> +static void vfio_device_get_all_region_info(VFIODevice *vbasedev)
> +{
> + struct vfio_region_info *info;
> + int i;
> +
> + for (i = 0; i < vbasedev->num_regions; i++) {
> + vfio_device_get_region_info(vbasedev, i, &info);
> + }
> +}
> +
if the vfio_device_get_all_region_info() routine queries *all* region
infos to fill the ->reginfo[] cache array, why do we also need the
lazy cache filling method in vfio_device_get_region_info() ? This looks
redundant to me. I would rather have vfio_device_get_region_info()
operate on the cache only.
Thanks,
C.
> void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> struct vfio_device_info *info)
> {
> @@ -439,4 +460,6 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
> QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
>
> QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
> +
> + vfio_device_get_all_region_info(vbasedev);
> }
> diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c
> index e1cba16399..d70da1ce38 100644
> --- a/hw/vfio/igd.c
> +++ b/hw/vfio/igd.c
> @@ -198,7 +198,7 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
>
> static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp)
> {
> - g_autofree struct vfio_region_info *opregion = NULL;
> + struct vfio_region_info *opregion = NULL;
> int ret;
>
> /* Hotplugging is not supported for opregion access */
> @@ -361,8 +361,8 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
>
> static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp)
> {
> - g_autofree struct vfio_region_info *host = NULL;
> - g_autofree struct vfio_region_info *lpc = NULL;
> + struct vfio_region_info *host = NULL;
> + struct vfio_region_info *lpc = NULL;
> PCIDevice *lpc_bridge;
> int ret;
>
> @@ -526,7 +526,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp)
> * - OpRegion
> * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host
> */
> - g_autofree struct vfio_region_info *rom = NULL;
> + struct vfio_region_info *rom = NULL;
>
> legacy_mode_enabled = true;
> info_report("IGD legacy mode enabled, "
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index c3842d2f8d..b40d5abdfd 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -882,8 +882,8 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
>
> static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> {
> - g_autofree struct vfio_region_info *reg_info = NULL;
> VFIODevice *vbasedev = &vdev->vbasedev;
> + struct vfio_region_info *reg_info = NULL;
> uint64_t size;
> off_t off = 0;
> ssize_t bytes;
> @@ -2721,7 +2721,7 @@ static VFIODeviceOps vfio_pci_ops = {
> bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> {
> VFIODevice *vbasedev = &vdev->vbasedev;
> - g_autofree struct vfio_region_info *reg_info = NULL;
> + struct vfio_region_info *reg_info = NULL;
> int ret;
>
> ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
> @@ -2786,7 +2786,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
> {
> VFIODevice *vbasedev = &vdev->vbasedev;
> - g_autofree struct vfio_region_info *reg_info = NULL;
> + struct vfio_region_info *reg_info = NULL;
> struct vfio_irq_info irq_info;
> int i, ret = -1;
>
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index 04bf9eb098..ef2630cac3 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -182,7 +182,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
> int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
> int index, const char *name)
> {
> - g_autofree struct vfio_region_info *info = NULL;
> + struct vfio_region_info *info = NULL;
> int ret;
>
> ret = vfio_device_get_region_info(vbasedev, index, &info);
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 9522a09c48..967b07cd89 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -81,6 +81,7 @@ typedef struct VFIODevice {
> IOMMUFDBackend *iommufd;
> VFIOIOASHwpt *hwpt;
> QLIST_ENTRY(VFIODevice) hwpt_next;
> + struct vfio_region_info **reginfo;
> } VFIODevice;
>
> struct VFIODeviceOps {
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-28 15:39 ` Cédric Le Goater
@ 2025-04-28 16:09 ` John Levon
2025-04-29 22:41 ` John Levon
1 sibling, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-28 16:09 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, Elena Ufimtseva, Jagannathan Raman
On Mon, Apr 28, 2025 at 05:39:46PM +0200, Cédric Le Goater wrote:
> On 4/9/25 15:48, John Levon wrote:
> > Instead of requesting region information on demand with
> > VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
> > necessary for performance for vfio-user, where this call becomes a
> > message over the control socket, so is of higher overhead than the
> > traditional path.
>
> > +static void vfio_device_get_all_region_info(VFIODevice *vbasedev)
> > +{
> > + struct vfio_region_info *info;
> > + int i;
> > +
> > + for (i = 0; i < vbasedev->num_regions; i++) {
> > + vfio_device_get_region_info(vbasedev, i, &info);
> > + }
> > +}
> > +
>
> if the vfio_device_get_all_region_info() routine queries *all* region
> infos to fill the ->reginfo[] cache array, why do we also need the
> lazy cache filling method in vfio_device_get_region_info() ? This looks
> redundant to me. I would rather have vfio_device_get_region_info()
> operate on the cache only.
I think we briefly talked about this last time. I don't know why the cache fill
code is there; I can drop it.
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 12/14] vfio: add region info cache
2025-04-28 15:39 ` Cédric Le Goater
2025-04-28 16:09 ` John Levon
@ 2025-04-29 22:41 ` John Levon
1 sibling, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-29 22:41 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On Mon, Apr 28, 2025 at 05:39:46PM +0200, Cédric Le Goater wrote:
> > +static void vfio_device_get_all_region_info(VFIODevice *vbasedev)
> > +{
> > + struct vfio_region_info *info;
> > + int i;
> > +
> > + for (i = 0; i < vbasedev->num_regions; i++) {
> > + vfio_device_get_region_info(vbasedev, i, &info);
>
> if the vfio_device_get_all_region_info() routine queries *all* region
> infos to fill the ->reginfo[] cache array, why do we also need the
> lazy cache filling method in vfio_device_get_region_info() ? This looks
> redundant to me. I would rather have vfio_device_get_region_info()
> operate on the cache only.
I realised I'm not confident about doing this: in theory, a vfio device region
could later become valid based on some change in operation (and hence get region
info would then subsequently work post setup). Instead, I'm going to drop the
"get all" and operate only in caching mode, does that sound OK?
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 13/14] vfio: add device IO ops vector
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (11 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 12/14] vfio: add region info cache John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-24 16:18 ` Cédric Le Goater
2025-04-09 13:48 ` [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations John Levon
2025-04-25 7:59 ` [PATCH 00/14] vfio: preparation for vfio-user Cédric Le Goater
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon, John Johnson,
Elena Ufimtseva, Jagannathan Raman
For vfio-user, device operations such as IRQ handling and region
read/writes are implemented in userspace over the control socket, not
ioctl() or read()/write() to the vfio kernel driver; add an ops vector
to generalize this, and implement vfio_device_io_ops_ioctl for
interacting with the kernel vfio driver.
Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/ap.c | 2 +-
hw/vfio/ccw.c | 2 +-
hw/vfio/container-base.c | 6 +-
hw/vfio/device.c | 102 ++++++++++++++++++++++++++++++----
hw/vfio/listener.c | 13 +++--
hw/vfio/pci.c | 40 +++++++------
hw/vfio/platform.c | 2 +-
hw/vfio/region.c | 17 ++++--
include/hw/vfio/vfio-device.h | 24 +++++++-
9 files changed, 155 insertions(+), 53 deletions(-)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index f311bca5b6..b6233b2107 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj)
* handle ram_block_discard_disable().
*/
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
- DEVICE(vapdev), true);
+ &vfio_device_io_ops_ioctl, DEVICE(vapdev), true);
/* AP device is mdev type device */
vbasedev->mdev = true;
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 14dee7cd19..aee52b5a8d 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -676,7 +676,7 @@ static void vfio_ccw_instance_init(Object *obj)
* ram_block_discard_disable().
*/
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
- DEVICE(vcdev), true);
+ &vfio_device_io_ops_ioctl, DEVICE(vcdev), true);
}
#ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 613fe1a00d..16fe5f79d2 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
feature->flags = VFIO_DEVICE_FEATURE_GET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
- if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
- return -errno;
- }
-
- return 0;
+ return vbasedev->io_ops->device_feature(vbasedev, feature);
}
static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 102fa5a9b4..545d9f1faf 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
.count = 0,
};
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+ vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
@@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
.count = 1,
};
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+ vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
@@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
.count = 1,
};
- ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+ vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
static inline const char *action_to_str(int action)
@@ -155,6 +155,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
int argsz;
const char *name;
int32_t *pfd;
+ int ret;
argsz = sizeof(*irq_set) + sizeof(*pfd);
@@ -167,7 +168,9 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
pfd = (int32_t *)&irq_set->data;
*pfd = fd;
- if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
+ ret = vbasedev->io_ops->set_irqs(vbasedev, irq_set);
+
+ if (!ret) {
return true;
}
@@ -188,22 +191,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
struct vfio_irq_info *info)
{
- int ret;
-
memset(info, 0, sizeof(*info));
info->argsz = sizeof(*info);
info->index = index;
- ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
-
- return ret < 0 ? -errno : ret;
+ return vbasedev->io_ops->get_irq_info(vbasedev, info);
}
int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
struct vfio_region_info **info)
{
size_t argsz = sizeof(struct vfio_region_info);
+ int ret;
/* create region info cache */
if (vbasedev->reginfo == NULL) {
@@ -222,10 +222,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
retry:
(*info)->argsz = argsz;
- if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
+ ret = vbasedev->io_ops->get_region_info(vbasedev, *info);
+ if (ret != 0) {
g_free(*info);
*info = NULL;
- return -errno;
+ return ret;
}
if ((*info)->argsz > argsz) {
@@ -332,10 +333,12 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
}
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
- DeviceState *dev, bool ram_discard)
+ VFIODeviceIOOps *io_ops, DeviceState *dev,
+ bool ram_discard)
{
vbasedev->type = type;
vbasedev->ops = ops;
+ vbasedev->io_ops = io_ops;
vbasedev->dev = dev;
vbasedev->fd = -1;
@@ -463,3 +466,78 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
vfio_device_get_all_region_info(vbasedev);
}
+
+/*
+ * Traditional ioctl() based io
+ */
+
+static int vfio_device_io_device_feature(VFIODevice *vbasedev,
+ struct vfio_device_feature *feature)
+{
+ int ret;
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+
+ return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
+ struct vfio_region_info *info)
+{
+ int ret;
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+
+ return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
+ struct vfio_irq_info *info)
+{
+ int ret;
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
+
+ return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
+ struct vfio_irq_set *irqs)
+{
+ int ret;
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
+
+ return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
+ off_t off, uint32_t size, void *data)
+{
+ struct vfio_region_info *info = vbasedev->reginfo[index];
+ int ret;
+
+ ret = pread(vbasedev->fd, data, size, info->offset + off);
+
+ return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
+ off_t off, uint32_t size, void *data)
+{
+ struct vfio_region_info *info = vbasedev->reginfo[index];
+ int ret;
+
+ ret = pwrite(vbasedev->fd, data, size, info->offset + off);
+
+ return ret < 0 ? -errno : ret;
+}
+
+VFIODeviceIOOps vfio_device_io_ops_ioctl = {
+ .device_feature = vfio_device_io_device_feature,
+ .get_region_info = vfio_device_io_get_region_info,
+ .get_irq_info = vfio_device_io_get_irq_info,
+ .set_irqs = vfio_device_io_set_irqs,
+ .region_read = vfio_device_io_region_read,
+ .region_write = vfio_device_io_region_write,
+};
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index bcf2b98e79..7ea9e0dfb7 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -821,13 +821,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+ int ret;
+
if (!vbasedev->dirty_tracking) {
continue;
}
- if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ ret = vbasedev->io_ops->device_feature(vbasedev, feature);
+
+ if (ret != 0) {
warn_report("%s: Failed to stop DMA logging, err %d (%s)",
- vbasedev->name, -errno, strerror(errno));
+ vbasedev->name, -ret, strerror(-ret));
}
vbasedev->dirty_tracking = false;
}
@@ -928,10 +932,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
continue;
}
- ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+ ret = vbasedev->io_ops->device_feature(vbasedev, feature);
if (ret) {
- ret = -errno;
- error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
+ error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
vbasedev->name);
goto out;
}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b40d5abdfd..ff2b15ff02 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque)
static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
{
g_autofree struct vfio_irq_set *irq_set = NULL;
- int ret = 0, argsz;
+ int argsz;
int32_t *fd;
argsz = sizeof(*irq_set) + sizeof(*fd);
@@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
fd = (int32_t *)&irq_set->data;
*fd = -1;
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- return ret < 0 ? -errno : ret;
+ return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
}
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
@@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
fds[i] = fd;
}
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
g_free(irq_set);
- return ret < 0 ? -errno : ret;
+ return ret;
}
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
@@ -917,18 +915,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
memset(vdev->rom, 0xff, size);
while (size) {
- bytes = pread(vbasedev->fd, vdev->rom + off,
- size, vdev->rom_offset + off);
+ bytes = vbasedev->io_ops->region_read(vbasedev,
+ VFIO_PCI_ROM_REGION_INDEX,
+ off, size, vdev->rom + off);
+
if (bytes == 0) {
break;
} else if (bytes > 0) {
off += bytes;
size -= bytes;
} else {
- if (errno == EINTR || errno == EAGAIN) {
+ if (bytes == -EINTR || bytes == -EAGAIN) {
continue;
}
- error_report("vfio: Error reading device ROM: %m");
+ error_report("vfio: Error reading device ROM: %s",
+ strerror(-bytes));
+
break;
}
}
@@ -968,22 +970,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
uint32_t size, void *data)
{
- ssize_t ret;
-
- ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
-
- return ret < 0 ? -errno : (int)ret;
+ return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
+ VFIO_PCI_CONFIG_REGION_INDEX,
+ offset, size, data);
}
/* "Raw" write of underlying config space. */
static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
uint32_t size, void *data)
{
- ssize_t ret;
-
- ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
-
- return ret < 0 ? -errno : (int)ret;
+ return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
+ VFIO_PCI_CONFIG_REGION_INDEX,
+ offset, size, data);
}
static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
@@ -3405,7 +3403,7 @@ static void vfio_instance_init(Object *obj)
vdev->host.function = ~0U;
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
- DEVICE(vdev), false);
+ &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
vdev->nv_gpudirect_clique = 0xFF;
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index fd176c18a4..28eedfa571 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -650,7 +650,7 @@ static void vfio_platform_instance_init(Object *obj)
VFIODevice *vbasedev = &vdev->vbasedev;
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
- DEVICE(vdev), false);
+ &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
}
#ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index ef2630cac3..35fb81c04a 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
uint32_t dword;
uint64_t qword;
} buf;
+ int ret;
switch (size) {
case 1:
@@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr,
break;
}
- if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+ ret = vbasedev->io_ops->region_write(vbasedev, region->nr,
+ addr, size, &buf);
+ if (ret != size) {
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
- ",%d) failed: %m",
+ ",%d) failed: %s",
__func__, vbasedev->name, region->nr,
- addr, data, size);
+ addr, data, size, ret < 0 ? strerror(ret) : "short write");
}
trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
@@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque,
uint64_t qword;
} buf;
uint64_t data = 0;
+ int ret;
- if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
- error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+ ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf);
+ if (ret != size) {
+ error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
__func__, vbasedev->name, region->nr,
- addr, size);
+ addr, size, ret < 0 ? strerror(ret) : "short read");
return (uint64_t)-1;
}
switch (size) {
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 967b07cd89..cb2f581826 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -41,6 +41,7 @@ enum {
};
typedef struct VFIODeviceOps VFIODeviceOps;
+typedef struct VFIODeviceIOOps VFIODeviceIOOps;
typedef struct VFIOMigration VFIOMigration;
typedef struct IOMMUFDBackend IOMMUFDBackend;
@@ -66,6 +67,7 @@ typedef struct VFIODevice {
OnOffAuto migration_multifd_transfer;
bool migration_events;
VFIODeviceOps *ops;
+ VFIODeviceIOOps *io_ops;
unsigned int num_irqs;
unsigned int num_regions;
unsigned int flags;
@@ -140,6 +142,25 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIODeviceList vfio_device_list;
#ifdef CONFIG_LINUX
+/*
+ * How devices communicate with the server. The default option is through
+ * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote
+ * process.
+ */
+struct VFIODeviceIOOps {
+ int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *);
+ int (*get_region_info)(VFIODevice *vdev,
+ struct vfio_region_info *info);
+ int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
+ int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
+ int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+ void *data);
+ int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+ void *data);
+};
+
+extern VFIODeviceIOOps vfio_device_io_ops_ioctl;
+
int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
struct vfio_region_info **info);
int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
@@ -154,6 +175,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
- DeviceState *dev, bool ram_discard);
+ VFIODeviceIOOps *io_ops, DeviceState *dev,
+ bool ram_discard);
int vfio_device_get_aw_bits(VFIODevice *vdev);
#endif /* HW_VFIO_VFIO_COMMON_H */
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 13/14] vfio: add device IO ops vector
2025-04-09 13:48 ` [PATCH 13/14] vfio: add device IO ops vector John Levon
@ 2025-04-24 16:18 ` Cédric Le Goater
0 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:18 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Elena Ufimtseva,
Jagannathan Raman
On 4/9/25 15:48, John Levon wrote:
> For vfio-user, device operations such as IRQ handling and region
> read/writes are implemented in userspace over the control socket, not
> ioctl() or read()/write() to the vfio kernel driver; add an ops vector
> to generalize this, and implement vfio_device_io_ops_ioctl for
> interacting with the kernel vfio driver.
>
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/ap.c | 2 +-
> hw/vfio/ccw.c | 2 +-
> hw/vfio/container-base.c | 6 +-
> hw/vfio/device.c | 102 ++++++++++++++++++++++++++++++----
> hw/vfio/listener.c | 13 +++--
> hw/vfio/pci.c | 40 +++++++------
> hw/vfio/platform.c | 2 +-
> hw/vfio/region.c | 17 ++++--
> include/hw/vfio/vfio-device.h | 24 +++++++-
> 9 files changed, 155 insertions(+), 53 deletions(-)
>
> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
> index f311bca5b6..b6233b2107 100644
> --- a/hw/vfio/ap.c
> +++ b/hw/vfio/ap.c
> @@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj)
> * handle ram_block_discard_disable().
> */
> vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
> - DEVICE(vapdev), true);
> + &vfio_device_io_ops_ioctl, DEVICE(vapdev), true);
We only have one io_ops implementation currently. Please drop all
the vfio_device_init() changes and keep vfio_device_io_ops_ioctl
static.
>
> /* AP device is mdev type device */
> vbasedev->mdev = true;
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index 14dee7cd19..aee52b5a8d 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -676,7 +676,7 @@ static void vfio_ccw_instance_init(Object *obj)
> * ram_block_discard_disable().
> */
> vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
> - DEVICE(vcdev), true);
> + &vfio_device_io_ops_ioctl, DEVICE(vcdev), true);
> }
>
> #ifdef CONFIG_IOMMUFD
> diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
> index 613fe1a00d..16fe5f79d2 100644
> --- a/hw/vfio/container-base.c
> +++ b/hw/vfio/container-base.c
> @@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
> feature->flags = VFIO_DEVICE_FEATURE_GET |
> VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
>
> - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
> - return -errno;
> - }
> -
> - return 0;
> + return vbasedev->io_ops->device_feature(vbasedev, feature);
> }
>
> static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 102fa5a9b4..545d9f1faf 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
> .count = 0,
> };
>
> - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> + vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> }
>
> void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
> @@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
> .count = 1,
> };
>
> - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> + vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> }
>
> void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
> @@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
> .count = 1,
> };
>
> - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> + vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
> }
>
> static inline const char *action_to_str(int action)
> @@ -155,6 +155,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
> int argsz;
> const char *name;
> int32_t *pfd;
> + int ret;
>
> argsz = sizeof(*irq_set) + sizeof(*pfd);
>
> @@ -167,7 +168,9 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
> pfd = (int32_t *)&irq_set->data;
> *pfd = fd;
>
> - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
> + ret = vbasedev->io_ops->set_irqs(vbasedev, irq_set);
> +
> + if (!ret) {
> return true;
> }
>
> @@ -188,22 +191,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
> int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
> struct vfio_irq_info *info)
> {
> - int ret;
> -
> memset(info, 0, sizeof(*info));
>
> info->argsz = sizeof(*info);
> info->index = index;
>
> - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> -
> - return ret < 0 ? -errno : ret;
> + return vbasedev->io_ops->get_irq_info(vbasedev, info);
> }
>
> int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> struct vfio_region_info **info)
> {
> size_t argsz = sizeof(struct vfio_region_info);
> + int ret;
>
> /* create region info cache */
> if (vbasedev->reginfo == NULL) {
> @@ -222,10 +222,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> retry:
> (*info)->argsz = argsz;
>
> - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
> + ret = vbasedev->io_ops->get_region_info(vbasedev, *info);
> + if (ret != 0) {
> g_free(*info);
> *info = NULL;
> - return -errno;
> + return ret;
> }
>
> if ((*info)->argsz > argsz) {
> @@ -332,10 +333,12 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
> }
>
> void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
> - DeviceState *dev, bool ram_discard)
> + VFIODeviceIOOps *io_ops, DeviceState *dev,
> + bool ram_discard)
> {
> vbasedev->type = type;
> vbasedev->ops = ops;
> + vbasedev->io_ops = io_ops;
> vbasedev->dev = dev;
> vbasedev->fd = -1;
>
> @@ -463,3 +466,78 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
>
> vfio_device_get_all_region_info(vbasedev);
> }
> +
> +/*
> + * Traditional ioctl() based io
> + */
> +
> +static int vfio_device_io_device_feature(VFIODevice *vbasedev,
> + struct vfio_device_feature *feature)
> +{
> + int ret;
> +
> + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
> + struct vfio_region_info *info)
> +{
> + int ret;
> +
> + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
> + struct vfio_irq_info *info)
> +{
> + int ret;
> +
> + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
> + struct vfio_irq_set *irqs)
> +{
> + int ret;
> +
> + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
> + off_t off, uint32_t size, void *data)
> +{
> + struct vfio_region_info *info = vbasedev->reginfo[index];
> + int ret;
> +
> + ret = pread(vbasedev->fd, data, size, info->offset + off);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
> + off_t off, uint32_t size, void *data)
> +{
> + struct vfio_region_info *info = vbasedev->reginfo[index];
> + int ret;
> +
> + ret = pwrite(vbasedev->fd, data, size, info->offset + off);
> +
> + return ret < 0 ? -errno : ret;
> +}
> +
> +VFIODeviceIOOps vfio_device_io_ops_ioctl = {
> + .device_feature = vfio_device_io_device_feature,
> + .get_region_info = vfio_device_io_get_region_info,
> + .get_irq_info = vfio_device_io_get_irq_info,
> + .set_irqs = vfio_device_io_set_irqs,
> + .region_read = vfio_device_io_region_read,
> + .region_write = vfio_device_io_region_write,
> +};
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index bcf2b98e79..7ea9e0dfb7 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -821,13 +821,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
> VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
>
> QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
> + int ret;
> +
> if (!vbasedev->dirty_tracking) {
> continue;
> }
>
> - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
> + ret = vbasedev->io_ops->device_feature(vbasedev, feature);
> +
> + if (ret != 0) {
> warn_report("%s: Failed to stop DMA logging, err %d (%s)",
> - vbasedev->name, -errno, strerror(errno));
> + vbasedev->name, -ret, strerror(-ret));
> }
> vbasedev->dirty_tracking = false;
> }
> @@ -928,10 +932,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
> continue;
> }
>
> - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> + ret = vbasedev->io_ops->device_feature(vbasedev, feature);
> if (ret) {
> - ret = -errno;
> - error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
> + error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
> vbasedev->name);
> goto out;
> }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index b40d5abdfd..ff2b15ff02 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque)
> static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
> {
> g_autofree struct vfio_irq_set *irq_set = NULL;
> - int ret = 0, argsz;
> + int argsz;
> int32_t *fd;
>
> argsz = sizeof(*irq_set) + sizeof(*fd);
> @@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
> fd = (int32_t *)&irq_set->data;
> *fd = -1;
>
> - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> - return ret < 0 ? -errno : ret;
> + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
> }
>
> static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
> @@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
> fds[i] = fd;
> }
>
> - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
>
> g_free(irq_set);
>
> - return ret < 0 ? -errno : ret;
> + return ret;
> }
>
> static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
> @@ -917,18 +915,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> memset(vdev->rom, 0xff, size);
>
> while (size) {
> - bytes = pread(vbasedev->fd, vdev->rom + off,
> - size, vdev->rom_offset + off);
> + bytes = vbasedev->io_ops->region_read(vbasedev,
> + VFIO_PCI_ROM_REGION_INDEX,
> + off, size, vdev->rom + off);
> +
> if (bytes == 0) {
> break;
> } else if (bytes > 0) {
> off += bytes;
> size -= bytes;
> } else {
> - if (errno == EINTR || errno == EAGAIN) {
> + if (bytes == -EINTR || bytes == -EAGAIN) {
> continue;
> }
> - error_report("vfio: Error reading device ROM: %m");
> + error_report("vfio: Error reading device ROM: %s",
> + strerror(-bytes));
> +
> break;
> }
> }
> @@ -968,22 +970,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
> static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
> uint32_t size, void *data)
> {
> - ssize_t ret;
> -
> - ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> -
> - return ret < 0 ? -errno : (int)ret;
> + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
> + VFIO_PCI_CONFIG_REGION_INDEX,
> + offset, size, data);
> }
>
> /* "Raw" write of underlying config space. */
> static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
> uint32_t size, void *data)
> {
> - ssize_t ret;
> -
> - ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> -
> - return ret < 0 ? -errno : (int)ret;
> + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
> + VFIO_PCI_CONFIG_REGION_INDEX,
> + offset, size, data);
> }
>
> static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
> @@ -3405,7 +3403,7 @@ static void vfio_instance_init(Object *obj)
> vdev->host.function = ~0U;
>
> vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
> - DEVICE(vdev), false);
> + &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
>
> vdev->nv_gpudirect_clique = 0xFF;
>
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> index fd176c18a4..28eedfa571 100644
> --- a/hw/vfio/platform.c
> +++ b/hw/vfio/platform.c
> @@ -650,7 +650,7 @@ static void vfio_platform_instance_init(Object *obj)
> VFIODevice *vbasedev = &vdev->vbasedev;
>
> vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
> - DEVICE(vdev), false);
> + &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
> }
>
> #ifdef CONFIG_IOMMUFD
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index ef2630cac3..35fb81c04a 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
> uint32_t dword;
> uint64_t qword;
> } buf;
> + int ret;
>
> switch (size) {
> case 1:
> @@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr,
> break;
> }
>
> - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> + ret = vbasedev->io_ops->region_write(vbasedev, region->nr,
> + addr, size, &buf);
> + if (ret != size) {
> error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> - ",%d) failed: %m",
> + ",%d) failed: %s",
> __func__, vbasedev->name, region->nr,
> - addr, data, size);
> + addr, data, size, ret < 0 ? strerror(ret) : "short write");
> }
>
> trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
> @@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque,
> uint64_t qword;
> } buf;
> uint64_t data = 0;
> + int ret;
>
> - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> + ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf);
> + if (ret != size) {
> + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
> __func__, vbasedev->name, region->nr,
> - addr, size);
> + addr, size, ret < 0 ? strerror(ret) : "short read");
> return (uint64_t)-1;
> }
> switch (size) {
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 967b07cd89..cb2f581826 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -41,6 +41,7 @@ enum {
> };
>
> typedef struct VFIODeviceOps VFIODeviceOps;
> +typedef struct VFIODeviceIOOps VFIODeviceIOOps;
> typedef struct VFIOMigration VFIOMigration;
>
> typedef struct IOMMUFDBackend IOMMUFDBackend;
> @@ -66,6 +67,7 @@ typedef struct VFIODevice {
> OnOffAuto migration_multifd_transfer;
> bool migration_events;
> VFIODeviceOps *ops;
> + VFIODeviceIOOps *io_ops;
> unsigned int num_irqs;
> unsigned int num_regions;
> unsigned int flags;
> @@ -140,6 +142,25 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
> extern VFIODeviceList vfio_device_list;
>
> #ifdef CONFIG_LINUX
> +/*
> + * How devices communicate with the server. The default option is through
> + * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote
> + * process.
> + */
> +struct VFIODeviceIOOps {
> + int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *);
> + int (*get_region_info)(VFIODevice *vdev,
> + struct vfio_region_info *info);
> + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
> + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
> + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> + void *data);
> + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> + void *data);
> +};
Could you please add documentation for this new struct?
Thanks,
C.
> +extern VFIODeviceIOOps vfio_device_io_ops_ioctl;
> +
> int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> struct vfio_region_info **info);
> int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
> @@ -154,6 +175,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
> bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
> void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
> void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
> - DeviceState *dev, bool ram_discard);
> + VFIODeviceIOOps *io_ops, DeviceState *dev,
> + bool ram_discard);
> int vfio_device_get_aw_bits(VFIODevice *vdev);
> #endif /* HW_VFIO_VFIO_COMMON_H */
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (12 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 13/14] vfio: add device IO ops vector John Levon
@ 2025-04-09 13:48 ` John Levon
2025-04-24 16:32 ` Cédric Le Goater
2025-04-25 7:59 ` [PATCH 00/14] vfio: preparation for vfio-user Cédric Le Goater
14 siblings, 1 reply; 53+ messages in thread
From: John Levon @ 2025-04-09 13:48 UTC (permalink / raw)
To: qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Cédric Le Goater, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Levon, John Johnson,
Jagannathan Raman, Elena Ufimtseva
Pass through the MemoryRegion to DMA operation handlers of vfio
containers. The vfio-user container will need this later.
Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
hw/vfio/container-base.c | 4 ++--
hw/vfio/container.c | 3 ++-
hw/vfio/iommufd.c | 3 ++-
hw/vfio/listener.c | 18 +++++++++++-------
hw/virtio/vhost-vdpa.c | 2 +-
include/exec/memory.h | 4 +++-
include/hw/vfio/vfio-container-base.h | 4 ++--
system/memory.c | 7 ++++++-
8 files changed, 29 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 16fe5f79d2..55c977ec33 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space,
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- void *vaddr, bool readonly)
+ void *vaddr, bool readonly, MemoryRegion *mrp)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
g_assert(vioc->dma_map);
- return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
+ return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mrp);
}
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 61333d7fc4..587af60e57 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -215,7 +215,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
}
static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly)
+ ram_addr_t size, void *vaddr, bool readonly,
+ MemoryRegion *mrp)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 22e5b16967..4fd3c0d9f3 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -34,7 +34,8 @@
TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly)
+ ram_addr_t size, void *vaddr, bool readonly,
+ MemoryRegion *mrp)
{
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 7ea9e0dfb7..cb06a63a0c 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -94,12 +94,12 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
/* Called with rcu_read_lock held. */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
ram_addr_t *ram_addr, bool *read_only,
- Error **errp)
+ MemoryRegion **mrp, Error **errp)
{
bool ret, mr_has_discard_manager;
ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
- &mr_has_discard_manager, errp);
+ &mr_has_discard_manager, mrp, errp);
if (ret && mr_has_discard_manager) {
/*
* Malicious VMs might trigger discarding of IOMMU-mapped memory. The
@@ -127,6 +127,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
VFIOContainerBase *bcontainer = giommu->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
+ MemoryRegion *mrp;
void *vaddr;
int ret;
Error *local_err = NULL;
@@ -151,7 +152,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
bool read_only;
- if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
+ if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mrp,
+ &local_err)) {
error_report_err(local_err);
goto out;
}
@@ -164,7 +166,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
*/
ret = vfio_container_dma_map(bcontainer, iova,
iotlb->addr_mask + 1, vaddr,
- read_only);
+ read_only, mrp);
if (ret) {
error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -234,7 +236,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
vaddr = memory_region_get_ram_ptr(section->mr) + start;
ret = vfio_container_dma_map(bcontainer, iova, next - start,
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
if (ret) {
/* Rollback */
vfio_ram_discard_notify_discard(rdl, section);
@@ -558,7 +560,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
if (ret) {
error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -1022,7 +1024,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
}
rcu_read_lock();
- if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
+ if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL,
+ &local_err)) {
+ error_report_err(local_err);
goto out_unlock;
}
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7efbde3d4c..eb02b081d4 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -228,7 +228,7 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
bool read_only;
- if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
+ if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, NULL,
&local_err)) {
error_report_err(local_err);
return;
diff --git a/include/exec/memory.h b/include/exec/memory.h
index d09af58c97..f79ff332b5 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -747,13 +747,15 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
* @read_only: indicates if writes are allowed
* @mr_has_discard_manager: indicates memory is controlled by a
* RamDiscardManager
+ * @mrp: if non-NULL, fill in with MemoryRegion
* @errp: pointer to Error*, to store an error if it happens.
*
* Return: true on success, else false setting @errp with error.
*/
bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
ram_addr_t *ram_addr, bool *read_only,
- bool *mr_has_discard_manager, Error **errp);
+ bool *mr_has_discard_manager, MemoryRegion **mrp,
+ Error **errp);
typedef struct CoalescedMemoryRange CoalescedMemoryRange;
typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 6eaf2b2430..47ce016d8e 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -78,7 +78,7 @@ void vfio_address_space_insert(VFIOAddressSpace *space,
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- void *vaddr, bool readonly);
+ void *vaddr, bool readonly, MemoryRegion *mrp);
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb, int flags);
@@ -124,7 +124,7 @@ struct VFIOIOMMUClass {
void (*listener_commit)(VFIOContainerBase *bcontainer);
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
- void *vaddr, bool readonly);
+ void *vaddr, bool readonly, MemoryRegion *mrp);
int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb, int flags);
diff --git a/system/memory.c b/system/memory.c
index 4c829793a0..de4f955a66 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -2185,7 +2185,8 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
/* Called with rcu_read_lock held. */
bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
ram_addr_t *ram_addr, bool *read_only,
- bool *mr_has_discard_manager, Error **errp)
+ bool *mr_has_discard_manager, MemoryRegion **mrp,
+ Error **errp)
{
MemoryRegion *mr;
hwaddr xlat;
@@ -2250,6 +2251,10 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
*read_only = !writable || mr->readonly;
}
+ if (mrp != NULL) {
+ *mrp = mr;
+ }
+
return true;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 53+ messages in thread
* Re: [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations
2025-04-09 13:48 ` [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations John Levon
@ 2025-04-24 16:32 ` Cédric Le Goater
2025-04-24 17:49 ` John Levon
0 siblings, 1 reply; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-24 16:32 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Jagannathan Raman,
Elena Ufimtseva
On 4/9/25 15:48, John Levon wrote:
> Pass through the MemoryRegion to DMA operation handlers of vfio
> containers. The vfio-user container will need this later.
>
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
> hw/vfio/container-base.c | 4 ++--
> hw/vfio/container.c | 3 ++-
> hw/vfio/iommufd.c | 3 ++-
> hw/vfio/listener.c | 18 +++++++++++-------
> hw/virtio/vhost-vdpa.c | 2 +-
> include/exec/memory.h | 4 +++-
> include/hw/vfio/vfio-container-base.h | 4 ++--
> system/memory.c | 7 ++++++-
> 8 files changed, 29 insertions(+), 16 deletions(-)
>
> diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
> index 16fe5f79d2..55c977ec33 100644
> --- a/hw/vfio/container-base.c
> +++ b/hw/vfio/container-base.c
> @@ -75,12 +75,12 @@ void vfio_address_space_insert(VFIOAddressSpace *space,
>
> int vfio_container_dma_map(VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> - void *vaddr, bool readonly)
> + void *vaddr, bool readonly, MemoryRegion *mrp)
> {
> VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
>
> g_assert(vioc->dma_map);
> - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
> + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mrp);
> }
>
> int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 61333d7fc4..587af60e57 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -215,7 +215,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
> }
>
> static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> - ram_addr_t size, void *vaddr, bool readonly)
> + ram_addr_t size, void *vaddr, bool readonly,
> + MemoryRegion *mrp)
> {
> const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
> bcontainer);
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 22e5b16967..4fd3c0d9f3 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -34,7 +34,8 @@
> TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
>
> static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> - ram_addr_t size, void *vaddr, bool readonly)
> + ram_addr_t size, void *vaddr, bool readonly,
> + MemoryRegion *mrp)
> {
> const VFIOIOMMUFDContainer *container =
> container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index 7ea9e0dfb7..cb06a63a0c 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -94,12 +94,12 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> /* Called with rcu_read_lock held. */
> static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> ram_addr_t *ram_addr, bool *read_only,
> - Error **errp)
> + MemoryRegion **mrp, Error **errp)
> {
> bool ret, mr_has_discard_manager;
>
> ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
> - &mr_has_discard_manager, errp);
> + &mr_has_discard_manager, mrp, errp);
> if (ret && mr_has_discard_manager) {
> /*
> * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
> @@ -127,6 +127,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> VFIOContainerBase *bcontainer = giommu->bcontainer;
> hwaddr iova = iotlb->iova + giommu->iommu_offset;
> + MemoryRegion *mrp;
> void *vaddr;
> int ret;
> Error *local_err = NULL;
> @@ -151,7 +152,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> bool read_only;
>
> - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
> + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mrp,
> + &local_err)) {
> error_report_err(local_err);
> goto out;
> }
> @@ -164,7 +166,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> */
> ret = vfio_container_dma_map(bcontainer, iova,
> iotlb->addr_mask + 1, vaddr,
> - read_only);
> + read_only, mrp);
> if (ret) {
> error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
> "0x%"HWADDR_PRIx", %p) = %d (%s)",
> @@ -234,7 +236,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> vaddr = memory_region_get_ram_ptr(section->mr) + start;
>
> ret = vfio_container_dma_map(bcontainer, iova, next - start,
> - vaddr, section->readonly);
> + vaddr, section->readonly, section->mr);
> if (ret) {
> /* Rollback */
> vfio_ram_discard_notify_discard(rdl, section);
> @@ -558,7 +560,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
> }
>
> ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
> - vaddr, section->readonly);
> + vaddr, section->readonly, section->mr);
> if (ret) {
> error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
> "0x%"HWADDR_PRIx", %p) = %d (%s)",
> @@ -1022,7 +1024,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> }
>
> rcu_read_lock();
> - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
> + if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL,
> + &local_err)) {
> + error_report_err(local_err);
> goto out_unlock;
> }
>
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 7efbde3d4c..eb02b081d4 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -228,7 +228,7 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> bool read_only;
>
> - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
> + if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, NULL,
> &local_err)) {
> error_report_err(local_err);
> return;
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index d09af58c97..f79ff332b5 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -747,13 +747,15 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
> * @read_only: indicates if writes are allowed
> * @mr_has_discard_manager: indicates memory is controlled by a
> * RamDiscardManager
> + * @mrp: if non-NULL, fill in with MemoryRegion
> * @errp: pointer to Error*, to store an error if it happens.
> *
> * Return: true on success, else false setting @errp with error.
> */
> bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> ram_addr_t *ram_addr, bool *read_only,
> - bool *mr_has_discard_manager, Error **errp);
> + bool *mr_has_discard_manager, MemoryRegion **mrp,
> + Error **errp);
>
> typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
> index 6eaf2b2430..47ce016d8e 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -78,7 +78,7 @@ void vfio_address_space_insert(VFIOAddressSpace *space,
>
> int vfio_container_dma_map(VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> - void *vaddr, bool readonly);
> + void *vaddr, bool readonly, MemoryRegion *mrp);
> int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> IOMMUTLBEntry *iotlb, int flags);
> @@ -124,7 +124,7 @@ struct VFIOIOMMUClass {
> void (*listener_commit)(VFIOContainerBase *bcontainer);
> int (*dma_map)(const VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> - void *vaddr, bool readonly);
> + void *vaddr, bool readonly, MemoryRegion *mrp);
> int (*dma_unmap)(const VFIOContainerBase *bcontainer,
> hwaddr iova, ram_addr_t size,
> IOMMUTLBEntry *iotlb, int flags);
> diff --git a/system/memory.c b/system/memory.c
> index 4c829793a0..de4f955a66 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -2185,7 +2185,8 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
> /* Called with rcu_read_lock held. */
> bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> ram_addr_t *ram_addr, bool *read_only,
> - bool *mr_has_discard_manager, Error **errp)
> + bool *mr_has_discard_manager, MemoryRegion **mrp,
> + Error **errp)
> {
> MemoryRegion *mr;
> hwaddr xlat;
> @@ -2250,6 +2251,10 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> *read_only = !writable || mr->readonly;
> }
>
> + if (mrp != NULL) {
> + *mrp = mr;
> + }
> +
> return true;
> }
>
Is everyone OK with adding an extra in/out parameter to memory_get_xlat_addr()?
Should we take a ref on the region?
I think this change should be proposed in its own patch as done in the previous
series and by Steve for live update.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations
2025-04-24 16:32 ` Cédric Le Goater
@ 2025-04-24 17:49 ` John Levon
0 siblings, 0 replies; 53+ messages in thread
From: John Levon @ 2025-04-24 17:49 UTC (permalink / raw)
To: Cédric Le Goater
Cc: qemu-devel, Tony Krowiak, Stefano Garzarella, Peter Xu,
Thomas Huth, Matthew Rosato, David Hildenbrand,
Michael S. Tsirkin, Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman, John Johnson, Jagannathan Raman,
Elena Ufimtseva
On Thu, Apr 24, 2025 at 06:32:13PM +0200, Cédric Le Goater wrote:
> > @@ -2250,6 +2251,10 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> > *read_only = !writable || mr->readonly;
> > }
> > + if (mrp != NULL) {
> > + *mrp = mr;
> > + }
> > +
> > return true;
> > }
>
> Is everyone OK with adding an extra in/out parameter to memory_get_xlat_addr() ?
> Should we take a ref on the region ?
>
> I think this change should be proposed in its own patch as done in the previous
> series and by Steve for live update.
Yep, I left this one last as I think it's best picked up by someone who
understands this a lot better than me!
regards
john
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 00/14] vfio: preparation for vfio-user
2025-04-09 13:48 [PATCH 00/14] vfio: preparation for vfio-user John Levon
` (13 preceding siblings ...)
2025-04-09 13:48 ` [PATCH 14/14] vfio/container: pass MemoryRegion to DMA operations John Levon
@ 2025-04-25 7:59 ` Cédric Le Goater
14 siblings, 0 replies; 53+ messages in thread
From: Cédric Le Goater @ 2025-04-25 7:59 UTC (permalink / raw)
To: John Levon, qemu-devel
Cc: Tony Krowiak, Stefano Garzarella, Peter Xu, Thomas Huth,
Matthew Rosato, David Hildenbrand, Michael S. Tsirkin,
Alex Williamson, qemu-s390x, Tomita Moeko,
Philippe Mathieu-Daudé, Halil Pasic, Jason Herne,
Paolo Bonzini, Eric Farman
On 4/9/25 15:48, John Levon wrote:
> Hi, this series is based on Cédric Le Goater's vfio cleanup series:
> https://github.com/legoater/qemu/commits/vfio-10.1
>
> The series contains patches to vfio to prepare for the vfio-user
> implementation. A previous version of these patches can be found at
> https://lore.kernel.org/all/7dd34008-e0f1-4eed-a77e-55b1f68fbe69@redhat.com/T/
> ("[PATCH v8 00/28] vfio-user client"); roughly corresponding to patches 1-10.
>
> Please see that series for justification and context.
>
> The following changes have been made since the previous series:
>
> - rebased on top of vfio-10.1 cleanup series
> - split unmap all patch into two, fixed some nits, dropped unmap all detection
> - fix two vfio_interrupt_setup() cleanup bugs
> - various renames as per review
> - new vfio_device_get_irq_info()
> - split out device ops patch into smaller chunks
> - region cache renames for clarity
> - commit author corrected for several patches
>
> thanks
> john
>
> John Levon (14):
> vfio: refactor out vfio_interrupt_setup()
> vfio: refactor out vfio_pci_config_setup()
> vfio: add vfio_prepare_device()
> vfio: add vfio_attach_device_by_iommu_type()
> vfio/container: pass listener_begin/commit callbacks
> vfio: add flags parameter to DMA unmap callback
> vfio: specify VFIO_DMA_UNMAP_FLAG_ALL to callback
> vfio: add vfio-pci-base class
> vfio: add vfio_device_get_irq_info() helper
> vfio: consistently handle return value for helpers
> vfio: add vfio_pci_config_space_read/write()
> vfio: add region info cache
> vfio: add device IO ops vector
> vfio/container: pass MemoryRegion to DMA operations
>
> hw/vfio/ap.c | 21 +-
> hw/vfio/ccw.c | 27 +-
> hw/vfio/container-base.c | 14 +-
> hw/vfio/container.c | 74 ++-
> hw/vfio/device.c | 178 ++++++-
> hw/vfio/igd.c | 8 +-
> hw/vfio/iommufd.c | 35 +-
> hw/vfio/listener.c | 82 ++--
> hw/vfio/pci.c | 672 +++++++++++++++-----------
> hw/vfio/pci.h | 12 +-
> hw/vfio/platform.c | 8 +-
> hw/vfio/region.c | 19 +-
> hw/virtio/vhost-vdpa.c | 2 +-
> include/exec/memory.h | 4 +-
> include/hw/vfio/vfio-container-base.h | 10 +-
> include/hw/vfio/vfio-device.h | 34 +-
> system/memory.c | 7 +-
> 17 files changed, 784 insertions(+), 423 deletions(-)
>
Applied patches 1 and 2 to vfio-next.
Thanks,
C.
^ permalink raw reply [flat|nested] 53+ messages in thread