* [PATCH V1 0/2] preserve pending interrupts during cpr
@ 2025-07-14 14:27 Steve Sistare
2025-07-14 14:27 ` [PATCH V1 1/2] vfio/pci: augment set_handler Steve Sistare
2025-07-14 14:27 ` [PATCH V1 2/2] vfio/pci: preserve pending interrupts Steve Sistare
0 siblings, 2 replies; 7+ messages in thread
From: Steve Sistare @ 2025-07-14 14:27 UTC (permalink / raw)
To: qemu-devel
Cc: Cedric Le Goater, Zhenzhong Duan, Alex Williamson, Steve Sistare
Close a race condition that causes cpr-transfer to lose VFIO
interrupts. See commit messages for details.
Steve Sistare (2):
vfio/pci: augment set_handler
vfio/pci: preserve pending interrupts
hw/vfio/cpr.c | 92 +++++++++++++++++++++++++++++++++++++-
hw/vfio/pci.c | 15 ++++++-
hw/vfio/pci.h | 4 +-
include/hw/vfio/vfio-cpr.h | 6 +++
4 files changed, 113 insertions(+), 4 deletions(-)
--
2.39.3
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH V1 1/2] vfio/pci: augment set_handler
2025-07-14 14:27 [PATCH V1 0/2] preserve pending interrupts during cpr Steve Sistare
@ 2025-07-14 14:27 ` Steve Sistare
2025-07-15 12:58 ` Cédric Le Goater
2025-07-14 14:27 ` [PATCH V1 2/2] vfio/pci: preserve pending interrupts Steve Sistare
1 sibling, 1 reply; 7+ messages in thread
From: Steve Sistare @ 2025-07-14 14:27 UTC (permalink / raw)
To: qemu-devel
Cc: Cedric Le Goater, Zhenzhong Duan, Alex Williamson, Steve Sistare
Extend vfio_pci_msi_set_handler() so it can set or clear the handler.
Add a similar accessor for INTx. No functional change.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
hw/vfio/cpr.c | 2 +-
hw/vfio/pci.c | 13 +++++++++++--
hw/vfio/pci.h | 3 ++-
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index af0f12a7ad..2a244fc4b6 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -70,7 +70,7 @@ static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
if (fd >= 0) {
vfio_pci_vector_init(vdev, i);
- vfio_pci_msi_set_handler(vdev, i);
+ vfio_pci_msi_set_handler(vdev, i, true);
}
if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1093b28df7..8b471c054a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -415,6 +415,14 @@ bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
return vfio_intx_enable(vdev, errp);
}
+void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable)
+{
+ int fd = event_notifier_get_fd(&vdev->intx.interrupt);
+ IOHandler *handler = (enable ? vfio_intx_interrupt : NULL);
+
+ qemu_set_fd_handler(fd, handler, NULL, vdev);
+}
+
/*
* MSI/X
*/
@@ -453,12 +461,13 @@ static void vfio_msi_interrupt(void *opaque)
notify(&vdev->pdev, nr);
}
-void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
+void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable)
{
VFIOMSIVector *vector = &vdev->msi_vectors[nr];
int fd = event_notifier_get_fd(&vector->interrupt);
+ IOHandler *handler = (enable ? vfio_msi_interrupt : NULL);
- qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+ qemu_set_fd_handler(fd, handler, NULL, vector);
}
/*
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 495fae737d..80c8fcfa07 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -218,8 +218,9 @@ void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp);
+void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable);
void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev);
-void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr);
+void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable);
uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
void vfio_pci_write_config(PCIDevice *pdev,
--
2.39.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH V1 2/2] vfio/pci: preserve pending interrupts
2025-07-14 14:27 [PATCH V1 0/2] preserve pending interrupts during cpr Steve Sistare
2025-07-14 14:27 ` [PATCH V1 1/2] vfio/pci: augment set_handler Steve Sistare
@ 2025-07-14 14:27 ` Steve Sistare
2025-07-16 12:40 ` Cédric Le Goater
2025-07-16 12:53 ` Cédric Le Goater
1 sibling, 2 replies; 7+ messages in thread
From: Steve Sistare @ 2025-07-14 14:27 UTC (permalink / raw)
To: qemu-devel
Cc: Cedric Le Goater, Zhenzhong Duan, Alex Williamson, Steve Sistare
cpr-transfer may lose a VFIO interrupt because the KVM instance is
destroyed and recreated. If an interrupt arrives in the middle, it is
dropped. To fix, disable pending interrupts during cpr save, and pick
up the pieces. In more detail:
Stop the VCPUs. Call kvm_irqchip_remove_irqfd_notifier_gsi --> KVM_IRQFD to
deassign the irqfd gsi that routes interrupts directly to the VCPU and KVM.
After this call, interrupts fall back to the kernel vfio_msihandler, which
writes to QEMU's kvm_interrupt eventfd. CPR already preserves that
eventfd. When the route is re-established in new QEMU, the kernel tests
the eventfd and pends an interrupt to KVM if necessary.
Deassign INTx in a similar manner. For both MSI and INTx, remove the
eventfd handler so old QEMU does not consume an event.
If an interrupt was already pended to KVM prior to the completion of
kvm_irqchip_remove_irqfd_notifier_gsi, it will be recovered by the
subsequent call to cpu_synchronize_all_states, which pulls KVM interrupt
state to userland prior to saving it in vmstate.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
hw/vfio/cpr.c | 90 ++++++++++++++++++++++++++++++++++++++
hw/vfio/pci.c | 2 +
hw/vfio/pci.h | 1 +
include/hw/vfio/vfio-cpr.h | 6 +++
4 files changed, 99 insertions(+)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 2a244fc4b6..ae2a6b7acd 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -198,3 +198,93 @@ void vfio_cpr_add_kvm_notifier(void)
MIG_MODE_CPR_TRANSFER);
}
}
+
+static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+ EventNotifier *rn, int virq, bool enable)
+{
+ if (enable) {
+ return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
+ } else {
+ return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
+ }
+}
+
+static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
+{
+ const char *op = (enable ? "enable" : "disable");
+ PCIDevice *pdev = &vdev->pdev;
+ int i, nr_vectors, ret = 0;
+
+ if (msix_enabled(pdev)) {
+ nr_vectors = vdev->msix->entries;
+
+ } else if (msi_enabled(pdev)) {
+ nr_vectors = msi_nr_vectors_allocated(pdev);
+
+ } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+ ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
+ &vdev->intx.unmask, vdev->intx.route.irq,
+ enable);
+ if (ret) {
+ error_setg(errp, "failed to %s INTx irq %d: error %d",
+ op, vdev->intx.route.irq, ret);
+ } else {
+ vfio_pci_intx_set_handler(vdev, enable);
+ }
+ return ret;
+
+ } else {
+ nr_vectors = 0;
+ }
+
+ for (i = 0; i < nr_vectors; i++) {
+ VFIOMSIVector *vector = &vdev->msi_vectors[i];
+ if (vector->use) {
+ ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
+ NULL, vector->virq, enable);
+ if (ret) {
+ error_setg(errp, "failed to %s msi vector %d virq %d: error %d",
+ op, i, vector->virq, ret);
+ } else {
+ vfio_pci_msi_set_handler(vdev, i, enable);
+ }
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * When CPR starts, detach IRQs from the VFIO device so future interrupts
+ * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts
+ * that were already posted to the old KVM instance, but not delivered to the
+ * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
+ * in new QEMU.
+ *
+ * If CPR fails, reattach the IRQs.
+ */
+static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
+ MigrationEvent *e, Error **errp)
+{
+ VFIOPCIDevice *vdev =
+ container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
+
+ if (e->type == MIG_EVENT_PRECOPY_SETUP) {
+ return vfio_cpr_set_msi_virq(vdev, errp, false);
+ } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
+ return vfio_cpr_set_msi_virq(vdev, errp, true);
+ }
+ return 0;
+}
+
+void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
+{
+ migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
+ vfio_cpr_pci_notifier,
+ MIG_MODE_CPR_TRANSFER);
+}
+
+void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
+{
+ migration_remove_notifier(&vdev->cpr.transfer_notifier);
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8b471c054a..22a4125131 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2993,6 +2993,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
vfio_display_finalize(vdev);
vfio_bars_finalize(vdev);
+ vfio_cpr_pci_unregister_device(vdev);
g_free(vdev->emulated_config_bits);
g_free(vdev->rom);
/*
@@ -3442,6 +3443,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
vfio_pci_register_err_notifier(vdev);
vfio_pci_register_req_notifier(vdev);
vfio_setup_resetfn_quirk(vdev);
+ vfio_cpr_pci_register_device(vdev);
return;
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 80c8fcfa07..7989b94eb3 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -194,6 +194,7 @@ struct VFIOPCIDevice {
bool skip_vsc_check;
VFIODisplay *dpy;
Notifier irqchip_change_notifier;
+ VFIOPCICPR cpr;
};
/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
index 80ad20d216..d37daffbc5 100644
--- a/include/hw/vfio/vfio-cpr.h
+++ b/include/hw/vfio/vfio-cpr.h
@@ -38,6 +38,10 @@ typedef struct VFIODeviceCPR {
uint32_t ioas_id;
} VFIODeviceCPR;
+typedef struct VFIOPCICPR {
+ NotifierWithReturn transfer_notifier;
+} VFIOPCICPR;
+
bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
Error **errp);
void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container);
@@ -77,5 +81,7 @@ extern const VMStateDescription vfio_cpr_pci_vmstate;
extern const VMStateDescription vmstate_cpr_vfio_devices;
void vfio_cpr_add_kvm_notifier(void);
+void vfio_cpr_pci_register_device(struct VFIOPCIDevice *vdev);
+void vfio_cpr_pci_unregister_device(struct VFIOPCIDevice *vdev);
#endif /* HW_VFIO_VFIO_CPR_H */
--
2.39.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH V1 1/2] vfio/pci: augment set_handler
2025-07-14 14:27 ` [PATCH V1 1/2] vfio/pci: augment set_handler Steve Sistare
@ 2025-07-15 12:58 ` Cédric Le Goater
0 siblings, 0 replies; 7+ messages in thread
From: Cédric Le Goater @ 2025-07-15 12:58 UTC (permalink / raw)
To: Steve Sistare, qemu-devel; +Cc: Zhenzhong Duan, Alex Williamson
On 7/14/25 16:27, Steve Sistare wrote:
> Extend vfio_pci_msi_set_handler() so it can set or clear the handler.
> Add a similar accessor for INTx. No functional change.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Thanks,
C.
> ---
> hw/vfio/cpr.c | 2 +-
> hw/vfio/pci.c | 13 +++++++++++--
> hw/vfio/pci.h | 3 ++-
> 3 files changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
> index af0f12a7ad..2a244fc4b6 100644
> --- a/hw/vfio/cpr.c
> +++ b/hw/vfio/cpr.c
> @@ -70,7 +70,7 @@ static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
> fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
> if (fd >= 0) {
> vfio_pci_vector_init(vdev, i);
> - vfio_pci_msi_set_handler(vdev, i);
> + vfio_pci_msi_set_handler(vdev, i, true);
> }
>
> if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 1093b28df7..8b471c054a 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -415,6 +415,14 @@ bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp)
> return vfio_intx_enable(vdev, errp);
> }
>
> +void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable)
> +{
> + int fd = event_notifier_get_fd(&vdev->intx.interrupt);
> + IOHandler *handler = (enable ? vfio_intx_interrupt : NULL);
> +
> + qemu_set_fd_handler(fd, handler, NULL, vdev);
> +}
> +
> /*
> * MSI/X
> */
> @@ -453,12 +461,13 @@ static void vfio_msi_interrupt(void *opaque)
> notify(&vdev->pdev, nr);
> }
>
> -void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
> +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable)
> {
> VFIOMSIVector *vector = &vdev->msi_vectors[nr];
> int fd = event_notifier_get_fd(&vector->interrupt);
> + IOHandler *handler = (enable ? vfio_msi_interrupt : NULL);
>
> - qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
> + qemu_set_fd_handler(fd, handler, NULL, vector);
> }
>
> /*
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 495fae737d..80c8fcfa07 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -218,8 +218,9 @@ void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
> void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
> void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
> bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp);
> +void vfio_pci_intx_set_handler(VFIOPCIDevice *vdev, bool enable);
> void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev);
> -void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr);
> +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr, bool enable);
>
> uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> void vfio_pci_write_config(PCIDevice *pdev,
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH V1 2/2] vfio/pci: preserve pending interrupts
2025-07-14 14:27 ` [PATCH V1 2/2] vfio/pci: preserve pending interrupts Steve Sistare
@ 2025-07-16 12:40 ` Cédric Le Goater
2025-07-16 17:52 ` Steven Sistare
2025-07-16 12:53 ` Cédric Le Goater
1 sibling, 1 reply; 7+ messages in thread
From: Cédric Le Goater @ 2025-07-16 12:40 UTC (permalink / raw)
To: Steve Sistare, qemu-devel; +Cc: Zhenzhong Duan, Alex Williamson
On 7/14/25 16:27, Steve Sistare wrote:
> cpr-transfer may lose a VFIO interrupt because the KVM instance is
> destroyed and recreated. If an interrupt arrives in the middle, it is
> dropped. To fix, disable pended interrupts during cpr save, and pick
'pending' interrupts is more common.
> up the pieces. In more detail:
>
> Stop the VCPUs. Call kvm_irqchip_remove_irqfd_notifier_gsi --> KVM_IRQFD to
> deassign the irqfd gsi that routes interrupts directly to the VCPU and KVM.
> After this call, interrupts fall back to the kernel vfio_msihandler, which
> writes to QEMU's kvm_interrupt eventfd. CPR already preserves that
> eventfd. When the route is re-established in new QEMU, the kernel tests
> the eventfd and pends an interrupt to KVM if necessary.
'triggers an interrupt' maybe ?
> Deassign INTx in a similar manner. For both MSI and INTx, remove the
> eventfd handler so old QEMU does not consume an event.
>
> If an interrupt was already pended to KVM prior to the completion of
> kvm_irqchip_remove_irqfd_notifier_gsi, it will be recovered by the
> subsequent call to cpu_synchronize_all_states, which pulls KVM interrupt
> state to userland prior to saving it in vmstate.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> hw/vfio/cpr.c | 90 ++++++++++++++++++++++++++++++++++++++
> hw/vfio/pci.c | 2 +
> hw/vfio/pci.h | 1 +
> include/hw/vfio/vfio-cpr.h | 6 +++
> 4 files changed, 99 insertions(+)
>
> diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
> index 2a244fc4b6..ae2a6b7acd 100644
> --- a/hw/vfio/cpr.c
> +++ b/hw/vfio/cpr.c
> @@ -198,3 +198,93 @@ void vfio_cpr_add_kvm_notifier(void)
> MIG_MODE_CPR_TRANSFER);
> }
> }
> +
> +static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
> + EventNotifier *rn, int virq, bool enable)
> +{
> + if (enable) {
> + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
> + } else {
> + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
> + }
> +}
> +
> +static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
> +{
> + const char *op = (enable ? "enable" : "disable");
> + PCIDevice *pdev = &vdev->pdev;
> + int i, nr_vectors, ret = 0;
> +
> + if (msix_enabled(pdev)) {
> + nr_vectors = vdev->msix->entries;
> +
> + } else if (msi_enabled(pdev)) {
> + nr_vectors = msi_nr_vectors_allocated(pdev);
> +
> + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
> + ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
> + &vdev->intx.unmask, vdev->intx.route.irq,
> + enable);
I think 'ret' is an errno, we could use error_setg_errno()
> + if (ret) {
> + error_setg(errp, "failed to %s INTx irq %d: error %d",
> + op, vdev->intx.route.irq, ret);
I'd prefer an early return on error here, dropping the else branch:
return ret;
}
vfio_pci_intx_set_handler(vdev, enable);
return ret;
> + } else {
> + vfio_pci_intx_set_handler(vdev, enable);
> + }
> + return ret;
> +
> + } else {
> + nr_vectors = 0;
'return 0' is as good.
> + }
> +
> + for (i = 0; i < nr_vectors; i++) {
> + VFIOMSIVector *vector = &vdev->msi_vectors[i];
> + if (vector->use) {
> + ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
> + NULL, vector->virq, enable);
> + if (ret) {
> + error_setg(errp, "failed to %s msi vector %d virq %d: error %d",
> + op, i, vector->virq, ret);
If errp is set multiple times, qemu will abort. This routine should
return at the first error.
Thanks,
C.
> + } else {
> + vfio_pci_msi_set_handler(vdev, i, enable);
> + }
> + }
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * When CPR starts, detach IRQs from the VFIO device so future interrupts
> + * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts
> + * that were already posted to the old KVM instance, but not delivered to the
> + * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
> + * in new QEMU.
> + *
> + * If CPR fails, reattach the IRQs.
> + */
> +static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
> + MigrationEvent *e, Error **errp)
> +{
> + VFIOPCIDevice *vdev =
> + container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
> +
> + if (e->type == MIG_EVENT_PRECOPY_SETUP) {
> + return vfio_cpr_set_msi_virq(vdev, errp, false);
> + } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
> + return vfio_cpr_set_msi_virq(vdev, errp, true);
> + }
> + return 0;
> +}
> +
> +void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
> +{
> + migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
> + vfio_cpr_pci_notifier,
> + MIG_MODE_CPR_TRANSFER);
> +}
> +
> +void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
> +{
> + migration_remove_notifier(&vdev->cpr.transfer_notifier);
> +}
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 8b471c054a..22a4125131 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2993,6 +2993,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
> {
> vfio_display_finalize(vdev);
> vfio_bars_finalize(vdev);
> + vfio_cpr_pci_unregister_device(vdev);
> g_free(vdev->emulated_config_bits);
> g_free(vdev->rom);
> /*
> @@ -3442,6 +3443,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
> vfio_pci_register_err_notifier(vdev);
> vfio_pci_register_req_notifier(vdev);
> vfio_setup_resetfn_quirk(vdev);
> + vfio_cpr_pci_register_device(vdev);
>
> return;
>
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 80c8fcfa07..7989b94eb3 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -194,6 +194,7 @@ struct VFIOPCIDevice {
> bool skip_vsc_check;
> VFIODisplay *dpy;
> Notifier irqchip_change_notifier;
> + VFIOPCICPR cpr;
> };
>
> /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
> diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
> index 80ad20d216..d37daffbc5 100644
> --- a/include/hw/vfio/vfio-cpr.h
> +++ b/include/hw/vfio/vfio-cpr.h
> @@ -38,6 +38,10 @@ typedef struct VFIODeviceCPR {
> uint32_t ioas_id;
> } VFIODeviceCPR;
>
> +typedef struct VFIOPCICPR {
> + NotifierWithReturn transfer_notifier;
> +} VFIOPCICPR;
> +
> bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
> Error **errp);
> void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container);
> @@ -77,5 +81,7 @@ extern const VMStateDescription vfio_cpr_pci_vmstate;
> extern const VMStateDescription vmstate_cpr_vfio_devices;
>
> void vfio_cpr_add_kvm_notifier(void);
> +void vfio_cpr_pci_register_device(struct VFIOPCIDevice *vdev);
> +void vfio_cpr_pci_unregister_device(struct VFIOPCIDevice *vdev);
>
> #endif /* HW_VFIO_VFIO_CPR_H */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH V1 2/2] vfio/pci: preserve pending interrupts
2025-07-14 14:27 ` [PATCH V1 2/2] vfio/pci: preserve pending interrupts Steve Sistare
2025-07-16 12:40 ` Cédric Le Goater
@ 2025-07-16 12:53 ` Cédric Le Goater
1 sibling, 0 replies; 7+ messages in thread
From: Cédric Le Goater @ 2025-07-16 12:53 UTC (permalink / raw)
To: Steve Sistare, qemu-devel
Cc: Zhenzhong Duan, Alex Williamson, 'Peter Xu'
+Peter, for info.
On 7/14/25 16:27, Steve Sistare wrote:
> cpr-transfer may lose a VFIO interrupt because the KVM instance is
> destroyed and recreated. If an interrupt arrives in the middle, it is
> dropped. To fix, disable pended interrupts during cpr save, and pick
> up the pieces. In more detail:
>
> Stop the VCPUs. Call kvm_irqchip_remove_irqfd_notifier_gsi --> KVM_IRQFD to
> deassign the irqfd gsi that routes interrupts directly to the VCPU and KVM.
> After this call, interrupts fall back to the kernel vfio_msihandler, which
> writes to QEMU's kvm_interrupt eventfd. CPR already preserves that
> eventfd. When the route is re-established in new QEMU, the kernel tests
> the eventfd and pends an interrupt to KVM if necessary.
>
> Deassign INTx in a similar manner. For both MSI and INTx, remove the
> eventfd handler so old QEMU does not consume an event.
>
> If an interrupt was already pended to KVM prior to the completion of
> kvm_irqchip_remove_irqfd_notifier_gsi, it will be recovered by the
> subsequent call to cpu_synchronize_all_states, which pulls KVM interrupt
> state to userland prior to saving it in vmstate.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> hw/vfio/cpr.c | 90 ++++++++++++++++++++++++++++++++++++++
> hw/vfio/pci.c | 2 +
> hw/vfio/pci.h | 1 +
> include/hw/vfio/vfio-cpr.h | 6 +++
> 4 files changed, 99 insertions(+)
>
> diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
> index 2a244fc4b6..ae2a6b7acd 100644
> --- a/hw/vfio/cpr.c
> +++ b/hw/vfio/cpr.c
> @@ -198,3 +198,93 @@ void vfio_cpr_add_kvm_notifier(void)
> MIG_MODE_CPR_TRANSFER);
> }
> }
> +
> +static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
> + EventNotifier *rn, int virq, bool enable)
> +{
> + if (enable) {
> + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
> + } else {
> + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
> + }
> +}
> +
> +static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
> +{
> + const char *op = (enable ? "enable" : "disable");
> + PCIDevice *pdev = &vdev->pdev;
> + int i, nr_vectors, ret = 0;
> +
> + if (msix_enabled(pdev)) {
> + nr_vectors = vdev->msix->entries;
> +
> + } else if (msi_enabled(pdev)) {
> + nr_vectors = msi_nr_vectors_allocated(pdev);
> +
> + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
> + ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
> + &vdev->intx.unmask, vdev->intx.route.irq,
> + enable);
> + if (ret) {
> + error_setg(errp, "failed to %s INTx irq %d: error %d",
> + op, vdev->intx.route.irq, ret);
> + } else {
> + vfio_pci_intx_set_handler(vdev, enable);
> + }
> + return ret;
> +
> + } else {
> + nr_vectors = 0;
> + }
> +
> + for (i = 0; i < nr_vectors; i++) {
> + VFIOMSIVector *vector = &vdev->msi_vectors[i];
> + if (vector->use) {
> + ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
> + NULL, vector->virq, enable);
> + if (ret) {
> + error_setg(errp, "failed to %s msi vector %d virq %d: error %d",
> + op, i, vector->virq, ret);
> + } else {
> + vfio_pci_msi_set_handler(vdev, i, enable);
> + }
> + }
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * When CPR starts, detach IRQs from the VFIO device so future interrupts
> + * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts
> + * that were already posted to the old KVM instance, but not delivered to the
> + * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
> + * in new QEMU.
> + *
> + * If CPR fails, reattach the IRQs.
> + */
> +static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
> + MigrationEvent *e, Error **errp)
> +{
> + VFIOPCIDevice *vdev =
> + container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
> +
> + if (e->type == MIG_EVENT_PRECOPY_SETUP) {
> + return vfio_cpr_set_msi_virq(vdev, errp, false);
> + } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
> + return vfio_cpr_set_msi_virq(vdev, errp, true);
> + }
> + return 0;
> +}
> +
> +void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
> +{
> + migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
> + vfio_cpr_pci_notifier,
> + MIG_MODE_CPR_TRANSFER);
> +}
> +
> +void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
> +{
> + migration_remove_notifier(&vdev->cpr.transfer_notifier);
> +}
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 8b471c054a..22a4125131 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2993,6 +2993,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
> {
> vfio_display_finalize(vdev);
> vfio_bars_finalize(vdev);
> + vfio_cpr_pci_unregister_device(vdev);
> g_free(vdev->emulated_config_bits);
> g_free(vdev->rom);
> /*
> @@ -3442,6 +3443,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
> vfio_pci_register_err_notifier(vdev);
> vfio_pci_register_req_notifier(vdev);
> vfio_setup_resetfn_quirk(vdev);
> + vfio_cpr_pci_register_device(vdev);
>
> return;
>
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 80c8fcfa07..7989b94eb3 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -194,6 +194,7 @@ struct VFIOPCIDevice {
> bool skip_vsc_check;
> VFIODisplay *dpy;
> Notifier irqchip_change_notifier;
> + VFIOPCICPR cpr;
> };
>
> /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
> diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
> index 80ad20d216..d37daffbc5 100644
> --- a/include/hw/vfio/vfio-cpr.h
> +++ b/include/hw/vfio/vfio-cpr.h
> @@ -38,6 +38,10 @@ typedef struct VFIODeviceCPR {
> uint32_t ioas_id;
> } VFIODeviceCPR;
>
> +typedef struct VFIOPCICPR {
> + NotifierWithReturn transfer_notifier;
> +} VFIOPCICPR;
> +
> bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
> Error **errp);
> void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container);
> @@ -77,5 +81,7 @@ extern const VMStateDescription vfio_cpr_pci_vmstate;
> extern const VMStateDescription vmstate_cpr_vfio_devices;
>
> void vfio_cpr_add_kvm_notifier(void);
> +void vfio_cpr_pci_register_device(struct VFIOPCIDevice *vdev);
> +void vfio_cpr_pci_unregister_device(struct VFIOPCIDevice *vdev);
>
> #endif /* HW_VFIO_VFIO_CPR_H */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH V1 2/2] vfio/pci: preserve pending interrupts
2025-07-16 12:40 ` Cédric Le Goater
@ 2025-07-16 17:52 ` Steven Sistare
0 siblings, 0 replies; 7+ messages in thread
From: Steven Sistare @ 2025-07-16 17:52 UTC (permalink / raw)
To: Cédric Le Goater, qemu-devel; +Cc: Zhenzhong Duan, Alex Williamson
OK on all. Patch V2 coming shortly - steve
On 7/16/2025 8:40 AM, Cédric Le Goater wrote:
> On 7/14/25 16:27, Steve Sistare wrote:
>> cpr-transfer may lose a VFIO interrupt because the KVM instance is
>> destroyed and recreated. If an interrupt arrives in the middle, it is
>> dropped. To fix, disable pended interrupts during cpr save, and pick
>
> 'pending' interrupts is more common.
>
>> up the pieces. In more detail:
>>
>> Stop the VCPUs. Call kvm_irqchip_remove_irqfd_notifier_gsi --> KVM_IRQFD to
>> deassign the irqfd gsi that routes interrupts directly to the VCPU and KVM.
>> After this call, interrupts fall back to the kernel vfio_msihandler, which
>> writes to QEMU's kvm_interrupt eventfd. CPR already preserves that
>> eventfd. When the route is re-established in new QEMU, the kernel tests
>> the eventfd and pends an interrupt to KVM if necessary.
>
> 'triggers an interrupt' maybe ?
>
>> Deassign INTx in a similar manner. For both MSI and INTx, remove the
>> eventfd handler so old QEMU does not consume an event.
>>
>> If an interrupt was already pended to KVM prior to the completion of
>> kvm_irqchip_remove_irqfd_notifier_gsi, it will be recovered by the
>> subsequent call to cpu_synchronize_all_states, which pulls KVM interrupt
>> state to userland prior to saving it in vmstate.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> hw/vfio/cpr.c | 90 ++++++++++++++++++++++++++++++++++++++
>> hw/vfio/pci.c | 2 +
>> hw/vfio/pci.h | 1 +
>> include/hw/vfio/vfio-cpr.h | 6 +++
>> 4 files changed, 99 insertions(+)
>>
>> diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
>> index 2a244fc4b6..ae2a6b7acd 100644
>> --- a/hw/vfio/cpr.c
>> +++ b/hw/vfio/cpr.c
>> @@ -198,3 +198,93 @@ void vfio_cpr_add_kvm_notifier(void)
>> MIG_MODE_CPR_TRANSFER);
>> }
>> }
>> +
>> +static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
>> + EventNotifier *rn, int virq, bool enable)
>> +{
>> + if (enable) {
>> + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
>> + } else {
>> + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
>> + }
>> +}
>> +
>> +static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool enable)
>> +{
>> + const char *op = (enable ? "enable" : "disable");
>> + PCIDevice *pdev = &vdev->pdev;
>> + int i, nr_vectors, ret = 0;
>> +
>> + if (msix_enabled(pdev)) {
>> + nr_vectors = vdev->msix->entries;
>> +
>> + } else if (msi_enabled(pdev)) {
>> + nr_vectors = msi_nr_vectors_allocated(pdev);
>> +
>> + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
>> + ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
>> + &vdev->intx.unmask, vdev->intx.route.irq,
>> + enable);
>
> I think 'ret' is an errno, we could use error_setg_errno()
>
>> + if (ret) {
>> + error_setg(errp, "failed to %s INTx irq %d: error %d",
>> + op, vdev->intx.route.irq, ret);
>
> I'd prefer to :
> return ret;
> }
>
> vfio_pci_intx_set_handler(vdev, enable);
> return ret;
>
>> + } else {
>> + vfio_pci_intx_set_handler(vdev, enable);
>> + }
>> + return ret;
>> +
>> + } else {
>> + nr_vectors = 0;
>
> 'return 0' is as good.
>
>> + }
>> +
>> + for (i = 0; i < nr_vectors; i++) {
>> + VFIOMSIVector *vector = &vdev->msi_vectors[i];
>> + if (vector->use) {
>> + ret = set_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
>> + NULL, vector->virq, enable);
>> + if (ret) {
>> + error_setg(errp, "failed to %s msi vector %d virq %d: error %d",
>> + op, i, vector->virq, ret);
>
> If errp is set multiple times, qemu will abort. This routine should
> return at the first error.
>
>
> Thanks,
>
> C.
>
>
>
>> + } else {
>> + vfio_pci_msi_set_handler(vdev, i, enable);
>> + }
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * When CPR starts, detach IRQs from the VFIO device so future interrupts
>> + * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts
>> + * that were already posted to the old KVM instance, but not delivered to the
>> + * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
>> + * in new QEMU.
>> + *
>> + * If CPR fails, reattach the IRQs.
>> + */
>> +static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
>> + MigrationEvent *e, Error **errp)
>> +{
>> + VFIOPCIDevice *vdev =
>> + container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
>> +
>> + if (e->type == MIG_EVENT_PRECOPY_SETUP) {
>> + return vfio_cpr_set_msi_virq(vdev, errp, false);
>> + } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
>> + return vfio_cpr_set_msi_virq(vdev, errp, true);
>> + }
>> + return 0;
>> +}
>> +
>> +void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
>> +{
>> + migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
>> + vfio_cpr_pci_notifier,
>> + MIG_MODE_CPR_TRANSFER);
>> +}
>> +
>> +void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
>> +{
>> + migration_remove_notifier(&vdev->cpr.transfer_notifier);
>> +}
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 8b471c054a..22a4125131 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2993,6 +2993,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>> {
>> vfio_display_finalize(vdev);
>> vfio_bars_finalize(vdev);
>> + vfio_cpr_pci_unregister_device(vdev);
>> g_free(vdev->emulated_config_bits);
>> g_free(vdev->rom);
>> /*
>> @@ -3442,6 +3443,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
>> vfio_pci_register_err_notifier(vdev);
>> vfio_pci_register_req_notifier(vdev);
>> vfio_setup_resetfn_quirk(vdev);
>> + vfio_cpr_pci_register_device(vdev);
>> return;
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 80c8fcfa07..7989b94eb3 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -194,6 +194,7 @@ struct VFIOPCIDevice {
>> bool skip_vsc_check;
>> VFIODisplay *dpy;
>> Notifier irqchip_change_notifier;
>> + VFIOPCICPR cpr;
>> };
>> /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
>> diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
>> index 80ad20d216..d37daffbc5 100644
>> --- a/include/hw/vfio/vfio-cpr.h
>> +++ b/include/hw/vfio/vfio-cpr.h
>> @@ -38,6 +38,10 @@ typedef struct VFIODeviceCPR {
>> uint32_t ioas_id;
>> } VFIODeviceCPR;
>> +typedef struct VFIOPCICPR {
>> + NotifierWithReturn transfer_notifier;
>> +} VFIOPCICPR;
>> +
>> bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
>> Error **errp);
>> void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container);
>> @@ -77,5 +81,7 @@ extern const VMStateDescription vfio_cpr_pci_vmstate;
>> extern const VMStateDescription vmstate_cpr_vfio_devices;
>> void vfio_cpr_add_kvm_notifier(void);
>> +void vfio_cpr_pci_register_device(struct VFIOPCIDevice *vdev);
>> +void vfio_cpr_pci_unregister_device(struct VFIOPCIDevice *vdev);
>> #endif /* HW_VFIO_VFIO_CPR_H */
>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-07-16 17:55 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-14 14:27 [PATCH V1 0/2] preserve pending interrupts during cpr Steve Sistare
2025-07-14 14:27 ` [PATCH V1 1/2] vfio/pci: augment set_handler Steve Sistare
2025-07-15 12:58 ` Cédric Le Goater
2025-07-14 14:27 ` [PATCH V1 2/2] vfio/pci: preserve pending interrupts Steve Sistare
2025-07-16 12:40 ` Cédric Le Goater
2025-07-16 17:52 ` Steven Sistare
2025-07-16 12:53 ` Cédric Le Goater
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).