From: Steven Sistare <steven.sistare@oracle.com>
To: Peter Xu <peterx@redhat.com>
Cc: qemu-devel@nongnu.org, Fabiano Rosas <farosas@suse.de>,
Paolo Bonzini <pbonzini@redhat.com>,
Alex Williamson <alex.williamson@redhat.com>,
Cedric Le Goater <clg@redhat.com>, Yi Liu <yi.l.liu@intel.com>,
Eric Auger <eric.auger@redhat.com>,
Zhenzhong Duan <zhenzhong.duan@intel.com>,
"Michael S. Tsirkin" <mst@redhat.com>,
Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Subject: Re: [PATCH V5 20/38] migration: close kvm after cpr
Date: Wed, 2 Jul 2025 15:41:08 -0400 [thread overview]
Message-ID: <a0487a01-41de-4997-860c-bc555a295643@oracle.com> (raw)
In-Reply-To: <aGVYD1GkOC-LuI1T@x1.local>
On 7/2/2025 12:02 PM, Peter Xu wrote:
> On Tue, Jul 01, 2025 at 11:25:23AM -0400, Steven Sistare wrote:
>> Hi Paolo, Peter, Fabiano,
>>
>> This patch needs review. CPR for vfio is broken without it.
>> Soft feature freeze July 15.
>
> Sorry to not have tried looking at this more even if this is marked
> "migration".. obviously I still almost see it as a KVM change..
>
> Questions inline below:
>
>>
>> - Steve
>>
>> On 6/10/2025 11:39 AM, Steve Sistare wrote:
>>> cpr-transfer breaks vfio network connectivity to and from the guest, and
>>> the host system log shows:
>>> irq bypass consumer (token 00000000a03c32e5) registration fails: -16
>>> which is EBUSY. This occurs because KVM descriptors are still open in
>>> the old QEMU process. Close them.
>>>
>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> ---
>>> include/hw/vfio/vfio-device.h | 2 ++
>>> include/migration/cpr.h | 2 ++
>>> include/system/kvm.h | 1 +
>>> accel/kvm/kvm-all.c | 32 ++++++++++++++++++++++++++++++++
>>> accel/stubs/kvm-stub.c | 5 +++++
>>> hw/vfio/helpers.c | 10 ++++++++++
>>> hw/vfio/vfio-stubs.c | 13 +++++++++++++
>>> migration/cpr-transfer.c | 18 ++++++++++++++++++
>>> migration/cpr.c | 8 ++++++++
>>> migration/migration.c | 1 +
>>> hw/vfio/meson.build | 2 ++
>>> 11 files changed, 94 insertions(+)
>>> create mode 100644 hw/vfio/vfio-stubs.c
>>>
>>> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
>>> index 4e4d0b6..6eb6f21 100644
>>> --- a/include/hw/vfio/vfio-device.h
>>> +++ b/include/hw/vfio/vfio-device.h
>>> @@ -231,4 +231,6 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
>>> void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
>>> DeviceState *dev, bool ram_discard);
>>> int vfio_device_get_aw_bits(VFIODevice *vdev);
>>> +
>>> +void vfio_kvm_device_close(void);
>>> #endif /* HW_VFIO_VFIO_COMMON_H */
>>> diff --git a/include/migration/cpr.h b/include/migration/cpr.h
>>> index 07858e9..d09b657 100644
>>> --- a/include/migration/cpr.h
>>> +++ b/include/migration/cpr.h
>>> @@ -32,7 +32,9 @@ void cpr_state_close(void);
>>> struct QIOChannel *cpr_state_ioc(void);
>>> bool cpr_incoming_needed(void *opaque);
>>> +void cpr_kvm_close(void);
>>> +void cpr_transfer_init(void);
>>> QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
>>> QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
>>> diff --git a/include/system/kvm.h b/include/system/kvm.h
>>> index 7cc60d2..4896a3c 100644
>>> --- a/include/system/kvm.h
>>> +++ b/include/system/kvm.h
>>> @@ -195,6 +195,7 @@ bool kvm_has_sync_mmu(void);
>>> int kvm_has_vcpu_events(void);
>>> int kvm_max_nested_state_length(void);
>>> int kvm_has_gsi_routing(void);
>>> +void kvm_close(void);
>>> /**
>>> * kvm_arm_supports_user_irq
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index a317783..3d3a557 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -515,16 +515,23 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
>>> goto err;
>>> }
>>> + /* If I am the CPU that created coalesced_mmio_ring, then discard it */
>>> + if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) {
>>> + s->coalesced_mmio_ring = NULL;
>>> + }
>>> +
>>> ret = munmap(cpu->kvm_run, mmap_size);
>>> if (ret < 0) {
>>> goto err;
>>> }
>>> + cpu->kvm_run = NULL;
>>> if (cpu->kvm_dirty_gfns) {
>>> ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
>>> if (ret < 0) {
>>> goto err;
>>> }
>>> + cpu->kvm_dirty_gfns = NULL;
>>> }
>>> kvm_park_vcpu(cpu);
>>> @@ -608,6 +615,31 @@ err:
>>> return ret;
>>> }
>>> +void kvm_close(void)
>>> +{
>>> + CPUState *cpu;
>>> +
>>> + if (!kvm_state || kvm_state->fd == -1) {
>>> + return;
>>> + }
>>> +
>>> + CPU_FOREACH(cpu) {
>>> + cpu_remove_sync(cpu);
>>> + close(cpu->kvm_fd);
>>> + cpu->kvm_fd = -1;
>>> + close(cpu->kvm_vcpu_stats_fd);
>>> + cpu->kvm_vcpu_stats_fd = -1;
>>> + }
>>> +
>>> + if (kvm_state && kvm_state->fd != -1) {
>>> + close(kvm_state->vmfd);
>>> + kvm_state->vmfd = -1;
>>> + close(kvm_state->fd);
>>> + kvm_state->fd = -1;
>>> + }
>>> + kvm_state = NULL;
>>> +}
>>> +
>>> /*
>>> * dirty pages logging control
>>> */
>>> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
>>> index ecfd763..97dacb3 100644
>>> --- a/accel/stubs/kvm-stub.c
>>> +++ b/accel/stubs/kvm-stub.c
>>> @@ -134,3 +134,8 @@ int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
>>> {
>>> return -ENOSYS;
>>> }
>>> +
>>> +void kvm_close(void)
>>> +{
>>> + return;
>>> +}
>>> diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
>>> index d0dbab1..af1db2f 100644
>>> --- a/hw/vfio/helpers.c
>>> +++ b/hw/vfio/helpers.c
>>> @@ -117,6 +117,16 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
>>> int vfio_kvm_device_fd = -1;
>>> #endif
>>> +void vfio_kvm_device_close(void)
>>> +{
>>> +#ifdef CONFIG_KVM
>>> + if (vfio_kvm_device_fd != -1) {
>>> + close(vfio_kvm_device_fd);
>>> + vfio_kvm_device_fd = -1;
>>> + }
>>> +#endif
>>> +}
>>> +
>>> int vfio_kvm_device_add_fd(int fd, Error **errp)
>>> {
>>> #ifdef CONFIG_KVM
>>> diff --git a/hw/vfio/vfio-stubs.c b/hw/vfio/vfio-stubs.c
>>> new file mode 100644
>>> index 0000000..a4c8b56
>>> --- /dev/null
>>> +++ b/hw/vfio/vfio-stubs.c
>>> @@ -0,0 +1,13 @@
>>> +/*
>>> + * Copyright (c) 2025 Oracle and/or its affiliates.
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "hw/vfio/vfio-device.h"
>>> +
>>> +void vfio_kvm_device_close(void)
>>> +{
>>> + return;
>>> +}
>
> Do we really need this stub, and the "include VFIO" headers in CPR as
> below? I thought it would be doable the other way round, that VFIO or KVM
> can register migration notifiers for CPR mode only. After all, the
> registration (migration_add_notifier_mode) is in misc.h so it should be
> available to QEMU all across.
OK. I have reworked the code to register the notifier from vfio and
eliminate the stub.
> Besides that, a high level question: what this patch does is trying to
> close early the relevant kvm/vfio fds that are used to attach to irq
> consumer / providers. > At the meantime, AFAICT, CPR as a whole feature when
> used against VFIO available, works only if VFIO can do whatever it wants
> (DMA, irq injections) during the whole process of CPR live upgrade,
> assuming that all the states are persisted in the fds. Then, if here we
> need to (a) unregister on src QEMU and (b) re-attach on dest QEMU, what
> happens if the irqs are generated exactly between (a) and (b)? Could they
> get lost?
The irq producer is not closed, but it is detached from the kvm consumer.
It's eventfd is preserved in new QEMU, and interrupts that arrive during
transition are pended there.
- Steve
>>> diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
>>> index e1f1403..396558f 100644
>>> --- a/migration/cpr-transfer.c
>>> +++ b/migration/cpr-transfer.c
>>> @@ -17,6 +17,24 @@
>>> #include "migration/vmstate.h"
>>> #include "trace.h"
>>> +static int cpr_transfer_notifier(NotifierWithReturn *notifier,
>>> + MigrationEvent *e,
>>> + Error **errp)
>>> +{
>>> + if (e->type == MIG_EVENT_PRECOPY_DONE) {
>>> + cpr_kvm_close();
>>> + }
>>> + return 0;
>>> +}
>>> +
>>> +void cpr_transfer_init(void)
>>> +{
>>> + static NotifierWithReturn notifier;
>>> +
>>> + migration_add_notifier_mode(¬ifier, cpr_transfer_notifier,
>>> + MIG_MODE_CPR_TRANSFER);
>>> +}
>>> +
>>> QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp)
>>> {
>>> MigrationAddress *addr = channel->addr;
>>> diff --git a/migration/cpr.c b/migration/cpr.c
>>> index a50a57e..49fb0a5 100644
>>> --- a/migration/cpr.c
>>> +++ b/migration/cpr.c
>>> @@ -7,12 +7,14 @@
>>> #include "qemu/osdep.h"
>>> #include "qapi/error.h"
>>> +#include "hw/vfio/vfio-device.h"
>
> [1]
>
>>> #include "migration/cpr.h"
>>> #include "migration/misc.h"
>>> #include "migration/options.h"
>>> #include "migration/qemu-file.h"
>>> #include "migration/savevm.h"
>>> #include "migration/vmstate.h"
>>> +#include "system/kvm.h"
>>> #include "system/runstate.h"
>>> #include "trace.h"
>>> @@ -264,3 +266,9 @@ bool cpr_incoming_needed(void *opaque)
>>> MigMode mode = migrate_mode();
>>> return mode == MIG_MODE_CPR_TRANSFER;
>>> }
>>> +
>>> +void cpr_kvm_close(void)
>>> +{
>>> + kvm_close();
>>> + vfio_kvm_device_close();
>>> +}
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 4098870..8f23cff 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -337,6 +337,7 @@ void migration_object_init(void)
>>> ram_mig_init();
>>> dirty_bitmap_mig_init();
>>> + cpr_transfer_init();
>>> /* Initialize cpu throttle timers */
>>> cpu_throttle_init();
>>> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
>>> index 73d29f9..98134a7 100644
>>> --- a/hw/vfio/meson.build
>>> +++ b/hw/vfio/meson.build
>>> @@ -17,6 +17,8 @@ vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c'))
>>> specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss)
>>> +system_ss.add(when: 'CONFIG_VFIO', if_false: files('vfio-stubs.c'))
>>> +
>>> system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c'))
>>> system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c'))
>>> system_ss.add(when: 'CONFIG_VFIO', if_true: files(
>
next prev parent reply other threads:[~2025-07-02 19:42 UTC|newest]
Thread overview: 101+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-10 15:39 [PATCH V5 00/38] Live update: vfio and iommufd Steve Sistare
2025-06-10 15:39 ` [PATCH V5 01/38] migration: cpr helpers Steve Sistare
2025-06-10 15:39 ` [PATCH V5 02/38] migration: lower handler priority Steve Sistare
2025-06-10 15:39 ` [PATCH V5 03/38] vfio/container: register container for cpr Steve Sistare
2025-06-10 15:39 ` [PATCH V5 04/38] vfio/container: preserve descriptors Steve Sistare
2025-06-23 9:07 ` Duan, Zhenzhong
2025-07-01 14:25 ` Steven Sistare
2025-07-02 14:23 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 05/38] vfio/container: discard old DMA vaddr Steve Sistare
2025-06-10 15:39 ` [PATCH V5 06/38] vfio/container: restore " Steve Sistare
2025-06-10 15:39 ` [PATCH V5 07/38] vfio/container: mdev cpr blocker Steve Sistare
2025-06-10 15:39 ` [PATCH V5 08/38] vfio/container: recover from unmap-all-vaddr failure Steve Sistare
2025-08-13 12:54 ` Cédric Le Goater
2025-08-13 14:18 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 09/38] pci: export msix_is_pending Steve Sistare
2025-06-10 15:39 ` [PATCH V5 10/38] pci: skip reset during cpr Steve Sistare
2025-06-10 15:39 ` [PATCH V5 11/38] vfio-pci: " Steve Sistare
2025-06-10 15:39 ` [PATCH V5 12/38] vfio/pci: vfio_pci_vector_init Steve Sistare
2025-06-10 15:39 ` [PATCH V5 13/38] vfio/pci: vfio_notifier_init Steve Sistare
2025-06-10 15:39 ` [PATCH V5 14/38] vfio/pci: pass vector to virq functions Steve Sistare
2025-06-10 15:39 ` [PATCH V5 15/38] vfio/pci: vfio_notifier_init cpr parameters Steve Sistare
2025-06-10 15:39 ` [PATCH V5 16/38] vfio/pci: vfio_notifier_cleanup Steve Sistare
2025-06-10 15:39 ` [PATCH V5 17/38] vfio/pci: export MSI functions Steve Sistare
2025-06-10 15:39 ` [PATCH V5 18/38] vfio-pci: preserve MSI Steve Sistare
2025-07-01 16:12 ` Steven Sistare
2025-07-02 7:17 ` Cédric Le Goater
2025-07-02 12:03 ` Steven Sistare
2025-07-02 15:35 ` Cédric Le Goater
2025-07-02 16:40 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 19/38] vfio-pci: preserve INTx Steve Sistare
2025-07-02 15:23 ` Cédric Le Goater
2025-07-02 17:54 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 20/38] migration: close kvm after cpr Steve Sistare
2025-07-01 15:25 ` Steven Sistare
2025-07-02 16:02 ` Peter Xu
2025-07-02 19:41 ` Steven Sistare [this message]
2025-07-03 19:45 ` Peter Xu
2025-07-03 21:21 ` Cédric Le Goater
2025-07-03 21:58 ` Peter Xu
2025-07-07 13:13 ` Steven Sistare
2025-07-01 17:49 ` Fabiano Rosas
2025-06-10 15:39 ` [PATCH V5 21/38] migration: cpr_get_fd_param helper Steve Sistare
2025-06-10 15:39 ` [PATCH V5 22/38] backends/iommufd: iommufd_backend_map_file_dma Steve Sistare
2025-06-10 15:39 ` [PATCH V5 23/38] backends/iommufd: change process ioctl Steve Sistare
2025-06-11 12:38 ` Cédric Le Goater
2025-06-23 8:20 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 24/38] physmem: qemu_ram_get_fd_offset Steve Sistare
2025-06-10 15:39 ` [PATCH V5 25/38] vfio/iommufd: use IOMMU_IOAS_MAP_FILE Steve Sistare
2025-06-10 15:39 ` [PATCH V5 26/38] vfio/iommufd: invariant device name Steve Sistare
2025-06-23 8:25 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 27/38] vfio/iommufd: add vfio_device_free_name Steve Sistare
2025-06-11 12:38 ` Cédric Le Goater
2025-06-23 8:27 ` Duan, Zhenzhong
2025-06-23 13:50 ` Eric Farman
2025-07-01 14:26 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 28/38] vfio/iommufd: device name blocker Steve Sistare
2025-06-23 10:29 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 29/38] vfio/iommufd: register container for cpr Steve Sistare
2025-07-01 14:25 ` Steven Sistare
2025-07-02 14:17 ` Duan, Zhenzhong
2025-07-02 14:52 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 30/38] migration: vfio cpr state hook Steve Sistare
2025-06-24 11:24 ` Duan, Zhenzhong
2025-07-01 14:26 ` Steven Sistare
2025-07-02 13:39 ` Duan, Zhenzhong
2025-07-02 15:07 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 31/38] vfio/iommufd: cpr state Steve Sistare
2025-06-23 10:45 ` Duan, Zhenzhong
2025-07-01 14:26 ` Steven Sistare
2025-07-02 13:44 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 32/38] vfio/iommufd: preserve descriptors Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-07-01 14:26 ` Steven Sistare
2025-07-02 14:08 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 33/38] vfio/iommufd: reconstruct device Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-07-01 14:26 ` Steven Sistare
2025-07-02 14:14 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 34/38] vfio/iommufd: reconstruct hwpt Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 35/38] vfio/iommufd: change process Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-07-01 14:26 ` Steven Sistare
2025-07-02 13:46 ` Duan, Zhenzhong
2025-07-02 20:57 ` Steven Sistare
2025-06-10 15:39 ` [PATCH V5 36/38] iommufd: preserve DMA mappings Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 37/38] vfio/container: delete old cpr register Steve Sistare
2025-06-25 11:40 ` Duan, Zhenzhong
2025-06-10 15:39 ` [PATCH V5 38/38] vfio: doc changes for cpr Steve Sistare
2025-07-02 14:03 ` Steven Sistare
2025-07-02 14:49 ` Cédric Le Goater
2025-07-02 17:52 ` Fabiano Rosas
2025-06-10 17:18 ` [PATCH V5 00/38] Live update: vfio and iommufd Cédric Le Goater
2025-06-10 17:39 ` Cédric Le Goater
2025-06-11 14:25 ` Cédric Le Goater
2025-06-11 14:39 ` Steven Sistare
2025-06-12 7:23 ` Cédric Le Goater
2025-06-19 12:03 ` Cédric Le Goater
2025-06-20 5:46 ` Duan, Zhenzhong
2025-06-11 14:49 ` Peter Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a0487a01-41de-4997-860c-bc555a295643@oracle.com \
--to=steven.sistare@oracle.com \
--cc=alex.williamson@redhat.com \
--cc=clg@redhat.com \
--cc=eric.auger@redhat.com \
--cc=farosas@suse.de \
--cc=marcel.apfelbaum@gmail.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=yi.l.liu@intel.com \
--cc=zhenzhong.duan@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).