* [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy()
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
@ 2026-06-09 7:57 ` Avihai Horon
2026-06-09 12:11 ` Philippe Mathieu-Daudé
2026-06-09 7:58 ` [PATCH v3 02/14] migration/ram: Use migration_bitmap_sync_precopy() for postcopy discard Avihai Horon
` (12 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:57 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
migration_completion_precopy() doesn't propagate errors to the migration
core, which leads to loss of error information. Fix that.
This prepares for a follow-up where migration_switchover_start() can
fail on switchover-ack and still report a useful error.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
migration/savevm.h | 2 +-
migration/migration.c | 13 ++++++++-----
migration/savevm.c | 29 +++++++++++++++++------------
3 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/migration/savevm.h b/migration/savevm.h
index 96fdf96d4e..b6bb4fa977 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -44,7 +44,7 @@ void qemu_savevm_state_header(QEMUFile *f);
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
-int qemu_savevm_state_complete_precopy(MigrationState *s);
+int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
diff --git a/migration/migration.c b/migration/migration.c
index 074d3f2c69..aad23f3228 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2814,7 +2814,7 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
return true;
}
-static int migration_completion_precopy(MigrationState *s)
+static int migration_completion_precopy(MigrationState *s, Error **errp)
{
int ret;
@@ -2823,16 +2823,17 @@ static int migration_completion_precopy(MigrationState *s)
if (!migrate_mode_is_cpr()) {
ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to stop the VM");
goto out_unlock;
}
}
- if (!migration_switchover_start(s, NULL)) {
+ if (!migration_switchover_start(s, errp)) {
ret = -EFAULT;
goto out_unlock;
}
- ret = qemu_savevm_state_complete_precopy(s);
+ ret = qemu_savevm_state_complete_precopy(s, errp);
out_unlock:
bql_unlock();
return ret;
@@ -2869,7 +2870,7 @@ static void migration_completion(MigrationState *s)
Error *local_err = NULL;
if (s->state == MIGRATION_STATUS_ACTIVE) {
- ret = migration_completion_precopy(s);
+ ret = migration_completion_precopy(s, &local_err);
} else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
migration_completion_postcopy(s);
} else {
@@ -2900,7 +2901,9 @@ static void migration_completion(MigrationState *s)
return;
fail:
- if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
+ if (local_err) {
+ migrate_error_propagate(s, local_err);
+ } else if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
migrate_error_propagate(s, local_err);
} else if (ret) {
error_setg_errno(&local_err, -ret, "Error in migration completion");
diff --git a/migration/savevm.c b/migration/savevm.c
index 23adaf9dd9..9d1d58c8f4 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1771,28 +1771,34 @@ int qemu_savevm_state_non_iterable(QEMUFile *f, Error **errp)
return 0;
}
-int qemu_savevm_state_complete_precopy(MigrationState *s)
+int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
{
+ ERRP_GUARD();
QEMUFile *f = s->to_dst_file;
- Error *local_err = NULL;
int ret;
ret = qemu_savevm_state_complete_precopy_iterable(f, false);
if (ret) {
+ qemu_file_get_error_obj(f, errp);
+ error_prepend(errp, "Failed to save iterable device state: ");
return ret;
}
- /* TODO: pass error upper */
- ret = qemu_savevm_state_non_iterable(f, &local_err);
+ ret = qemu_savevm_state_non_iterable(f, errp);
if (ret) {
- migrate_error_propagate(s, error_copy(local_err));
- error_report_err(local_err);
return ret;
}
qemu_savevm_state_end_precopy(s, f);
- return qemu_fflush(f);
+ ret = qemu_fflush(f);
+ if (ret) {
+ qemu_file_get_error_obj(f, errp);
+ error_prepend(errp, "%s: Failed to flush QEMUFile", __func__);
+ return ret;
+ }
+
+ return 0;
}
void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
@@ -1874,13 +1880,12 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
}
ret = qemu_file_get_error(f);
- if (ret == 0) {
- qemu_savevm_state_complete_precopy(ms);
- ret = qemu_file_get_error(f);
- }
- if (ret != 0) {
+ if (ret) {
error_setg_errno(errp, -ret, "Error while writing VM state");
+ goto cleanup;
}
+
+ ret = qemu_savevm_state_complete_precopy(ms, errp);
cleanup:
qemu_savevm_state_cleanup();
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy()
2026-06-09 7:57 ` [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy() Avihai Horon
@ 2026-06-09 12:11 ` Philippe Mathieu-Daudé
2026-06-12 16:03 ` Peter Xu
0 siblings, 1 reply; 27+ messages in thread
From: Philippe Mathieu-Daudé @ 2026-06-09 12:11 UTC (permalink / raw)
To: Avihai Horon, qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Zhao Liu, Halil Pasic, Christian Borntraeger,
Jason Herne, Richard Henderson, Ilya Leoshkevich,
David Hildenbrand, Eric Farman, Matthew Rosato, Cornelia Huck,
Eric Blake, Vladimir Sementsov-Ogievskiy, John Snow,
Markus Armbruster, Maor Gottlieb
On 9/6/26 09:57, Avihai Horon wrote:
> migration_completion_precopy() doesn't propagate errors to migration
> core which leads to error information loss. Fix that.
>
> This prepares for a follow-up where migration_switchover_start() can
> fail on switchover-ack and still report a useful error.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
> migration/savevm.h | 2 +-
> migration/migration.c | 13 ++++++++-----
> migration/savevm.c | 29 +++++++++++++++++------------
> 3 files changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 96fdf96d4e..b6bb4fa977 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -44,7 +44,7 @@ void qemu_savevm_state_header(QEMUFile *f);
> int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
> void qemu_savevm_state_cleanup(void);
> void qemu_savevm_state_complete_postcopy(QEMUFile *f);
> -int qemu_savevm_state_complete_precopy(MigrationState *s);
> +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
> void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
> int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
> bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
> diff --git a/migration/migration.c b/migration/migration.c
> index 074d3f2c69..aad23f3228 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2814,7 +2814,7 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
> return true;
> }
>
> -static int migration_completion_precopy(MigrationState *s)
> +static int migration_completion_precopy(MigrationState *s, Error **errp)
> {
> int ret;
>
> @@ -2823,16 +2823,17 @@ static int migration_completion_precopy(MigrationState *s)
> if (!migrate_mode_is_cpr()) {
> ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> if (ret < 0) {
> + error_setg_errno(errp, -ret, "Failed to stop the VM");
> goto out_unlock;
> }
> }
>
> - if (!migration_switchover_start(s, NULL)) {
> + if (!migration_switchover_start(s, errp)) {
> ret = -EFAULT;
This function should now return a boolean IMHO.
> goto out_unlock;
> }
>
> - ret = qemu_savevm_state_complete_precopy(s);
> + ret = qemu_savevm_state_complete_precopy(s, errp);
> out_unlock:
> bql_unlock();
> return ret;
> @@ -2869,7 +2870,7 @@ static void migration_completion(MigrationState *s)
> Error *local_err = NULL;
>
> if (s->state == MIGRATION_STATUS_ACTIVE) {
> - ret = migration_completion_precopy(s);
> + ret = migration_completion_precopy(s, &local_err);
> } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> migration_completion_postcopy(s);
> } else {
> @@ -2900,7 +2901,9 @@ static void migration_completion(MigrationState *s)
> return;
>
> fail:
> - if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
> + if (local_err) {
> + migrate_error_propagate(s, local_err);
> + } else if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
if (local_err
|| qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
> migrate_error_propagate(s, local_err);
> } else if (ret) {
> error_setg_errno(&local_err, -ret, "Error in migration completion");
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 23adaf9dd9..9d1d58c8f4 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1771,28 +1771,34 @@ int qemu_savevm_state_non_iterable(QEMUFile *f, Error **errp)
> return 0;
> }
>
> -int qemu_savevm_state_complete_precopy(MigrationState *s)
> +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
> {
> + ERRP_GUARD();
> QEMUFile *f = s->to_dst_file;
> - Error *local_err = NULL;
> int ret;
>
> ret = qemu_savevm_state_complete_precopy_iterable(f, false);
> if (ret) {
> + qemu_file_get_error_obj(f, errp);
> + error_prepend(errp, "Failed to save iterable device state: ");
> return ret;
> }
>
> - /* TODO: pass error upper */
> - ret = qemu_savevm_state_non_iterable(f, &local_err);
> + ret = qemu_savevm_state_non_iterable(f, errp);
> if (ret) {
> - migrate_error_propagate(s, error_copy(local_err));
> - error_report_err(local_err);
> return ret;
> }
>
> qemu_savevm_state_end_precopy(s, f);
>
> - return qemu_fflush(f);
> + ret = qemu_fflush(f);
> + if (ret) {
> + qemu_file_get_error_obj(f, errp);
> + error_prepend(errp, "%s: Failed to flush QEMUFile", __func__);
Ditto, directly return boolean.
Anyway can be done on top, so:
Reviewed-by: Philippe Mathieu-Daudé <philmd@oss.qualcomm.com>
> + return ret;
> + }
> +
> + return 0;
> }
>
> void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
> @@ -1874,13 +1880,12 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
> }
>
> ret = qemu_file_get_error(f);
> - if (ret == 0) {
> - qemu_savevm_state_complete_precopy(ms);
> - ret = qemu_file_get_error(f);
> - }
> - if (ret != 0) {
> + if (ret) {
> error_setg_errno(errp, -ret, "Error while writing VM state");
> + goto cleanup;
> }
> +
> + ret = qemu_savevm_state_complete_precopy(ms, errp);
> cleanup:
> qemu_savevm_state_cleanup();
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy()
2026-06-09 12:11 ` Philippe Mathieu-Daudé
@ 2026-06-12 16:03 ` Peter Xu
2026-06-14 9:34 ` Avihai Horon
0 siblings, 1 reply; 27+ messages in thread
From: Peter Xu @ 2026-06-12 16:03 UTC (permalink / raw)
To: Philippe Mathieu-Daudé
Cc: Avihai Horon, qemu-devel, Alex Williamson, Cédric Le Goater,
Fabiano Rosas, Pierrick Bouvier, Zhao Liu, Halil Pasic,
Christian Borntraeger, Jason Herne, Richard Henderson,
Ilya Leoshkevich, David Hildenbrand, Eric Farman, Matthew Rosato,
Cornelia Huck, Eric Blake, Vladimir Sementsov-Ogievskiy,
John Snow, Markus Armbruster, Maor Gottlieb
On Tue, Jun 09, 2026 at 02:11:00PM +0200, Philippe Mathieu-Daudé wrote:
> On 9/6/26 09:57, Avihai Horon wrote:
> > migration_completion_precopy() doesn't propagate errors to migration
> > core which leads to error information loss. Fix that.
> >
> > This prepares for a follow-up where migration_switchover_start() can
> > fail on switchover-ack and still report a useful error.
> >
> > Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> > ---
> > migration/savevm.h | 2 +-
> > migration/migration.c | 13 ++++++++-----
> > migration/savevm.c | 29 +++++++++++++++++------------
> > 3 files changed, 26 insertions(+), 18 deletions(-)
> >
> > diff --git a/migration/savevm.h b/migration/savevm.h
> > index 96fdf96d4e..b6bb4fa977 100644
> > --- a/migration/savevm.h
> > +++ b/migration/savevm.h
> > @@ -44,7 +44,7 @@ void qemu_savevm_state_header(QEMUFile *f);
> > int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
> > void qemu_savevm_state_cleanup(void);
> > void qemu_savevm_state_complete_postcopy(QEMUFile *f);
> > -int qemu_savevm_state_complete_precopy(MigrationState *s);
> > +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
> > void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
> > int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
> > bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 074d3f2c69..aad23f3228 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2814,7 +2814,7 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
> > return true;
> > }
> > -static int migration_completion_precopy(MigrationState *s)
> > +static int migration_completion_precopy(MigrationState *s, Error **errp)
> > {
> > int ret;
> > @@ -2823,16 +2823,17 @@ static int migration_completion_precopy(MigrationState *s)
> > if (!migrate_mode_is_cpr()) {
> > ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
> > if (ret < 0) {
> > + error_setg_errno(errp, -ret, "Failed to stop the VM");
> > goto out_unlock;
> > }
> > }
> > - if (!migration_switchover_start(s, NULL)) {
> > + if (!migration_switchover_start(s, errp)) {
> > ret = -EFAULT;
>
> This function should now returns a boolean IMHO.
>
> > goto out_unlock;
> > }
> > - ret = qemu_savevm_state_complete_precopy(s);
> > + ret = qemu_savevm_state_complete_precopy(s, errp);
> > out_unlock:
> > bql_unlock();
> > return ret;
> > @@ -2869,7 +2870,7 @@ static void migration_completion(MigrationState *s)
> > Error *local_err = NULL;
> > if (s->state == MIGRATION_STATUS_ACTIVE) {
> > - ret = migration_completion_precopy(s);
> > + ret = migration_completion_precopy(s, &local_err);
> > } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> > migration_completion_postcopy(s);
> > } else {
> > @@ -2900,7 +2901,9 @@ static void migration_completion(MigrationState *s)
> > return;
> > fail:
> > - if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
> > + if (local_err) {
> > + migrate_error_propagate(s, local_err);
> > + } else if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
>
> if (local_err
> || qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
>
> > migrate_error_propagate(s, local_err);
> > } else if (ret) {
> > error_setg_errno(&local_err, -ret, "Error in migration completion");
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 23adaf9dd9..9d1d58c8f4 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1771,28 +1771,34 @@ int qemu_savevm_state_non_iterable(QEMUFile *f, Error **errp)
> > return 0;
> > }
> > -int qemu_savevm_state_complete_precopy(MigrationState *s)
> > +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
> > {
> > + ERRP_GUARD();
> > QEMUFile *f = s->to_dst_file;
> > - Error *local_err = NULL;
> > int ret;
> > ret = qemu_savevm_state_complete_precopy_iterable(f, false);
> > if (ret) {
> > + qemu_file_get_error_obj(f, errp);
> > + error_prepend(errp, "Failed to save iterable device state: ");
> > return ret;
> > }
> > - /* TODO: pass error upper */
> > - ret = qemu_savevm_state_non_iterable(f, &local_err);
> > + ret = qemu_savevm_state_non_iterable(f, errp);
> > if (ret) {
> > - migrate_error_propagate(s, error_copy(local_err));
> > - error_report_err(local_err);
> > return ret;
> > }
> > qemu_savevm_state_end_precopy(s, f);
> > - return qemu_fflush(f);
> > + ret = qemu_fflush(f);
> > + if (ret) {
> > + qemu_file_get_error_obj(f, errp);
> > + error_prepend(errp, "%s: Failed to flush QEMUFile", __func__);
>
> Ditto, directly return boolean.
>
> Anyway can be done on top, so:
Agreed, I didn't mention this because I know changing the retval also
requires updating the callers. Can be done on top.
> Reviewed-by: Philippe Mathieu-Daudé <philmd@oss.qualcomm.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy()
2026-06-12 16:03 ` Peter Xu
@ 2026-06-14 9:34 ` Avihai Horon
0 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-14 9:34 UTC (permalink / raw)
To: Peter Xu, Philippe Mathieu-Daudé
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Zhao Liu, Halil Pasic, Christian Borntraeger,
Jason Herne, Richard Henderson, Ilya Leoshkevich,
David Hildenbrand, Eric Farman, Matthew Rosato, Cornelia Huck,
Eric Blake, Vladimir Sementsov-Ogievskiy, John Snow,
Markus Armbruster, Maor Gottlieb
On 6/12/2026 7:03 PM, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Tue, Jun 09, 2026 at 02:11:00PM +0200, Philippe Mathieu-Daudé wrote:
>> On 9/6/26 09:57, Avihai Horon wrote:
>>> migration_completion_precopy() doesn't propagate errors to migration
>>> core which leads to error information loss. Fix that.
>>>
>>> This prepares for a follow-up where migration_switchover_start() can
>>> fail on switchover-ack and still report a useful error.
>>>
>>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>>> ---
>>> migration/savevm.h | 2 +-
>>> migration/migration.c | 13 ++++++++-----
>>> migration/savevm.c | 29 +++++++++++++++++------------
>>> 3 files changed, 26 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/migration/savevm.h b/migration/savevm.h
>>> index 96fdf96d4e..b6bb4fa977 100644
>>> --- a/migration/savevm.h
>>> +++ b/migration/savevm.h
>>> @@ -44,7 +44,7 @@ void qemu_savevm_state_header(QEMUFile *f);
>>> int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
>>> void qemu_savevm_state_cleanup(void);
>>> void qemu_savevm_state_complete_postcopy(QEMUFile *f);
>>> -int qemu_savevm_state_complete_precopy(MigrationState *s);
>>> +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
>>> void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
>>> int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
>>> bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 074d3f2c69..aad23f3228 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -2814,7 +2814,7 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
>>> return true;
>>> }
>>> -static int migration_completion_precopy(MigrationState *s)
>>> +static int migration_completion_precopy(MigrationState *s, Error **errp)
>>> {
>>> int ret;
>>> @@ -2823,16 +2823,17 @@ static int migration_completion_precopy(MigrationState *s)
>>> if (!migrate_mode_is_cpr()) {
>>> ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
>>> if (ret < 0) {
>>> + error_setg_errno(errp, -ret, "Failed to stop the VM");
>>> goto out_unlock;
>>> }
>>> }
>>> - if (!migration_switchover_start(s, NULL)) {
>>> + if (!migration_switchover_start(s, errp)) {
>>> ret = -EFAULT;
>> This function should now returns a boolean IMHO.
>>
>>> goto out_unlock;
>>> }
>>> - ret = qemu_savevm_state_complete_precopy(s);
>>> + ret = qemu_savevm_state_complete_precopy(s, errp);
>>> out_unlock:
>>> bql_unlock();
>>> return ret;
>>> @@ -2869,7 +2870,7 @@ static void migration_completion(MigrationState *s)
>>> Error *local_err = NULL;
>>> if (s->state == MIGRATION_STATUS_ACTIVE) {
>>> - ret = migration_completion_precopy(s);
>>> + ret = migration_completion_precopy(s, &local_err);
>>> } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
>>> migration_completion_postcopy(s);
>>> } else {
>>> @@ -2900,7 +2901,9 @@ static void migration_completion(MigrationState *s)
>>> return;
>>> fail:
>>> - if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
>>> + if (local_err) {
>>> + migrate_error_propagate(s, local_err);
>>> + } else if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
>> if (local_err
>> || qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
>>
>>> migrate_error_propagate(s, local_err);
>>> } else if (ret) {
>>> error_setg_errno(&local_err, -ret, "Error in migration completion");
>>> diff --git a/migration/savevm.c b/migration/savevm.c
>>> index 23adaf9dd9..9d1d58c8f4 100644
>>> --- a/migration/savevm.c
>>> +++ b/migration/savevm.c
>>> @@ -1771,28 +1771,34 @@ int qemu_savevm_state_non_iterable(QEMUFile *f, Error **errp)
>>> return 0;
>>> }
>>> -int qemu_savevm_state_complete_precopy(MigrationState *s)
>>> +int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
>>> {
>>> + ERRP_GUARD();
>>> QEMUFile *f = s->to_dst_file;
>>> - Error *local_err = NULL;
>>> int ret;
>>> ret = qemu_savevm_state_complete_precopy_iterable(f, false);
>>> if (ret) {
>>> + qemu_file_get_error_obj(f, errp);
>>> + error_prepend(errp, "Failed to save iterable device state: ");
>>> return ret;
>>> }
>>> - /* TODO: pass error upper */
>>> - ret = qemu_savevm_state_non_iterable(f, &local_err);
>>> + ret = qemu_savevm_state_non_iterable(f, errp);
>>> if (ret) {
>>> - migrate_error_propagate(s, error_copy(local_err));
>>> - error_report_err(local_err);
>>> return ret;
>>> }
>>> qemu_savevm_state_end_precopy(s, f);
>>> - return qemu_fflush(f);
>>> + ret = qemu_fflush(f);
>>> + if (ret) {
>>> + qemu_file_get_error_obj(f, errp);
>>> + error_prepend(errp, "%s: Failed to flush QEMUFile", __func__);
>> Ditto, directly return boolean.
>>
>> Anyway can be done on top, so:
> Agreed, I didn't mention this because I know touching the retval needs
> further touch callers. Can be done on top.
Sure, I can send a patch for that later.
Thanks.
>
>> Reviewed-by: Philippe Mathieu-Daudé <philmd@oss.qualcomm.com>
> Reviewed-by: Peter Xu <peterx@redhat.com>
>
> --
> Peter Xu
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 02/14] migration/ram: Use migration_bitmap_sync_precopy() for postcopy discard
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
2026-06-09 7:57 ` [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy() Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-12 15:59 ` Peter Xu
2026-06-09 7:58 ` [PATCH v3 03/14] migration: Run final save_query_pending at switchover Avihai Horon
` (11 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon, Gavin Shan, Wei Wang
ram_postcopy_send_discard_bitmap() performs the final RAM dirty bitmap
sync at postcopy switchover, before sending discard bitmap to the
destination. Unlike the precopy switchover path, it currently calls the
raw migration_bitmap_sync() helper and passes last_stage=false.
Postcopy switchover is also a stopped-VM final sync point, so use
migration_bitmap_sync_precopy(true). This keeps RAM final bitmap sync
handling consistent across precopy and postcopy switchover, including
the precopy bitmap-sync notifier wrapper. Current notifier users are
safe in postcopy: virtio-balloon free-page hinting already opts out when
postcopy-ram is enabled [1].
This prepares for moving the final sync out of RAM completion/discard
paths and into migration_switchover_start(), where the migration core
can run a final save_query_pending pass for all modules uniformly.
[1] fd51e54fa102 ("virtio-balloon: don't start free page hinting if postcopy is possible")
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
migration/ram.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/migration/ram.c b/migration/ram.c
index fc38ffbf8a..d7b8cc61ea 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2684,7 +2684,7 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms)
RCU_READ_LOCK_GUARD();
/* This should be our last sync, the src is now paused */
- migration_bitmap_sync(rs, false);
+ migration_bitmap_sync_precopy(true);
/* Easiest way to make sure we don't resume in the middle of a host-page */
rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH v3 02/14] migration/ram: Use migration_bitmap_sync_precopy() for postcopy discard
2026-06-09 7:58 ` [PATCH v3 02/14] migration/ram: Use migration_bitmap_sync_precopy() for postcopy discard Avihai Horon
@ 2026-06-12 15:59 ` Peter Xu
0 siblings, 0 replies; 27+ messages in thread
From: Peter Xu @ 2026-06-12 15:59 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Gavin Shan, Wei Wang
On Tue, Jun 09, 2026 at 10:58:00AM +0300, Avihai Horon wrote:
> ram_postcopy_send_discard_bitmap() performs the final RAM dirty bitmap
> sync at postcopy switchover, before sending discard bitmap to the
> destination. Unlike the precopy switchover path, it currently calls the
> raw migration_bitmap_sync() helper and passes last_stage=false.
>
> Postcopy switchover is also a stopped-VM final sync point, so use
> migration_bitmap_sync_precopy(true). This keeps RAM final bitmap sync
> handling consistent across precopy and postcopy switchover, including
> the precopy bitmap-sync notifier wrapper. Current notifier users are
> safe in postcopy: virtio-balloon free-page hinting already opts out when
> postcopy-ram is enabled [1].
>
> This prepares for moving the final sync out of RAM completion/discard
> paths and into migration_switchover_start(), where the migration core
> can run a final save_query_pending pass for all modules uniformly.
>
> [1] fd51e54fa102 ("virtio-balloon: don't start free page hinting if postcopy is possible")
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 03/14] migration: Run final save_query_pending at switchover
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
2026-06-09 7:57 ` [PATCH v3 01/14] migration: Propagate errors in migration_completion_precopy() Avihai Horon
2026-06-09 7:58 ` [PATCH v3 02/14] migration/ram: Use migration_bitmap_sync_precopy() for postcopy discard Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-12 16:03 ` Peter Xu
2026-06-09 7:58 ` [PATCH v3 04/14] migration: Log the approver in qemu_loadvm_approve_switchover() Avihai Horon
` (10 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Before switchover, the source needs one last exact pending query so
modules can flush dirty state. This is currently done ad hoc in module
handlers. For example, RAM syncs its dirty bitmap in its save_complete
handler.
This should be a general concept relevant for any module, so extract it
to migration core instead by running a final save_query_pending before
switchover.
The final query requires special handling by modules (e.g., it's called
with BQL locked, during VM stop), so extend the save_query_pending
SaveVMHandlers callback and qemu_savevm_query_pending() with a "final"
flag so migration modules can tell the last pending query during
switchover from periodic iteration queries.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
include/migration/register.h | 41 ++++++++++++++++++----------------
migration/savevm.h | 3 ++-
hw/s390x/s390-stattrib.c | 2 +-
hw/vfio/migration.c | 11 ++++++---
migration/block-dirty-bitmap.c | 11 ++++++---
migration/migration.c | 14 ++++++++++--
migration/ram.c | 40 +++++++++++++++++++--------------
migration/savevm.c | 20 +++++++++++++----
hw/vfio/trace-events | 2 +-
migration/trace-events | 2 +-
10 files changed, 94 insertions(+), 52 deletions(-)
diff --git a/include/migration/register.h b/include/migration/register.h
index 5e5e0ee432..6f632123f1 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -171,6 +171,28 @@ typedef struct SaveVMHandlers {
*/
bool (*is_active_iterate)(void *opaque);
+ /**
+ * @save_query_pending
+ *
+ * This estimates the remaining data to transfer on the source side.
+ *
+ * When @exact is true, a module must report accurate results. When
+ * @exact is false, a module may report estimates.
+ *
+ * It's highly recommended that modules implement a faster version of
+ * the query path (for example, by proper caching on the counters) if
+ * an accurate query will be time-consuming.
+ *
+ * @opaque: data pointer passed to register_savevm_live()
+ * @pending: pointer to a MigPendingData struct
+ * @exact: set to true for an accurate (slow) query
+ * @final: set to true for the final query during switchover. When final is
+ * true, the query is called with BQL locked. Otherwise, it's called with
+ * BQL unlocked.
+ */
+ void (*save_query_pending)(void *opaque, MigPendingData *pending,
+ bool exact, bool final);
+
/* This runs outside the BQL in the migration case, and
* within the lock in the savevm case. The callback had better only
* use data that is local to the migration thread or protected
@@ -210,25 +232,6 @@ typedef struct SaveVMHandlers {
*/
bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp);
- /**
- * @save_query_pending
- *
- * This estimates the remaining data to transfer on the source side.
- *
- * When @exact is true, a module must report accurate results. When
- * @exact is false, a module may report estimates.
- *
- * It's highly recommended that modules implement a faster version of
- * the query path (for example, by proper caching on the counters) if
- * an accurate query will be time-consuming.
- *
- * @opaque: data pointer passed to register_savevm_live()
- * @pending: pointer to a MigPendingData struct
- * @exact: set to true for an accurate (slow) query
- */
- void (*save_query_pending)(void *opaque, MigPendingData *pending,
- bool exact);
-
/**
* @load_state
*
diff --git a/migration/savevm.h b/migration/savevm.h
index b6bb4fa977..81abd96dda 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -45,7 +45,8 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
-void qemu_savevm_query_pending(MigPendingData *pending, bool exact);
+void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact);
+void qemu_savevm_query_pending_final(MigPendingData *pending);
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
void qemu_savevm_state_end(QEMUFile *f);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index c334714b31..b0f04eb30c 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -190,7 +190,7 @@ static int cmma_save_setup(QEMUFile *f, void *opaque, Error **errp)
}
static void cmma_state_pending(void *opaque, MigPendingData *pending,
- bool exact)
+ bool exact, bool final)
{
S390StAttribState *sas = S390_STATTRIB(opaque);
S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index fb12b9717f..195498845e 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -622,13 +622,18 @@ static void vfio_state_pending_sync(VFIODevice *vbasedev)
}
static void vfio_state_pending(void *opaque, MigPendingData *pending,
- bool exact)
+ bool exact, bool final)
{
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
uint64_t precopy_size, stopcopy_size;
- if (exact) {
+ /*
+ * The final pending query runs during switchover downtime. VFIO does not
+ * need a fresh device pending-data query then to get the latest dirty
+ * data, so avoid the extra work and report the cached counters below.
+ */
+ if (exact && !final) {
vfio_state_pending_sync(vbasedev);
}
@@ -646,7 +651,7 @@ static void vfio_state_pending(void *opaque, MigPendingData *pending,
trace_vfio_state_pending(vbasedev->name, migration->stopcopy_size,
migration->precopy_init_size,
- migration->precopy_dirty_size, exact);
+ migration->precopy_dirty_size, exact, final);
}
static bool vfio_is_active_iterate(void *opaque)
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 7ef3759e53..cba54e25cd 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -767,13 +767,16 @@ static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
}
static void dirty_bitmap_state_pending(void *opaque, MigPendingData *data,
- bool exact)
+ bool exact, bool final)
{
DBMSaveState *s = &((DBMState *)opaque)->save;
SaveBitmapState *dbms;
uint64_t pending = 0;
- bql_lock();
+ /* Final pending query is called with BQL locked */
+ if (!final) {
+ bql_lock();
+ }
QSIMPLEQ_FOREACH(dbms, &s->dbms_list, entry) {
uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
@@ -783,7 +786,9 @@ static void dirty_bitmap_state_pending(void *opaque, MigPendingData *data,
pending += DIV_ROUND_UP(sectors * BDRV_SECTOR_SIZE, gran);
}
- bql_unlock();
+ if (!final) {
+ bql_unlock();
+ }
trace_dirty_bitmap_state_pending(pending);
diff --git a/migration/migration.c b/migration/migration.c
index aad23f3228..929a6c432c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2787,12 +2787,22 @@ static bool migration_switchover_prepare(MigrationState *s)
static bool migration_switchover_start(MigrationState *s, Error **errp)
{
ERRP_GUARD();
+ MigPendingData pending = {};
if (!migration_switchover_prepare(s)) {
error_setg(errp, "Switchover is interrupted");
return false;
}
+ /*
+ * The final query to the whole system on dirty data to make sure we
+ * collect the latest status of the VM. For precopy, source QEMU will
+ * dump all the dirty data during switchover. For postcopy, this will
+ * properly update all the dirty bitmaps to finally generate the
+ * correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
+ */
+ qemu_savevm_query_pending_final(&pending);
+
/* Inactivate disks except in COLO */
if (!migrate_colo()) {
/*
@@ -3285,7 +3295,7 @@ static void migration_iteration_go_next(MigPendingData *pending)
/*
* Do a slow sync first before boosting the iteration count.
*/
- qemu_savevm_query_pending(pending, true);
+ qemu_savevm_query_pending_iter(pending, true);
/*
* Update the dirty information for the whole system for this
@@ -3336,7 +3346,7 @@ static MigIterateState migration_iteration_run(MigrationState *s)
bool complete_ready;
/* Fast path - get the estimated amount of pending data */
- qemu_savevm_query_pending(&pending, false);
+ qemu_savevm_query_pending_iter(&pending, false);
if (in_postcopy) {
/*
diff --git a/migration/ram.c b/migration/ram.c
index d7b8cc61ea..079a9b9275 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2683,9 +2683,6 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms)
RCU_READ_LOCK_GUARD();
- /* This should be our last sync, the src is now paused */
- migration_bitmap_sync_precopy(true);
-
/* Easiest way to make sure we don't resume in the middle of a host-page */
rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
rs->last_seen_block = NULL;
@@ -3376,10 +3373,6 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
rs->last_stage = !migration_in_colo_state();
WITH_RCU_READ_LOCK_GUARD() {
- if (!migration_in_postcopy()) {
- migration_bitmap_sync_precopy(true);
- }
-
ret = rdma_registration_start(f, RAM_CONTROL_FINISH);
if (ret < 0) {
qemu_file_set_error(f, ret);
@@ -3442,25 +3435,38 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return qemu_fflush(f);
}
-static void ram_state_pending(void *opaque, MigPendingData *pending,
- bool exact)
+static void ram_state_pending_sync(bool exact, bool final)
{
- RAMState **temp = opaque;
- RAMState *rs = *temp;
- uint64_t remaining_size;
-
/*
* Sync is not needed either with: (1) a fast query, or (2) after
* postcopy has started (no new dirty will generate anymore).
*/
- if (exact && !migration_in_postcopy()) {
+ if (!exact || migration_in_postcopy()) {
+ return;
+ }
+
+ /* Final pending query is called with BQL locked */
+ if (!final) {
bql_lock();
- WITH_RCU_READ_LOCK_GUARD() {
- migration_bitmap_sync_precopy(false);
- }
+ }
+
+ WITH_RCU_READ_LOCK_GUARD() {
+ migration_bitmap_sync_precopy(final);
+ }
+
+ if (!final) {
bql_unlock();
}
+}
+
+static void ram_state_pending(void *opaque, MigPendingData *pending,
+ bool exact, bool final)
+{
+ RAMState **temp = opaque;
+ RAMState *rs = *temp;
+ uint64_t remaining_size;
+ ram_state_pending_sync(exact, final);
remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
if (migrate_postcopy_ram()) {
diff --git a/migration/savevm.c b/migration/savevm.c
index 9d1d58c8f4..ed62defadc 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1801,7 +1801,8 @@ int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
return 0;
}
-void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
+static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
+ bool final)
{
SaveStateEntry *se;
@@ -1814,7 +1815,7 @@ void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
if (!qemu_savevm_state_active(se)) {
continue;
}
- se->ops->save_query_pending(se->opaque, pending, exact);
+ se->ops->save_query_pending(se->opaque, pending, exact, final);
}
pending->total_bytes = pending->precopy_bytes +
@@ -1826,13 +1827,24 @@ void qemu_savevm_query_pending(MigPendingData *pending, bool exact)
* close to reality when this got invoked frequently while iterating.
*/
mig_stats.dirty_bytes_total = pending->total_bytes;
-
- trace_qemu_savevm_query_pending(exact, pending->precopy_bytes,
+ trace_qemu_savevm_query_pending(exact, final, pending->precopy_bytes,
pending->stopcopy_bytes,
pending->postcopy_bytes,
pending->total_bytes);
}
+void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact)
+{
+ qemu_savevm_query_pending(pending, exact, false);
+}
+
+void qemu_savevm_query_pending_final(MigPendingData *pending)
+{
+ g_assert(bql_locked());
+
+ qemu_savevm_query_pending(pending, true, true);
+}
+
void qemu_savevm_state_cleanup(void)
{
SaveStateEntry *se;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 2049159015..8f57d0b7d8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -175,7 +175,7 @@ vfio_save_device_config_state(const char *name) " (%s)"
vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64
vfio_save_iterate_start(const char *name) " (%s)"
vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size %"PRIu64
-vfio_state_pending(const char *name, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size, bool exact) " (%s) stopcopy size %"PRIu64" precopy initial size %"PRIu64" precopy dirty size %"PRIu64 " exact %d"
+vfio_state_pending(const char *name, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size, bool exact, bool final) " (%s) stopcopy size %"PRIu64", precopy initial size %"PRIu64", precopy dirty size %"PRIu64", exact %d, final %d"
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
diff --git a/migration/trace-events b/migration/trace-events
index de99d976ab..1c9212d3e2 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -7,7 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
qemu_savevm_send_packaged(void) ""
-qemu_savevm_query_pending(bool exact, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
+qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, final=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH v3 03/14] migration: Run final save_query_pending at switchover
2026-06-09 7:58 ` [PATCH v3 03/14] migration: Run final save_query_pending at switchover Avihai Horon
@ 2026-06-12 16:03 ` Peter Xu
0 siblings, 0 replies; 27+ messages in thread
From: Peter Xu @ 2026-06-12 16:03 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On Tue, Jun 09, 2026 at 10:58:01AM +0300, Avihai Horon wrote:
> Before switchover, the source needs one last exact pending query so
> modules can flush dirty state. This is currently done ad hoc in modules
> handlers. For example, RAM syncs its dirty bitmap in its save_complete
> handler.
>
> This should be a general concept relevant for any module, so extract it
> to migration core instead by running a final save_query_pending before
> switchover.
>
> The final query requires special handling by modules (e.g., it's called
> with BQL locked, during VM stop), so extend save_query_pending
> SaveVMHandlers callback and qemu_savevm_query_pending() with a "final"
> flag so migration modules can tell the last pending query during
> switchover from periodic iteration queries.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 04/14] migration: Log the approver in qemu_loadvm_approve_switchover()
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (2 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 03/14] migration: Run final save_query_pending at switchover Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 05/14] migration: Replace switchover_ack_needed SaveVMHandler Avihai Horon
` (9 subsequent siblings)
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Pass the device name that approved switchover to
qemu_loadvm_approve_switchover() and log it in the trace for debugging
purposes.
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
migration/savevm.h | 2 +-
hw/vfio/migration.c | 2 +-
migration/savevm.c | 4 ++--
migration/trace-events | 2 +-
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/migration/savevm.h b/migration/savevm.h
index 81abd96dda..44424be347 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -71,7 +71,7 @@ void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis,
Error **errp);
int qemu_load_device_state(QEMUFile *f, Error **errp);
-int qemu_loadvm_approve_switchover(void);
+int qemu_loadvm_approve_switchover(const char *approver);
int qemu_savevm_state_non_iterable(QEMUFile *f, Error **errp);
int qemu_savevm_state_non_iterable_early(QEMUFile *f,
JSONWriter *vmdesc,
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 195498845e..180b316bae 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -847,7 +847,7 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
return -EINVAL;
}
- ret = qemu_loadvm_approve_switchover();
+ ret = qemu_loadvm_approve_switchover(vbasedev->name);
if (ret) {
error_report(
"%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
diff --git a/migration/savevm.c b/migration/savevm.c
index ed62defadc..8ff6c0b17e 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -3174,7 +3174,7 @@ int qemu_load_device_state(QEMUFile *f, Error **errp)
return 0;
}
-int qemu_loadvm_approve_switchover(void)
+int qemu_loadvm_approve_switchover(const char *approver)
{
MigrationIncomingState *mis = migration_incoming_get_current();
@@ -3183,7 +3183,7 @@ int qemu_loadvm_approve_switchover(void)
}
mis->switchover_ack_pending_num--;
- trace_loadvm_approve_switchover(mis->switchover_ack_pending_num);
+ trace_loadvm_approve_switchover(approver, mis->switchover_ack_pending_num);
if (mis->switchover_ack_pending_num) {
return 0;
diff --git a/migration/trace-events b/migration/trace-events
index 1c9212d3e2..c0c433744c 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -24,7 +24,7 @@ loadvm_postcopy_ram_handle_discard_end(void) ""
loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
loadvm_process_command_ping(uint32_t val) "0x%x"
-loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
+loadvm_approve_switchover(const char *approver, unsigned int switchover_ack_pending_num) "Approver %s, switchover_ack_pending_num %u"
postcopy_ram_listen_thread_exit(void) ""
postcopy_ram_listen_thread_start(void) ""
qemu_savevm_send_postcopy_advise(void) ""
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH v3 05/14] migration: Replace switchover_ack_needed SaveVMHandler
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (3 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 04/14] migration: Log the approver in qemu_loadvm_approve_switchover() Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 06/14] migration: Rename switchover-ack code to legacy Avihai Horon
` (8 subsequent siblings)
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
A new switchover-ack mechanism that will replace the existing one will
be added in the following patches. The new mechanism will not use the
switchover_ack_needed SaveVMHandler; however, the old mechanism must
still be kept for backward compatibility.
To keep things clear and reduce the API surface of the old code, replace
the switchover_ack_needed SaveVMHandler with a regular function,
migration_request_switchover_ack().
No functional changes intended.
Acked-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
docs/devel/migration/vfio.rst | 3 ---
include/migration/misc.h | 2 ++
include/migration/register.h | 13 -------------
hw/vfio/migration.c | 18 ++++++++++--------
migration/migration.c | 15 +++++++++++++++
migration/savevm.c | 21 ---------------------
migration/trace-events | 2 +-
7 files changed, 28 insertions(+), 46 deletions(-)
diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst
index 691061d182..854277b11c 100644
--- a/docs/devel/migration/vfio.rst
+++ b/docs/devel/migration/vfio.rst
@@ -59,9 +59,6 @@ VFIO implements the device hooks for the iterative approach as follows:
* A ``save_live_iterate`` function that reads the VFIO device's data from the
vendor driver during iterative pre-copy phase.
-* A ``switchover_ack_needed`` function that checks if the VFIO device uses
- "switchover-ack" migration capability when this capability is enabled.
-
* A ``switchover_start`` function that in the multifd mode starts a thread that
reassembles the multifd received data and loads it in-order into the device.
In the non-multifd mode this function is a NOP.
diff --git a/include/migration/misc.h b/include/migration/misc.h
index 3159a5e53c..a2219c981b 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -156,4 +156,6 @@ bool multifd_device_state_save_thread_should_exit(void);
void multifd_abort_device_state_save_threads(void);
bool multifd_join_device_state_save_threads(void);
+void migration_request_switchover_ack(const char *requester);
+
#endif
diff --git a/include/migration/register.h b/include/migration/register.h
index 6f632123f1..a61c4236d2 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -302,19 +302,6 @@ typedef struct SaveVMHandlers {
*/
int (*resume_prepare)(MigrationState *s, void *opaque);
- /**
- * @switchover_ack_needed
- *
- * Checks if switchover ack should be used. Called only on
- * destination.
- *
- * @opaque: data pointer passed to register_savevm_live()
- *
- * Returns true if switchover ack should be used and false
- * otherwise
- */
- bool (*switchover_ack_needed)(void *opaque);
-
/**
* @switchover_start
*
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 180b316bae..7055cfbd3e 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -487,6 +487,14 @@ static bool vfio_precopy_supported(VFIODevice *vbasedev)
return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}
+static void vfio_request_switchover_ack(VFIODevice *vbasedev)
+{
+ if (vfio_precopy_supported(vbasedev)) {
+ /* Precopy support implies switchover-ack is needed */
+ migration_request_switchover_ack(vbasedev->name);
+ }
+}
+
/* ---------------------------------------------------------------------- */
static int vfio_save_prepare(void *opaque, Error **errp)
@@ -776,6 +784,8 @@ static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
return ret;
}
+ vfio_request_switchover_ack(vbasedev);
+
return 0;
}
@@ -874,13 +884,6 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
return ret;
}
-static bool vfio_switchover_ack_needed(void *opaque)
-{
- VFIODevice *vbasedev = opaque;
-
- return vfio_precopy_supported(vbasedev);
-}
-
static int vfio_switchover_start(void *opaque)
{
VFIODevice *vbasedev = opaque;
@@ -904,7 +907,6 @@ static const SaveVMHandlers savevm_vfio_handlers = {
.load_setup = vfio_load_setup,
.load_cleanup = vfio_load_cleanup,
.load_state = vfio_load_state,
- .switchover_ack_needed = vfio_switchover_ack_needed,
/*
* Multifd support
*/
diff --git a/migration/migration.c b/migration/migration.c
index 929a6c432c..d6383b23b9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2196,6 +2196,21 @@ void migration_rp_kick(MigrationState *s)
qemu_sem_post(&s->rp_state.rp_sem);
}
+/* This is called only on destination side */
+void migration_request_switchover_ack(const char *requester)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+
+ if (!migrate_switchover_ack()) {
+ return;
+ }
+
+ mis->switchover_ack_pending_num++;
+
+ trace_migration_request_switchover_ack(requester,
+ mis->switchover_ack_pending_num);
+}
+
static struct rp_cmd_args {
ssize_t len; /* -1 = variable */
const char *name;
diff --git a/migration/savevm.c b/migration/savevm.c
index 8ff6c0b17e..25afcfd71e 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2800,23 +2800,6 @@ static int qemu_loadvm_state_header(QEMUFile *f, Error **errp)
return 0;
}
-static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
-{
- SaveStateEntry *se;
-
- QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->switchover_ack_needed) {
- continue;
- }
-
- if (se->ops->switchover_ack_needed(se->opaque)) {
- mis->switchover_ack_pending_num++;
- }
- }
-
- trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
-}
-
static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp)
{
ERRP_GUARD();
@@ -3078,10 +3061,6 @@ int qemu_loadvm_state(QEMUFile *f, Error **errp)
return -EINVAL;
}
- if (migrate_switchover_ack()) {
- qemu_loadvm_state_switchover_ack_needed(mis);
- }
-
cpu_synchronize_all_pre_loadvm();
ret = qemu_loadvm_state_main(f, mis, errp);
diff --git a/migration/trace-events b/migration/trace-events
index c0c433744c..5955befcc6 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -8,7 +8,6 @@ qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
qemu_savevm_send_packaged(void) ""
qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, final=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
-loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
loadvm_handle_cmd_packaged(unsigned int length) "%u"
@@ -199,6 +198,7 @@ process_incoming_migration_co_postcopy_end_main(void) ""
postcopy_preempt_enabled(bool value) "%d"
migration_precopy_complete(void) ""
migration_call_notifiers(int type) "type=%d"
+migration_request_switchover_ack(const char *requester, unsigned int switchover_ack_pending_num) "Requester %s, switchover_ack_pending_num %u"
# migration-stats
migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH v3 06/14] migration: Rename switchover-ack code to legacy
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (4 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 05/14] migration: Replace switchover_ack_needed SaveVMHandler Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 07/14] migration: Make switchover-ack re-usable Avihai Horon
` (7 subsequent siblings)
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
A new switchover-ack mechanism will be added in the following patches.
However, the old mechanism must still be kept for backward
compatibility.
Rename the existing code that will be used only by the old
switchover-ack mechanism to "legacy". This helps distinguish legacy code
from new code, makes it more readable, and makes it easier to remove
later when it is no longer needed.
No functional change intended.
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
include/migration/misc.h | 2 +-
migration/migration.h | 2 +-
hw/vfio/migration.c | 6 ++---
migration/migration.c | 8 +++----
migration/savevm.c | 49 +++++++++++++++++++++++++++-------------
migration/trace-events | 4 ++--
6 files changed, 44 insertions(+), 27 deletions(-)
diff --git a/include/migration/misc.h b/include/migration/misc.h
index a2219c981b..4b43413aee 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -156,6 +156,6 @@ bool multifd_device_state_save_thread_should_exit(void);
void multifd_abort_device_state_save_threads(void);
bool multifd_join_device_state_save_threads(void);
-void migration_request_switchover_ack(const char *requester);
+void migration_request_switchover_ack_legacy(const char *requester);
#endif
diff --git a/migration/migration.h b/migration/migration.h
index 841f49b215..da45444f7b 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -246,7 +246,7 @@ struct MigrationIncomingState {
* zero an ACK that it's OK to do switchover is sent to the source. No lock
* is needed as this field is updated serially.
*/
- unsigned int switchover_ack_pending_num;
+ unsigned int switchover_ack_pending_num_legacy;
/* Do exit on incoming migration failure */
bool exit_on_error;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 7055cfbd3e..6b7acb2fa1 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -487,11 +487,11 @@ static bool vfio_precopy_supported(VFIODevice *vbasedev)
return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}
-static void vfio_request_switchover_ack(VFIODevice *vbasedev)
+static void vfio_request_switchover_ack_legacy(VFIODevice *vbasedev)
{
if (vfio_precopy_supported(vbasedev)) {
/* Precopy support implies switchover-ack is needed */
- migration_request_switchover_ack(vbasedev->name);
+ migration_request_switchover_ack_legacy(vbasedev->name);
}
}
@@ -784,7 +784,7 @@ static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
return ret;
}
- vfio_request_switchover_ack(vbasedev);
+ vfio_request_switchover_ack_legacy(vbasedev);
return 0;
}
diff --git a/migration/migration.c b/migration/migration.c
index d6383b23b9..8d189fec80 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2197,7 +2197,7 @@ void migration_rp_kick(MigrationState *s)
}
/* This is called only on destination side */
-void migration_request_switchover_ack(const char *requester)
+void migration_request_switchover_ack_legacy(const char *requester)
{
MigrationIncomingState *mis = migration_incoming_get_current();
@@ -2205,10 +2205,10 @@ void migration_request_switchover_ack(const char *requester)
return;
}
- mis->switchover_ack_pending_num++;
+ mis->switchover_ack_pending_num_legacy++;
- trace_migration_request_switchover_ack(requester,
- mis->switchover_ack_pending_num);
+ trace_migration_request_switchover_ack_legacy(
+ requester, mis->switchover_ack_pending_num_legacy);
}
static struct rp_cmd_args {
diff --git a/migration/savevm.c b/migration/savevm.c
index 25afcfd71e..8e6c5b7c87 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2478,6 +2478,31 @@ static int loadvm_postcopy_handle_switchover_start(Error **errp)
return 0;
}
+/*
+ * If legacy switchover-ack is enabled but no device uses it, need to send an
+ * ACK to source that it's OK to switchover.
+ */
+static int loadvm_switchover_ack_no_users_legacy(MigrationIncomingState *mis,
+ Error **errp)
+{
+ int ret;
+
+ if (!migrate_switchover_ack()) {
+ return 0;
+ }
+
+ if (!mis->switchover_ack_pending_num_legacy) {
+ ret = migrate_send_rp_switchover_ack(mis);
+ if (ret) {
+ error_setg_errno(errp, -ret,
+ "Could not send switchover ack RP MSG");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/*
* Process an incoming 'QEMU_VM_COMMAND'
* 0 just a normal return
@@ -2527,18 +2552,9 @@ static int loadvm_process_command(QEMUFile *f, Error **errp)
}
mis->to_src_file = qemu_file_get_return_path(f);
- /*
- * Switchover ack is enabled but no device uses it, so send an ACK to
- * source that it's OK to switchover. Do it here, after return path has
- * been created.
- */
- if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {
- ret = migrate_send_rp_switchover_ack(mis);
- if (ret) {
- error_setg_errno(errp, -ret,
- "Could not send switchover ack RP MSG");
- return ret;
- }
+ ret = loadvm_switchover_ack_no_users_legacy(mis, errp);
+ if (ret) {
+ return ret;
}
return 0;
@@ -3157,14 +3173,15 @@ int qemu_loadvm_approve_switchover(const char *approver)
{
MigrationIncomingState *mis = migration_incoming_get_current();
- if (!mis->switchover_ack_pending_num) {
+ if (!mis->switchover_ack_pending_num_legacy) {
return -EINVAL;
}
- mis->switchover_ack_pending_num--;
- trace_loadvm_approve_switchover(approver, mis->switchover_ack_pending_num);
+ mis->switchover_ack_pending_num_legacy--;
+ trace_loadvm_approve_switchover_legacy(
+ approver, mis->switchover_ack_pending_num_legacy);
- if (mis->switchover_ack_pending_num) {
+ if (mis->switchover_ack_pending_num_legacy) {
return 0;
}
diff --git a/migration/trace-events b/migration/trace-events
index 5955befcc6..a6b8c31ee1 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -23,7 +23,7 @@ loadvm_postcopy_ram_handle_discard_end(void) ""
loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
loadvm_process_command_ping(uint32_t val) "0x%x"
-loadvm_approve_switchover(const char *approver, unsigned int switchover_ack_pending_num) "Approver %s, switchover_ack_pending_num %u"
+loadvm_approve_switchover_legacy(const char *approver, unsigned int switchover_ack_pending_num_legacy) "Approver %s, switchover_ack_pending_num_legacy %u"
postcopy_ram_listen_thread_exit(void) ""
postcopy_ram_listen_thread_start(void) ""
qemu_savevm_send_postcopy_advise(void) ""
@@ -198,7 +198,7 @@ process_incoming_migration_co_postcopy_end_main(void) ""
postcopy_preempt_enabled(bool value) "%d"
migration_precopy_complete(void) ""
migration_call_notifiers(int type) "type=%d"
-migration_request_switchover_ack(const char *requester, unsigned int switchover_ack_pending_num) "Requester %s, switchover_ack_pending_num %u"
+migration_request_switchover_ack_legacy(const char *requester, unsigned int switchover_ack_pending_num_legacy) "Requester %s, switchover_ack_pending_num_legacy %u"
# migration-stats
migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH v3 07/14] migration: Make switchover-ack re-usable
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (5 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 06/14] migration: Rename switchover-ack code to legacy Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-12 16:16 ` Peter Xu
2026-06-09 7:58 ` [PATCH v3 08/14] migration: Fail migration if switchover-ack is requested after switchover decision Avihai Horon
` (6 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Switchover-ack is a mechanism to synchronize between source and
destination QEMU during migration to prevent the source from switching
over prematurely.
VFIO uses switchover-ack to ensure switchover happens only after the
destination side has loaded the precopy initial bytes. This is important
for VFIO, as otherwise downtime could increase.
In its current state, switchover-ack is a one-time mechanism, meaning
that switchover is acked only once and, after that, another ACK cannot be
requested. This was sufficient until now, as VFIO precopy initial
bytes were defined to be monotonically decreasing. Thus, when precopy
initial bytes reached zero for all VFIO devices, a single ACK would be
sent and its validity would hold.
However, now the new VFIO_PRECOPY_INFO_REINIT feature allows precopy
initial bytes to be re-initialized during precopy. Specifically, it
means that initial bytes can grow after reaching zero, which would
invalidate a previously sent switchover ACK.
To solve this, make switchover-ack reusable and allow devices to request
switchover ACKs when needed via the save_query_pending SaveVMHandler.
Since a switchover ACK can now be requested for a specific device and at
different times, make switchover ACK per-device (instead of a single ACK
for all devices) and let the source side do the pending ACKs accounting.
Keep the legacy switchover-ack mechanism for backward compatibility and
turn it on by a compatibility property for older machines. Enable the
property until VFIO implements the new switchover-ack.
Acked-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
qapi/migration.json | 14 ++++----
include/migration/client-options.h | 1 +
include/migration/register.h | 2 ++
migration/migration.h | 32 ++++++++++++++++--
migration/savevm.h | 6 ++--
hw/core/machine.c | 1 +
migration/migration.c | 37 ++++++++++++++-------
migration/options.c | 10 ++++++
migration/savevm.c | 53 +++++++++++++++++++++++-------
migration/trace-events | 5 +--
10 files changed, 124 insertions(+), 37 deletions(-)
diff --git a/qapi/migration.json b/qapi/migration.json
index 27a7970556..9b3070e494 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -508,14 +508,12 @@
# (since 7.1)
#
# @switchover-ack: If enabled, migration will not stop the source VM
-# and complete the migration until an ACK is received from the
-# destination that it's OK to do so. Exactly when this ACK is
-# sent depends on the migrated devices that use this feature. For
-# example, a device can use it to make sure some of its data is
-# sent and loaded in the destination before doing switchover.
-# This can reduce downtime if devices that support this capability
-# are present. 'return-path' capability must be enabled to use
-# it. (since 8.1)
+# and complete the migration until the destination has
+# acknowledged that it is OK to switchover. The acknowledgement
+# may depend, for example, on some device's data being loaded in
+# the destination before doing switchover. This can reduce
+# downtime if devices that support this capability are present.
+# Capability @return-path must be enabled to use it. (since 8.1)
#
# @dirty-limit: If enabled, migration will throttle vCPUs as needed to
# keep their dirty page rate within @vcpu-dirty-limit. This can
diff --git a/include/migration/client-options.h b/include/migration/client-options.h
index 289c9d7762..78b1daa1a6 100644
--- a/include/migration/client-options.h
+++ b/include/migration/client-options.h
@@ -13,6 +13,7 @@
/* properties */
bool migrate_send_switchover_start(void);
+bool migrate_switchover_ack_legacy(void);
/* capabilities */
diff --git a/include/migration/register.h b/include/migration/register.h
index a61c4236d2..5825eb30cb 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -23,6 +23,8 @@ typedef struct MigPendingData {
uint64_t postcopy_bytes;
/* Amount of pending bytes can be transferred only in stopcopy */
uint64_t stopcopy_bytes;
+ /* Number of new pending switchover ACKs */
+ uint32_t switchover_ack_pending;
/*
* Total pending data, modules do not need to update this field, it
* will be automatically calculated by migration core API.
diff --git a/migration/migration.h b/migration/migration.h
index da45444f7b..086eb9a15d 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -494,6 +494,29 @@ struct MigrationState {
*/
uint8_t clear_bitmap_shift;
+ /*
+ * This decides whether to use legacy switchover-ack or new switchover-ack.
+ * The main difference between them is that the former allows acknowledging
+ * switchover only once while the latter multiple times.
+ *
+ * In legacy, the destination keeps track of a pending ACKs counter. As
+ * migration progresses, the devices on the destination acknowledge
+ * switchover, decreasing the counter. When the counter reaches zero, a
+ * single ACK message is sent to the source via the return path, indicating
+ * that it's OK to switchover.
+ *
+ * In new switchover-ack, the source is the one that keeps track of a
+ * pending ACKs counter. As migration progresses, the destination sends ACK
+ * message per-device via the return path, which decrements the source
+ * counter. When the counter reaches zero, it's OK to switchover. During
+ * precopy, source-side devices may request additional ACKs, which increment
+ * the counter again.
+ *
+ * In both legacy and new schemes, we rely on per-device protocol to request
+ * switchover ACK from the destination-side counterpart.
+ */
+ bool switchover_ack_legacy;
+
/*
* This save hostname when out-going migration starts
*/
@@ -503,10 +526,13 @@ struct MigrationState {
JSONWriter *vmdesc;
/*
- * Indicates whether an ACK from the destination that it's OK to do
- * switchover has been received.
+ * Indicates the number of pending ACKs from the destination. The value may
+ * increase or decrease during precopy as new ACKs are requested or
+ * received. When zero is reached, it's OK to switchover. In legacy
+ * switchover-ack, it's initialized to 1 and decreased to zero upon ACK.
*/
- bool switchover_acked;
+ uint32_t switchover_ack_pending_num;
+
/* Is this a rdma migration */
bool rdma_migration;
diff --git a/migration/savevm.h b/migration/savevm.h
index 44424be347..fb92d3bc85 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -45,8 +45,10 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
-void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact);
-void qemu_savevm_query_pending_final(MigPendingData *pending);
+void qemu_savevm_query_pending_iter(MigrationState *s, MigPendingData *pending,
+ bool exact);
+void qemu_savevm_query_pending_final(MigrationState *s,
+ MigPendingData *pending);
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
void qemu_savevm_state_end(QEMUFile *f);
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 4d8b15d99e..8219f13779 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -43,6 +43,7 @@ GlobalProperty hw_compat_11_0[] = {
{ "chardev-vc", "encoding", "cp437" },
{ "tpm-crb", "cap-chunk", "off" },
{ "tpm-crb", "x-allow-chunk-migration", "off" },
+ { "migration", "switchover-ack-legacy", "on" },
};
const size_t hw_compat_11_0_len = G_N_ELEMENTS(hw_compat_11_0);
diff --git a/migration/migration.c b/migration/migration.c
index 8d189fec80..60493e2c10 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1707,7 +1707,9 @@ int migrate_init(MigrationState *s, Error **errp)
s->vm_old_state = -1;
s->iteration_initial_bytes = 0;
s->threshold_size = 0;
- s->switchover_acked = false;
+ /* Legacy switchover-ack sends a single ACK for all devices */
+ qatomic_set(&s->switchover_ack_pending_num,
+ migrate_switchover_ack_legacy() ? 1 : 0);
s->rdma_migration = false;
/*
@@ -2201,7 +2203,7 @@ void migration_request_switchover_ack_legacy(const char *requester)
{
MigrationIncomingState *mis = migration_incoming_get_current();
- if (!migrate_switchover_ack()) {
+ if (!migrate_switchover_ack() || !migrate_switchover_ack_legacy()) {
return;
}
@@ -2457,9 +2459,18 @@ static void *source_return_path_thread(void *opaque)
break;
case MIG_RP_MSG_SWITCHOVER_ACK:
- ms->switchover_acked = true;
- trace_source_return_path_thread_switchover_acked();
+ {
+ uint32_t pending_num;
+
+ pending_num = qatomic_dec_fetch(&ms->switchover_ack_pending_num);
+ trace_source_return_path_thread_switchover_acked(pending_num);
+ if (pending_num == UINT32_MAX) {
+ error_setg(&err, "Switchover ack pending num underflowed");
+ goto out;
+ }
+
break;
+ }
default:
break;
@@ -2816,7 +2827,7 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
* properly update all the dirty bitmaps to finally generate the
* correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
*/
- qemu_savevm_query_pending_final(&pending);
+ qemu_savevm_query_pending_final(s, &pending);
/* Inactivate disks except in COLO */
if (!migrate_colo()) {
@@ -3266,7 +3277,7 @@ static bool migration_can_switchover(MigrationState *s)
return true;
}
- return s->switchover_acked;
+ return qatomic_read(&s->switchover_ack_pending_num) == 0;
}
/* Migration thread iteration status */
@@ -3305,12 +3316,13 @@ static bool migration_iteration_next_ready(MigrationState *s,
return false;
}
-static void migration_iteration_go_next(MigPendingData *pending)
+static void migration_iteration_go_next(MigrationState *s,
+ MigPendingData *pending)
{
/*
* Do a slow sync first before boosting the iteration count.
*/
- qemu_savevm_query_pending_iter(pending, true);
+ qemu_savevm_query_pending_iter(s, pending, true);
/*
* Update the dirty information for the whole system for this
@@ -3356,12 +3368,12 @@ static MigIterateState migration_iteration_run(MigrationState *s)
Error *local_err = NULL;
bool in_postcopy = (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE ||
s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
- bool can_switchover = migration_can_switchover(s);
+ bool can_switchover;
MigPendingData pending = { };
bool complete_ready;
/* Fast path - get the estimated amount of pending data */
- qemu_savevm_query_pending_iter(&pending, false);
+ qemu_savevm_query_pending_iter(s, &pending, false);
if (in_postcopy) {
/*
@@ -3402,9 +3414,12 @@ static MigIterateState migration_iteration_run(MigrationState *s)
* during postcopy phase.
*/
if (migration_iteration_next_ready(s, &pending)) {
- migration_iteration_go_next(&pending);
+ migration_iteration_go_next(s, &pending);
}
+ /* Check can switchover after qemu_savevm_query_pending() */
+ can_switchover = migration_can_switchover(s);
+
/* Should we switch to postcopy now? */
if (can_switchover && postcopy_should_start(s, &pending)) {
if (postcopy_start(s, &local_err)) {
diff --git a/migration/options.c b/migration/options.c
index 5cbfd29099..4c9b25372e 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -110,6 +110,9 @@ const Property migration_properties[] = {
preempt_pre_7_2, false),
DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState,
multifd_clean_tls_termination, true),
+ /* Use legacy until VFIO implements new switchover-ack */
+ DEFINE_PROP_BOOL("switchover-ack-legacy", MigrationState,
+ switchover_ack_legacy, true),
/* Migration parameters */
DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
@@ -467,6 +470,13 @@ bool migrate_rdma(void)
return s->rdma_migration;
}
+bool migrate_switchover_ack_legacy(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ return s->switchover_ack_legacy;
+}
+
typedef enum WriteTrackingSupport {
WT_SUPPORT_UNKNOWN = 0,
WT_SUPPORT_ABSENT,
diff --git a/migration/savevm.c b/migration/savevm.c
index 8e6c5b7c87..b8c93d86d7 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1801,7 +1801,8 @@ int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp)
return 0;
}
-static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
+static void qemu_savevm_query_pending(MigrationState *s,
+ MigPendingData *pending, bool exact,
bool final)
{
SaveStateEntry *se;
@@ -1827,22 +1828,35 @@ static void qemu_savevm_query_pending(MigPendingData *pending, bool exact,
* close to reality when this got invoked frequently while iterating.
*/
mig_stats.dirty_bytes_total = pending->total_bytes;
- trace_qemu_savevm_query_pending(exact, final, pending->precopy_bytes,
- pending->stopcopy_bytes,
- pending->postcopy_bytes,
- pending->total_bytes);
+
+ if (migrate_switchover_ack() && !migrate_switchover_ack_legacy() &&
+ pending->switchover_ack_pending) {
+ /*
+ * NOTE: Currently we rely on per-device protocol to request switchover
+ * ACK from the device on the destination side.
+ */
+ qatomic_add(&s->switchover_ack_pending_num,
+ pending->switchover_ack_pending);
+ }
+
+ trace_qemu_savevm_query_pending(
+ exact, final, pending->precopy_bytes, pending->stopcopy_bytes,
+ pending->postcopy_bytes, pending->total_bytes,
+ pending->switchover_ack_pending,
+ qatomic_read(&s->switchover_ack_pending_num));
}
-void qemu_savevm_query_pending_iter(MigPendingData *pending, bool exact)
+void qemu_savevm_query_pending_iter(MigrationState *s, MigPendingData *pending,
+ bool exact)
{
- qemu_savevm_query_pending(pending, exact, false);
+ qemu_savevm_query_pending(s, pending, exact, false);
}
-void qemu_savevm_query_pending_final(MigPendingData *pending)
+void qemu_savevm_query_pending_final(MigrationState *s, MigPendingData *pending)
{
g_assert(bql_locked());
- qemu_savevm_query_pending(pending, true, true);
+ qemu_savevm_query_pending(s, pending, true, true);
}
void qemu_savevm_state_cleanup(void)
@@ -2487,7 +2501,7 @@ static int loadvm_switchover_ack_no_users_legacy(MigrationIncomingState *mis,
{
int ret;
- if (!migrate_switchover_ack()) {
+ if (!migrate_switchover_ack() || !migrate_switchover_ack_legacy()) {
return 0;
}
@@ -3169,7 +3183,7 @@ int qemu_load_device_state(QEMUFile *f, Error **errp)
return 0;
}
-int qemu_loadvm_approve_switchover(const char *approver)
+static int qemu_loadvm_approve_switchover_legacy(const char *approver)
{
MigrationIncomingState *mis = migration_incoming_get_current();
@@ -3188,6 +3202,23 @@ int qemu_loadvm_approve_switchover(const char *approver)
return migrate_send_rp_switchover_ack(mis);
}
+int qemu_loadvm_approve_switchover(const char *approver)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+
+ if (!migrate_switchover_ack()) {
+ return 0;
+ }
+
+ if (migrate_switchover_ack_legacy()) {
+ return qemu_loadvm_approve_switchover_legacy(approver);
+ }
+
+ trace_loadvm_approve_switchover(approver);
+
+ return migrate_send_rp_switchover_ack(mis);
+}
+
bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
char *buf, size_t len, Error **errp)
{
diff --git a/migration/trace-events b/migration/trace-events
index a6b8c31ee1..f5339f4193 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -7,7 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
qemu_savevm_send_packaged(void) ""
-qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total) "exact=%d, final=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64
+qemu_savevm_query_pending(bool exact, bool final, uint64_t precopy, uint64_t stopcopy, uint64_t postcopy, uint64_t total, uint32_t switchover_ack_pending, uint32_t total_switchover_ack_pending) "exact=%d, final=%d, precopy=%"PRIu64", stopcopy=%"PRIu64", postcopy=%"PRIu64", total=%"PRIu64", collected switchover ack pending=%"PRIu32", total switchover ack pending=%"PRIu32
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
loadvm_handle_cmd_packaged(unsigned int length) "%u"
@@ -24,6 +24,7 @@ loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s:
loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
loadvm_process_command_ping(uint32_t val) "0x%x"
loadvm_approve_switchover_legacy(const char *approver, unsigned int switchover_ack_pending_num_legacy) "Approver %s, switchover_ack_pending_num_legacy %u"
+loadvm_approve_switchover(const char *approver) "Approver %s"
postcopy_ram_listen_thread_exit(void) ""
postcopy_ram_listen_thread_start(void) ""
qemu_savevm_send_postcopy_advise(void) ""
@@ -189,7 +190,7 @@ source_return_path_thread_loop_top(void) ""
source_return_path_thread_pong(uint32_t val) "0x%x"
source_return_path_thread_shut(uint32_t val) "0x%x"
source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
-source_return_path_thread_switchover_acked(void) ""
+source_return_path_thread_switchover_acked(uint32_t pending_num) "switchover_ack_pending_num %" PRIu32
source_return_path_thread_postcopy_package_loaded(void) ""
migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [PATCH v3 07/14] migration: Make switchover-ack re-usable
2026-06-09 7:58 ` [PATCH v3 07/14] migration: Make switchover-ack re-usable Avihai Horon
@ 2026-06-12 16:16 ` Peter Xu
2026-06-14 9:50 ` Avihai Horon
0 siblings, 1 reply; 27+ messages in thread
From: Peter Xu @ 2026-06-12 16:16 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On Tue, Jun 09, 2026 at 10:58:05AM +0300, Avihai Horon wrote:
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 4d8b15d99e..8219f13779 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -43,6 +43,7 @@ GlobalProperty hw_compat_11_0[] = {
> { "chardev-vc", "encoding", "cp437" },
> { "tpm-crb", "cap-chunk", "off" },
> { "tpm-crb", "x-allow-chunk-migration", "off" },
> + { "migration", "switchover-ack-legacy", "on" },
> };
This line needs to be moved to the last patch, or people will get very
confused on last patch flipping default without caring about old behavior..
After move, feel free to take:
Reviewed-by: Peter Xu <peterx@redhat.com>
PS: If Cédric feels comfortable updating that while queuing it'll be fine
too.
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [PATCH v3 07/14] migration: Make switchover-ack re-usable
2026-06-12 16:16 ` Peter Xu
@ 2026-06-14 9:50 ` Avihai Horon
2026-06-15 13:34 ` Peter Xu
0 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-14 9:50 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On 6/12/2026 7:16 PM, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Tue, Jun 09, 2026 at 10:58:05AM +0300, Avihai Horon wrote:
>> diff --git a/hw/core/machine.c b/hw/core/machine.c
>> index 4d8b15d99e..8219f13779 100644
>> --- a/hw/core/machine.c
>> +++ b/hw/core/machine.c
>> @@ -43,6 +43,7 @@ GlobalProperty hw_compat_11_0[] = {
>> { "chardev-vc", "encoding", "cp437" },
>> { "tpm-crb", "cap-chunk", "off" },
>> { "tpm-crb", "x-allow-chunk-migration", "off" },
>> + { "migration", "switchover-ack-legacy", "on" },
>> };
> This line needs to be moved to the last patch, or people will get very
> confused on last patch flipping default without caring about old behavior..
This patch implements the compatibility flows for legacy switchover-ack
which use this property, so moving this line to the last patch sounds even
more confusing IMHO.
The last patch should not be viewed standalone, it's tightly coupled
with this one. Plus, it doesn't make sense to have the
switchover-ack-legacy property if it's always off.
So I personally don't find it confusing, the last patch simply enables
the feature once everything is in place.
If that helps, I can extend the last patch's commit message to something
like:
Now that VFIO has implemented new switchover-ack, enable it for new
machines. Note that legacy switchover-ack is still used for
compatibility with older machines via a compatibility entry for
switchover-ack-legacy property.
>
> After move, feel free to take:
>
> Reviewed-by: Peter Xu <peterx@redhat.com>
>
> PS: If Cédric feels comfortable updating that while queuing it'll be fine
> too.
Thanks.
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [PATCH v3 07/14] migration: Make switchover-ack re-usable
2026-06-14 9:50 ` Avihai Horon
@ 2026-06-15 13:34 ` Peter Xu
2026-06-15 14:35 ` Avihai Horon
0 siblings, 1 reply; 27+ messages in thread
From: Peter Xu @ 2026-06-15 13:34 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On Sun, Jun 14, 2026 at 12:50:14PM +0300, Avihai Horon wrote:
>
> On 6/12/2026 7:16 PM, Peter Xu wrote:
> > External email: Use caution opening links or attachments
> >
> >
> > On Tue, Jun 09, 2026 at 10:58:05AM +0300, Avihai Horon wrote:
> > > diff --git a/hw/core/machine.c b/hw/core/machine.c
> > > index 4d8b15d99e..8219f13779 100644
> > > --- a/hw/core/machine.c
> > > +++ b/hw/core/machine.c
> > > @@ -43,6 +43,7 @@ GlobalProperty hw_compat_11_0[] = {
> > > { "chardev-vc", "encoding", "cp437" },
> > > { "tpm-crb", "cap-chunk", "off" },
> > > { "tpm-crb", "x-allow-chunk-migration", "off" },
> > > + { "migration", "switchover-ack-legacy", "on" },
> > > };
> > This line needs to be moved to the last patch, or people will get very
> > confused on last patch flipping default without caring about old behavior..
>
> This patch Implements the compatibility flows for legacy switchover-ack
> which use this property, so moving this line to last patch sounds even more
> confusing IMHO.
My comment only applies to this one line to change hw_compat_11_0[], not
the property itself. It won't affect this patch. It's common pattern to
do this in one patch to "update default value, but let's keep the old
machines unaffected".
Btw, since this value still has a default value ON in this patch, moving
this line to the last patch won't change anything except making last patch
clearer.
Thanks,
>
> The last patch should not be viewed standalone, it's tightly coupled with
> this one. Plus, it doesn't make sense to have the switchover-ack-legacy
> property if it's always off.
> So I personally don't find it confusing, the last patch simply enables the
> feature once everything is in place.
>
> If that helps, I can extend the last patch's commit message to something
> like:
>
> Now that VFIO has implemented new switchover-ack, enable it for new
> machines. Note that legacy switchover-ack is still used for
> compatibility with older machines via a compatibility entry for
> switchover-ack-legacy property.
>
> >
> > After move, feel free to take:
> >
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> >
> > PS: If Cédric feels comfortable updating that while queuing it'll be fine
> > too.
>
> Thanks.
>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [PATCH v3 07/14] migration: Make switchover-ack re-usable
2026-06-15 13:34 ` Peter Xu
@ 2026-06-15 14:35 ` Avihai Horon
0 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-15 14:35 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On 6/15/2026 4:34 PM, Peter Xu wrote:
> External email: Use caution opening links or attachments
>
>
> On Sun, Jun 14, 2026 at 12:50:14PM +0300, Avihai Horon wrote:
>> On 6/12/2026 7:16 PM, Peter Xu wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On Tue, Jun 09, 2026 at 10:58:05AM +0300, Avihai Horon wrote:
>>>> diff --git a/hw/core/machine.c b/hw/core/machine.c
>>>> index 4d8b15d99e..8219f13779 100644
>>>> --- a/hw/core/machine.c
>>>> +++ b/hw/core/machine.c
>>>> @@ -43,6 +43,7 @@ GlobalProperty hw_compat_11_0[] = {
>>>> { "chardev-vc", "encoding", "cp437" },
>>>> { "tpm-crb", "cap-chunk", "off" },
>>>> { "tpm-crb", "x-allow-chunk-migration", "off" },
>>>> + { "migration", "switchover-ack-legacy", "on" },
>>>> };
>>> This line needs to be moved to the last patch, or people will get very
>>> confused on last patch flipping default without caring about old behavior..
>> This patch Implements the compatibility flows for legacy switchover-ack
>> which use this property, so moving this line to last patch sounds even more
>> confusing IMHO.
> My comment only applies to this one line to change hw_compat_11_0[], not
> the property itself. It won't affect this patch. It's common pattern to
> do this in one patch to "update default value, but let's keep the old
> machines unaffected".
>
> Btw, since this value still has a default value ON in this patch, moving
> this line to the last patch won't change anything except making last patch
> clearer.
Alright, I'll move it to last patch.
Cedric, would you like me to send v4 for this?
Thanks.
>
> Thanks,
>
>> The last patch should not be viewed standalone, it's tightly coupled with
>> this one. Plus, it doesn't make sense to have the switchover-ack-legacy
>> property if it's always off.
>> So I personally don't find it confusing, the last patch simply enables the
>> feature once everything is in place.
>>
>> If that helps, I can extend the last patch's commit message to something
>> like:
>>
>> Now that VFIO has implemented new switchover-ack, enable it for new
>> machines. Note that legacy switchover-ack is still used for
>> compatibility with older machines via a compatibility entry for
>> switchover-ack-legacy property.
>>
>>> After move, feel free to take:
>>>
>>> Reviewed-by: Peter Xu <peterx@redhat.com>
>>>
>>> PS: If Cédric feels comfortable updating that while queuing it'll be fine
>>> too.
>> Thanks.
>>
> --
> Peter Xu
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 08/14] migration: Fail migration if switchover-ack is requested after switchover decision
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (6 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 07/14] migration: Make switchover-ack re-usable Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-12 16:17 ` Peter Xu
2026-06-09 7:58 ` [PATCH v3 09/14] vfio/migration: Extract VFIO_MIG_FLAG_DEV_INIT_DATA_SENT sending to helper Avihai Horon
` (5 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Switchover ACK is checked only during precopy while the guest is still
running. The last migration_can_switchover() decision and guest stop are
not atomic, so a device may want to request another switchover ACK in
the gap after switchover decision has been made but before the guest is
stopped. Migration would then miss that request, which can increase
downtime.
Cover this case by failing the migration if a switchover-ack was
requested during that time.
Ideally, precopy iterations should be resumed in this case, however,
VFIO doesn't support going back to precopy after being stopped, so
implementing such logic would require non-trivial changes to the guest
start/stop flow. Given the above and that this case should be rare,
failing the migration seems reasonable.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
migration/savevm.h | 4 ++--
migration/migration.c | 4 +++-
migration/savevm.c | 19 ++++++++++++++++++-
3 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/migration/savevm.h b/migration/savevm.h
index fb92d3bc85..415198423f 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -47,8 +47,8 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(MigrationState *s, Error **errp);
void qemu_savevm_query_pending_iter(MigrationState *s, MigPendingData *pending,
bool exact);
-void qemu_savevm_query_pending_final(MigrationState *s,
- MigPendingData *pending);
+bool qemu_savevm_query_pending_final(MigrationState *s,
+ MigPendingData *pending, Error **errp);
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy);
bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp);
void qemu_savevm_state_end(QEMUFile *f);
diff --git a/migration/migration.c b/migration/migration.c
index 60493e2c10..4c20378ed2 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2827,7 +2827,9 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
* properly update all the dirty bitmaps to finally generate the
* correct discard bitmaps; see ram_postcopy_send_discard_bitmap().
*/
- qemu_savevm_query_pending_final(s, &pending);
+ if (!qemu_savevm_query_pending_final(s, &pending, errp)) {
+ return false;
+ }
/* Inactivate disks except in COLO */
if (!migrate_colo()) {
diff --git a/migration/savevm.c b/migration/savevm.c
index b8c93d86d7..6737058021 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1852,11 +1852,28 @@ void qemu_savevm_query_pending_iter(MigrationState *s, MigPendingData *pending,
qemu_savevm_query_pending(s, pending, exact, false);
}
-void qemu_savevm_query_pending_final(MigrationState *s, MigPendingData *pending)
+bool qemu_savevm_query_pending_final(MigrationState *s, MigPendingData *pending,
+ Error **errp)
{
g_assert(bql_locked());
qemu_savevm_query_pending(s, pending, true, true);
+
+ /*
+ * Switchover-ack requests done after switchover decision are not allowed.
+ * Fail the migration in this case since we currently don't support going
+ * back to precopy.
+ */
+ if (migrate_switchover_ack() && !migrate_switchover_ack_legacy() &&
+ pending->switchover_ack_pending > 0) {
+ error_setg(errp,
+ "Switchover ACK was requested by %" PRIu32
+ " devices during switchover",
+ pending->switchover_ack_pending);
+ return false;
+ }
+
+ return true;
}
void qemu_savevm_state_cleanup(void)
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [PATCH v3 08/14] migration: Fail migration if switchover-ack is requested after switchover decision
2026-06-09 7:58 ` [PATCH v3 08/14] migration: Fail migration if switchover-ack is requested after switchover decision Avihai Horon
@ 2026-06-12 16:17 ` Peter Xu
0 siblings, 0 replies; 27+ messages in thread
From: Peter Xu @ 2026-06-12 16:17 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On Tue, Jun 09, 2026 at 10:58:06AM +0300, Avihai Horon wrote:
> Switchover ACK is checked only during precopy while the guest is still
> running. The last migration_can_switchover() decision and guest stop are
> not atomic, so a device may want to request another switchover ACK in
> the gap after switchover decision has been made but before the guest is
> stopped. Migration would then miss that request, which can increase
> downtime.
>
> Cover this case by failing the migration if a switchover-ack was
> requested during that time.
>
> Ideally, precopy iterations should be resumed in this case, however,
> VFIO doesn't support going back to precopy after being stopped, so
> implementing such logic would require non-trivial changes to the guest
> start/stop flow. Given the above and that this case should be rare,
> failing the migration seems reasonable.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 09/14] vfio/migration: Extract VFIO_MIG_FLAG_DEV_INIT_DATA_SENT sending to helper
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (7 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 08/14] migration: Fail migration if switchover-ack is requested after switchover decision Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 10/14] vfio/migration: Add Error ** parameter to vfio_migration_init() Avihai Horon
` (4 subsequent siblings)
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Extract the VFIO_MIG_FLAG_DEV_INIT_DATA_SENT flag sending logic from
vfio_save_iterate() into vfio_send_init_data_flag() for clarity. Also
add a trace while at it.
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
hw/vfio/migration.c | 26 +++++++++++++++++++++-----
hw/vfio/trace-events | 1 +
2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 6b7acb2fa1..45f8e346b4 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -480,6 +480,26 @@ static void vfio_update_estimated_pending_data(VFIOMigration *migration,
data_size);
}
+/* Returns true if the init data flag was sent, false otherwise */
+static bool vfio_send_init_data_flag(QEMUFile *f, VFIOMigration *migration)
+{
+ VFIODevice *vbasedev = migration->vbasedev;
+
+ if (!migrate_switchover_ack()) {
+ return false;
+ }
+
+ if (migration->precopy_init_size || migration->initial_data_sent) {
+ return false;
+ }
+
+ qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
+ migration->initial_data_sent = true;
+ trace_vfio_send_init_data_flag(vbasedev->name);
+
+ return true;
+}
+
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
VFIOMigration *migration = vbasedev->migration;
@@ -693,11 +713,7 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
vfio_update_estimated_pending_data(migration, data_size);
- if (migrate_switchover_ack() && !migration->precopy_init_size &&
- !migration->initial_data_sent) {
- qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
- migration->initial_data_sent = true;
- } else {
+ if (!vfio_send_init_data_flag(f, migration)) {
qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 8f57d0b7d8..411a306635 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -176,6 +176,7 @@ vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy
vfio_save_iterate_start(const char *name) " (%s)"
vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size %"PRIu64
vfio_state_pending(const char *name, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size, bool exact, bool final) " (%s) stopcopy size %"PRIu64", precopy initial size %"PRIu64", precopy dirty size %"PRIu64", exact %d, final %d"
+vfio_send_init_data_flag(const char *name) " (%s)"
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* [PATCH v3 10/14] vfio/migration: Add Error ** parameter to vfio_migration_init()
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (8 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 09/14] vfio/migration: Extract VFIO_MIG_FLAG_DEV_INIT_DATA_SENT sending to helper Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 12:09 ` Cédric Le Goater
2026-06-09 7:58 ` [PATCH v3 11/14] vfio/migration: Add new switchover-ack mechanism Avihai Horon
` (3 subsequent siblings)
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
vfio_migration_init() already has many failure points and a new one will
be added in the next patch.
Add Error ** parameter to vfio_migration_init() to report a detailed
error message through it. Refactor it to return bool as well.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
hw/vfio/migration.c | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 45f8e346b4..3ab6b7248f 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -1056,7 +1056,7 @@ static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}
-static int vfio_migration_init(VFIODevice *vbasedev)
+static bool vfio_migration_init(VFIODevice *vbasedev, Error **errp)
{
int ret;
Object *obj;
@@ -1067,22 +1067,32 @@ static int vfio_migration_init(VFIODevice *vbasedev)
VMChangeStateHandler *prepare_cb;
if (!vbasedev->ops->vfio_get_object) {
- return -EINVAL;
+ error_setg(errp, "no vfio_get_object handler");
+ return false;
}
obj = vbasedev->ops->vfio_get_object(vbasedev);
if (!obj) {
- return -EINVAL;
+ error_setg(errp, "failed to get object");
+ return false;
}
ret = vfio_migration_query_flags(vbasedev, &mig_flags);
if (ret) {
- return ret;
+ if (ret == -ENOTTY) {
+ error_setg_errno(errp, -ret,
+ "migration is not supported in kernel");
+ } else {
+ error_setg_errno(errp, -ret, "failed to query migration flags");
+ }
+
+ return false;
}
/* Basic migration functionality must be supported */
if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
- return -EOPNOTSUPP;
+ error_setg(errp, "VFIO_MIGRATION_STOP_COPY is not supported");
+ return false;
}
vbasedev->migration = g_new0(VFIOMigration, 1);
@@ -1113,7 +1123,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
migration_add_notifier(&migration->migration_state,
vfio_migration_state_notifier);
- return 0;
+ return true;
}
static Error *multiple_devices_migration_blocker;
@@ -1279,18 +1289,8 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
return !vfio_block_migration(vbasedev, err, errp);
}
- ret = vfio_migration_init(vbasedev);
- if (ret) {
- if (ret == -ENOTTY) {
- error_setg(&err, "%s: VFIO migration is not supported in kernel",
- vbasedev->name);
- } else {
- error_setg(&err,
- "%s: Migration couldn't be initialized for VFIO device, "
- "err: %d (%s)",
- vbasedev->name, ret, strerror(-ret));
- }
-
+ if (!vfio_migration_init(vbasedev, &err)) {
+ error_prepend(&err, "%s: VFIO migration init failed: ", vbasedev->name);
return !vfio_block_migration(vbasedev, err, errp);
}
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [PATCH v3 10/14] vfio/migration: Add Error ** parameter to vfio_migration_init()
2026-06-09 7:58 ` [PATCH v3 10/14] vfio/migration: Add Error ** parameter to vfio_migration_init() Avihai Horon
@ 2026-06-09 12:09 ` Cédric Le Goater
0 siblings, 0 replies; 27+ messages in thread
From: Cédric Le Goater @ 2026-06-09 12:09 UTC (permalink / raw)
To: Avihai Horon, qemu-devel
Cc: Alex Williamson, Peter Xu, Fabiano Rosas, Pierrick Bouvier,
Philippe Mathieu-Daudé, Zhao Liu, Halil Pasic,
Christian Borntraeger, Jason Herne, Richard Henderson,
Ilya Leoshkevich, David Hildenbrand, Eric Farman, Matthew Rosato,
Cornelia Huck, Eric Blake, Vladimir Sementsov-Ogievskiy,
John Snow, Markus Armbruster, Maor Gottlieb
On 6/9/26 09:58, Avihai Horon wrote:
> vfio_migration_init() already has many failure points and a new one will
> be added in next patch.
>
> Add Error ** parameter to vfio_migration_init() to report a detailed
> error message through it. Refactor it to return bool as well.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
> hw/vfio/migration.c | 36 ++++++++++++++++++------------------
> 1 file changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 45f8e346b4..3ab6b7248f 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -1056,7 +1056,7 @@ static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
> return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> }
>
> -static int vfio_migration_init(VFIODevice *vbasedev)
> +static bool vfio_migration_init(VFIODevice *vbasedev, Error **errp)
> {
> int ret;
> Object *obj;
> @@ -1067,22 +1067,32 @@ static int vfio_migration_init(VFIODevice *vbasedev)
> VMChangeStateHandler *prepare_cb;
>
> if (!vbasedev->ops->vfio_get_object) {
> - return -EINVAL;
> + error_setg(errp, "no vfio_get_object handler");
> + return false;
> }
>
> obj = vbasedev->ops->vfio_get_object(vbasedev);
> if (!obj) {
> - return -EINVAL;
> + error_setg(errp, "failed to get object");
> + return false;
> }
>
> ret = vfio_migration_query_flags(vbasedev, &mig_flags);
> if (ret) {
> - return ret;
> + if (ret == -ENOTTY) {
> + error_setg_errno(errp, -ret,
> + "migration is not supported in kernel");
> + } else {
> + error_setg_errno(errp, -ret, "failed to query migration flags");
> + }
> +
> + return false;
> }
>
> /* Basic migration functionality must be supported */
> if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
> - return -EOPNOTSUPP;
> + error_setg(errp, "VFIO_MIGRATION_STOP_COPY is not supported");
> + return false;
> }
>
> vbasedev->migration = g_new0(VFIOMigration, 1);
> @@ -1113,7 +1123,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
> migration_add_notifier(&migration->migration_state,
> vfio_migration_state_notifier);
>
> - return 0;
> + return true;
> }
>
> static Error *multiple_devices_migration_blocker;
> @@ -1279,18 +1289,8 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
> return !vfio_block_migration(vbasedev, err, errp);
> }
>
> - ret = vfio_migration_init(vbasedev);
> - if (ret) {
> - if (ret == -ENOTTY) {
> - error_setg(&err, "%s: VFIO migration is not supported in kernel",
> - vbasedev->name);
> - } else {
> - error_setg(&err,
> - "%s: Migration couldn't be initialized for VFIO device, "
> - "err: %d (%s)",
> - vbasedev->name, ret, strerror(-ret));
> - }
> -
> + if (!vfio_migration_init(vbasedev, &err)) {
> + error_prepend(&err, "%s: VFIO migration init failed: ", vbasedev->name);
> return !vfio_block_migration(vbasedev, err, errp);
> }
>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Thanks,
C.
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 11/14] vfio/migration: Add new switchover-ack mechanism
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (9 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 10/14] vfio/migration: Add Error ** parameter to vfio_migration_init() Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 12/14] vfio/migration: Implement VFIO_PRECOPY_INFO_REINIT feature Avihai Horon
` (2 subsequent siblings)
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Add support for the new switchover-ack mechanism. This includes
requesting a switchover ACK on the first save_query_pending call (with
exact=false) if VFIO precopy is supported.
This achieves the same functionality as legacy switchover-ack but with
the new switchover-ack mechanism.
Keep legacy switchover-ack functionality for backward compatibility.
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
hw/vfio/vfio-migration-internal.h | 1 +
hw/vfio/migration.c | 13 ++++++++++++-
hw/vfio/trace-events | 2 +-
3 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h
index a15fc74703..dc741e5142 100644
--- a/hw/vfio/vfio-migration-internal.h
+++ b/hw/vfio/vfio-migration-internal.h
@@ -58,6 +58,7 @@ typedef struct VFIOMigration {
bool multifd_transfer;
VFIOMultifd *multifd;
bool initial_data_sent;
+ bool request_switchover_ack;
bool event_save_iterate_started;
bool event_precopy_empty_hit;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 3ab6b7248f..ebe2eafded 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -582,6 +582,9 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
}
vfio_query_precopy_size(migration);
+ if (migrate_switchover_ack() && !migrate_switchover_ack_legacy()) {
+ migration->request_switchover_ack = true;
+ }
break;
case VFIO_DEVICE_STATE_STOP:
@@ -634,6 +637,7 @@ static void vfio_save_cleanup(void *opaque)
migration->precopy_init_size = 0;
migration->precopy_dirty_size = 0;
migration->initial_data_sent = false;
+ migration->request_switchover_ack = false;
vfio_migration_cleanup(vbasedev);
trace_vfio_save_cleanup(vbasedev->name);
}
@@ -655,6 +659,7 @@ static void vfio_state_pending(void *opaque, MigPendingData *pending,
VFIODevice *vbasedev = opaque;
VFIOMigration *migration = vbasedev->migration;
uint64_t precopy_size, stopcopy_size;
+ bool request_switchover_ack = false;
/*
* The final pending query runs during switchover downtime. VFIO does not
@@ -676,10 +681,16 @@ static void vfio_state_pending(void *opaque, MigPendingData *pending,
pending->precopy_bytes += precopy_size;
pending->stopcopy_bytes += stopcopy_size;
+ if (migration->request_switchover_ack) {
+ pending->switchover_ack_pending++;
+ request_switchover_ack = true;
+ migration->request_switchover_ack = false;
+ }
trace_vfio_state_pending(vbasedev->name, migration->stopcopy_size,
migration->precopy_init_size,
- migration->precopy_dirty_size, exact, final);
+ migration->precopy_dirty_size,
+ request_switchover_ack, exact, final);
}
static bool vfio_is_active_iterate(void *opaque)
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 411a306635..53f0ba357b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -175,7 +175,7 @@ vfio_save_device_config_state(const char *name) " (%s)"
vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64
vfio_save_iterate_start(const char *name) " (%s)"
vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size %"PRIu64
-vfio_state_pending(const char *name, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size, bool exact, bool final) " (%s) stopcopy size %"PRIu64", precopy initial size %"PRIu64", precopy dirty size %"PRIu64", exact %d, final %d"
+vfio_state_pending(const char *name, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size, bool request_switchover_ack, bool exact, bool final) " (%s) stopcopy size %"PRIu64", precopy initial size %"PRIu64", precopy dirty size %"PRIu64", request switchover ack %d, exact %d, final %d"
vfio_send_init_data_flag(const char *name) " (%s)"
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* [PATCH v3 12/14] vfio/migration: Implement VFIO_PRECOPY_INFO_REINIT feature
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (10 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 11/14] vfio/migration: Add new switchover-ack mechanism Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-09 7:58 ` [PATCH v3 13/14] vfio/migration: Check VFIO_PRECOPY_INFO_REINIT during switchover Avihai Horon
2026-06-09 7:58 ` [PATCH v3 14/14] migration: Enable new switchover-ack Avihai Horon
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
According to VFIO uAPI, precopy initial_bytes is considered as critical
data that should be transferred and loaded prior to moving to STOP_COPY
state to ensure precopy phase would be effective.
As currently defined, initial_bytes can only decrease as it's being read
from the data fd. However, there are cases where a new chunk of
initial_bytes should be transferred during precopy.
The new VFIO_PRECOPY_INFO_REINIT feature addresses this and allows
reporting a new value for initial_bytes regardless of any previously
reported values.
Implement VFIO_PRECOPY_INFO_REINIT feature:
1. Opt-in for VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 to make
VFIO_PRECOPY_INFO_REINIT available.
2. Request a new switchover ACK if initial_bytes increases after a
previous switchover ACK. This ensures the device is not moved to
STOP_COPY before initial_bytes has reached zero again.
Acked-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
docs/devel/migration/vfio.rst | 14 +++++++
hw/vfio/vfio-migration-internal.h | 1 +
hw/vfio/migration.c | 68 ++++++++++++++++++++++++++++---
hw/vfio/trace-events | 4 +-
4 files changed, 80 insertions(+), 7 deletions(-)
diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst
index 854277b11c..f235c2d4f9 100644
--- a/docs/devel/migration/vfio.rst
+++ b/docs/devel/migration/vfio.rst
@@ -23,6 +23,20 @@ and recommends that the initial bytes are sent and loaded in the destination
before stopping the source VM. Enabling this migration capability will
guarantee that and thus, can potentially reduce downtime even further.
+For example, in mlx5 devices, the initial bytes hold metadata used for time
+consuming pre-allocations of resources on the destination. Although init bytes
+may be small in size and sending them may take little time, loading them in the
+destination can take a significant amount of time. Switchover-ack guarantees
+that this pre-allocation doesn't happen during downtime.
+
+Initial bytes was originally defined to be monotonically decreasing, however
+there are cases where a new chunk of initial bytes should be transferred during
+precopy, e.g., due to a device reconfiguration, etc. The
+VFIO_PRECOPY_INFO_REINIT feature addresses this and when supported, allows to
+report a new initial bytes value regardless of any previously reported values.
+In this case, a new switchover ACK will be requested to make sure the new
+initial bytes are loaded in the destination before switching over.
+
To support migration of multiple devices that might do P2P transactions between
themselves, VFIO migration uAPI defines an intermediate P2P quiescent state.
While in the P2P quiescent state, P2P DMA transactions cannot be initiated by
diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h
index dc741e5142..a1c58b1126 100644
--- a/hw/vfio/vfio-migration-internal.h
+++ b/hw/vfio/vfio-migration-internal.h
@@ -45,6 +45,7 @@ typedef struct VFIOMigration {
void *data_buffer;
size_t data_buffer_size;
uint64_t mig_flags;
+ bool precopy_info_v2_used;
/*
* NOTE: all three sizes cached are reported from VFIO's uAPI, which
* are defined as estimate only. QEMU should not trust these values
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index ebe2eafded..1e172dd10b 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -373,9 +373,11 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev)
static int vfio_query_precopy_size(VFIOMigration *migration)
{
+ VFIODevice *vbasedev = migration->vbasedev;
struct vfio_precopy_info precopy = {
.argsz = sizeof(precopy),
};
+ bool reinit = false;
int ret = 0;
if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
@@ -383,25 +385,43 @@ static int vfio_query_precopy_size(VFIOMigration *migration)
migration->precopy_dirty_size = 0;
ret = -errno;
warn_report_once("VFIO device %s ioctl(VFIO_MIG_GET_PRECOPY_INFO) "
- "failed (%d)", migration->vbasedev->name, ret);
+ "failed (%d)", vbasedev->name, ret);
} else {
bool overflow;
migration->precopy_init_size = precopy.initial_bytes;
migration->precopy_dirty_size = precopy.dirty_bytes;
+ /*
+ * struct vfio_precopy_info.flags is valid only if
+ * VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 is used.
+ */
+ if (migration->precopy_info_v2_used) {
+ reinit = precopy.flags & VFIO_PRECOPY_INFO_REINIT;
+ }
- overflow = vfio_migration_check_overflow(migration->vbasedev,
+ overflow = vfio_migration_check_overflow(vbasedev,
migration->precopy_init_size, "precopy init size");
- overflow |= vfio_migration_check_overflow(migration->vbasedev,
+ overflow |= vfio_migration_check_overflow(vbasedev,
migration->precopy_dirty_size, "precopy dirty size");
if (overflow) {
ret = -ERANGE;
}
}
- trace_vfio_query_precopy_size(migration->vbasedev->name,
- migration->precopy_init_size,
- migration->precopy_dirty_size, ret);
+ trace_vfio_query_precopy_size(vbasedev->name, migration->precopy_init_size,
+ migration->precopy_dirty_size, reinit, ret);
+
+ /*
+ * If we got new initial_bytes after previous initial_bytes were
+ * transferred, request a new switchover ACK. Don't request if legacy
+ * switchover-ack is used.
+ */
+ if (reinit && migration->initial_data_sent &&
+ !migrate_switchover_ack_legacy()) {
+ migration->initial_data_sent = false;
+ migration->request_switchover_ack = true;
+ trace_vfio_query_precopy_size_request_switchover_ack(vbasedev->name);
+ }
return ret;
}
@@ -1054,6 +1074,27 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
return 0;
}
+/* Returns 1 on success, 0 if not supported and negative errno on failure */
+static int vfio_migration_set_precopy_info_v2(VFIODevice *vbasedev)
+{
+ uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+ sizeof(uint64_t))] = {};
+ struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+
+ feature->argsz = sizeof(buf);
+ feature->flags =
+ VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2;
+ if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+ if (errno == ENOTTY) {
+ return 0;
+ }
+
+ return -errno;
+ }
+
+ return 1;
+}
+
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
@@ -1075,6 +1116,7 @@ static bool vfio_migration_init(VFIODevice *vbasedev, Error **errp)
char id[256] = "";
g_autofree char *path = NULL, *oid = NULL;
uint64_t mig_flags = 0;
+ bool precopy_info_v2_used = false;
VMChangeStateHandler *prepare_cb;
if (!vbasedev->ops->vfio_get_object) {
@@ -1106,12 +1148,22 @@ static bool vfio_migration_init(VFIODevice *vbasedev, Error **errp)
return false;
}
+ if (mig_flags & VFIO_MIGRATION_PRE_COPY) {
+ ret = vfio_migration_set_precopy_info_v2(vbasedev);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "failed to set precopy info v2");
+ return false;
+ }
+ precopy_info_v2_used = ret;
+ }
+
vbasedev->migration = g_new0(VFIOMigration, 1);
migration = vbasedev->migration;
migration->vbasedev = vbasedev;
migration->device_state = VFIO_DEVICE_STATE_RUNNING;
migration->data_fd = -1;
migration->mig_flags = mig_flags;
+ migration->precopy_info_v2_used = precopy_info_v2_used;
vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
@@ -1134,6 +1186,10 @@ static bool vfio_migration_init(VFIODevice *vbasedev, Error **errp)
migration_add_notifier(&migration->migration_state,
vfio_migration_state_notifier);
+ trace_vfio_migration_init(vbasedev->name, migration->mig_flags,
+ migration->precopy_info_v2_used,
+ vbasedev->dirty_pages_supported);
+
return true;
}
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 53f0ba357b..9f5c30d2f2 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -158,11 +158,13 @@ vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx
vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32
vfio_load_state_device_buffer_end(const char *name) " (%s)"
+vfio_migration_init(const char *name, uint64_t mig_flags, bool precopy_info_v2_used, bool dirty_pages_supported) " (%s) mig_flags 0x%"PRIx64", precopy_info_v2_used %d, dirty_pages_supported %d"
vfio_migration_realize(const char *name) " (%s)"
vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s"
vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s"
vfio_migration_state_notifier(const char *name, int state) " (%s) state %d"
-vfio_query_precopy_size(const char *name, uint64_t init_size, uint64_t dirty_size, int ret) " (%s) init %"PRIu64" dirty %"PRIu64" ret %d"
+vfio_query_precopy_size(const char *name, uint64_t init_size, uint64_t dirty_size, bool reinit, int ret) " (%s) init %"PRIu64", dirty %"PRIu64", reinit %d, ret %d"
+vfio_query_precopy_size_request_switchover_ack(const char *name) " (%s)"
vfio_query_stop_copy_size(const char *name, uint64_t size, int ret) " (%s) stopcopy size %"PRIu64" ret %d"
vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
vfio_save_block_precopy_empty_hit(const char *name) " (%s)"
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* [PATCH v3 13/14] vfio/migration: Check VFIO_PRECOPY_INFO_REINIT during switchover
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (11 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 12/14] vfio/migration: Implement VFIO_PRECOPY_INFO_REINIT feature Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
2026-06-12 16:22 ` Peter Xu
2026-06-09 7:58 ` [PATCH v3 14/14] migration: Enable new switchover-ack Avihai Horon
13 siblings, 1 reply; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
VFIO_PRECOPY_INFO_REINIT is checked only during precopy, before the
switchover decision. However, the switchover decision and guest stop are
not atomic, so a VFIO device may want to set VFIO_PRECOPY_INFO_REINIT
and request another switchover ACK in the gap after switchover decision
has been made but before the guest is stopped. This would be missed and
may increase downtime.
Solve this by checking if VFIO_PRECOPY_INFO_REINIT was set during that
gap, and request a new switchover-ack in the final save_state_pending
call. Query precopy info after vCPUs are stopped but before
transitioning from PRE_COPY state, when it's valid to call the ioctl.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
hw/vfio/migration.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 1e172dd10b..73c49d8c24 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -685,6 +685,9 @@ static void vfio_state_pending(void *opaque, MigPendingData *pending,
* The final pending query runs during switchover downtime. VFIO does not
* need a fresh device pending-data query then to get the latest dirty
* data, so avoid the extra work and report the cached counters below.
+ * On the other hand, precopy sync is needed to check if switchover ACK was
+ * requested, but that's already done during guest stop when device is in
+ * PRE_COPY state.
*/
if (exact && !final) {
vfio_state_pending_sync(vbasedev);
@@ -964,6 +967,26 @@ static const SaveVMHandlers savevm_vfio_handlers = {
/* ---------------------------------------------------------------------- */
+static void vfio_final_precopy_reinit_check(VFIODevice *vbasedev)
+{
+ VFIOMigration *migration = vbasedev->migration;
+ int ret;
+
+ if (!migration->precopy_info_v2_used || !migrate_switchover_ack() ||
+ migrate_switchover_ack_legacy()) {
+ return;
+ }
+
+ ret = vfio_query_precopy_size(migration);
+ if (ret) {
+ error_report("%s: Final precopy reinit check failed (err: %d)",
+ vbasedev->name, ret);
+ /* If query failed, assume reinit and request switchover-ack */
+ migration->request_switchover_ack = true;
+ migration->initial_data_sent = false;
+ }
+}
+
static void vfio_vmstate_change_prepare(void *opaque, bool running,
RunState state)
{
@@ -977,6 +1000,15 @@ static void vfio_vmstate_change_prepare(void *opaque, bool running,
VFIO_DEVICE_STATE_PRE_COPY_P2P :
VFIO_DEVICE_STATE_RUNNING_P2P;
+ if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+ /*
+ * Now that vCPUs are stopped, check if new init_bytes are available
+ * since switchover decision, to be reported in the final
+ * save_query_pending.
+ */
+ vfio_final_precopy_reinit_check(vbasedev);
+ }
+
ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
if (ret) {
/*
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH v3 13/14] vfio/migration: Check VFIO_PRECOPY_INFO_REINIT during switchover
2026-06-09 7:58 ` [PATCH v3 13/14] vfio/migration: Check VFIO_PRECOPY_INFO_REINIT during switchover Avihai Horon
@ 2026-06-12 16:22 ` Peter Xu
0 siblings, 0 replies; 27+ messages in thread
From: Peter Xu @ 2026-06-12 16:22 UTC (permalink / raw)
To: Avihai Horon
Cc: qemu-devel, Alex Williamson, Cédric Le Goater, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb
On Tue, Jun 09, 2026 at 10:58:11AM +0300, Avihai Horon wrote:
> VFIO_REPCOPY_INFO_REINIT is checked only during precopy, before the
> switchover decision. However, the switchover decision and guest stop are
> not atomic, so a VFIO device may want to set VFIO_PRECOPY_INFO_REINIT
> and request another switchover ACK in the gap after switchover decision
> has been made but before the guest is stopped. This would be missed and
> may increase downtime.
>
> Solve this by checking if VFIO_PRECOPY_INFO_REINIT was set during that
> gap, and request a new switchover-ack in the final save_state_pending
> call. Query precopy info after vCPUs are stopped but before
> transitioning from PRE_COPY state, when its valid to call the ioctl.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
This smells like a workaround for an improper kernel ABI, but I assume
it's not a huge deal.
Acked-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v3 14/14] migration: Enable new switchover-ack
2026-06-09 7:57 [PATCH v3 00/14] Make switchover-ack re-usable and add VFIO precopy REINIT feature Avihai Horon
` (12 preceding siblings ...)
2026-06-09 7:58 ` [PATCH v3 13/14] vfio/migration: Check VFIO_PRECOPY_INFO_REINIT during switchover Avihai Horon
@ 2026-06-09 7:58 ` Avihai Horon
13 siblings, 0 replies; 27+ messages in thread
From: Avihai Horon @ 2026-06-09 7:58 UTC (permalink / raw)
To: qemu-devel
Cc: Alex Williamson, Cédric Le Goater, Peter Xu, Fabiano Rosas,
Pierrick Bouvier, Philippe Mathieu-Daudé, Zhao Liu,
Halil Pasic, Christian Borntraeger, Jason Herne,
Richard Henderson, Ilya Leoshkevich, David Hildenbrand,
Eric Farman, Matthew Rosato, Cornelia Huck, Eric Blake,
Vladimir Sementsov-Ogievskiy, John Snow, Markus Armbruster,
Maor Gottlieb, Avihai Horon
Now that VFIO has implemented new switchover-ack, enable it.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
migration/options.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/migration/options.c b/migration/options.c
index 4c9b25372e..dfce19405d 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -110,9 +110,8 @@ const Property migration_properties[] = {
preempt_pre_7_2, false),
DEFINE_PROP_BOOL("multifd-clean-tls-termination", MigrationState,
multifd_clean_tls_termination, true),
- /* Use legacy until VFIO implements new switchover-ack */
DEFINE_PROP_BOOL("switchover-ack-legacy", MigrationState,
- switchover_ack_legacy, true),
+ switchover_ack_legacy, false),
/* Migration parameters */
DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
--
2.40.1
^ permalink raw reply related [flat|nested] 27+ messages in thread