Hi Lukas, On Tue, Mar 10, 2026 at 04:29:54PM +0100, Lukas Straub wrote: > On Tue, 10 Mar 2026 20:17:57 +0530 > Arun Menon wrote: > > > Hi Lukas, > > > > On Mon, Mar 02, 2026 at 12:45:28PM +0100, Lukas Straub wrote: > > > Add a COLO migration test for COLO migration and failover. > > > > > > Reviewed-by: Fabiano Rosas > > > Tested-by: Fabiano Rosas > > > Reviewed-by: Peter Xu > > > Signed-off-by: Lukas Straub > > > --- > > > MAINTAINERS | 1 + > > > tests/qtest/meson.build | 7 +- > > > tests/qtest/migration-test.c | 1 + > > > tests/qtest/migration/colo-tests.c | 198 +++++++++++++++++++++++++++++++++++++ > > > tests/qtest/migration/framework.h | 5 + > > > 5 files changed, 211 insertions(+), 1 deletion(-) > > > > > > diff --git a/MAINTAINERS b/MAINTAINERS > > > index d2a1f4cc08223cb944b61e32a6d89e25bf82eacb..1b0ae10750036be00571b7104ad8426c071bb54c 100644 > > > --- a/MAINTAINERS > > > +++ b/MAINTAINERS > > > @@ -3875,6 +3875,7 @@ F: migration/colo* > > > F: migration/multifd-colo.* > > > F: include/migration/colo.h > > > F: include/migration/failover.h > > > +F: tests/qtest/migration/colo-tests.c > > > F: docs/COLO-FT.txt > > > > > > COLO Proxy > > > diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build > > > index 25fdbc798010b19e8ec9b6ab55e02d3fb5741398..6a46e2a767de12d978d910ddb6de175bce9810b8 100644 > > > --- a/tests/qtest/meson.build > > > +++ b/tests/qtest/meson.build > > > @@ -374,6 +374,11 @@ if gnutls.found() > > > endif > > > endif > > > > > > +migration_colo_files = [] > > > +if get_option('replication').allowed() > > > + migration_colo_files = [files('migration/colo-tests.c')] > > > +endif > > > + > > > qtests = { > > > 'aspeed_hace-test': files('aspeed-hace-utils.c', 'aspeed_hace-test.c'), > > > 'aspeed_smc-test': files('aspeed-smc-utils.c', 'aspeed_smc-test.c'), > > > @@ -385,7 +390,7 @@ qtests = { > > > 'migration/migration-util.c') + dbus_vmstate1, > > > 'erst-test': files('erst-test.c'), > > > 'ivshmem-test': [rt, 
'../../contrib/ivshmem-server/ivshmem-server.c'], > > > - 'migration-test': test_migration_files + migration_tls_files, > > > + 'migration-test': test_migration_files + migration_tls_files + migration_colo_files, > > > 'pxe-test': files('boot-sector.c'), > > > 'pnv-xive2-test': files('pnv-xive2-common.c', 'pnv-xive2-flush-sync.c', > > > 'pnv-xive2-nvpg_bar.c'), > > > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c > > > index 08936871741535c926eeac40a7d7c3f461c72fd0..e582f05c7dc2673dbd05a936df8feb6c964b5bbc 100644 > > > --- a/tests/qtest/migration-test.c > > > +++ b/tests/qtest/migration-test.c > > > @@ -55,6 +55,7 @@ int main(int argc, char **argv) > > > migration_test_add_precopy(env); > > > migration_test_add_cpr(env); > > > migration_test_add_misc(env); > > > + migration_test_add_colo(env); > > > > > > ret = g_test_run(); > > > > > > diff --git a/tests/qtest/migration/colo-tests.c b/tests/qtest/migration/colo-tests.c > > > new file mode 100644 > > > index 0000000000000000000000000000000000000000..598a1d3821ed0a90318732702027cebad47352fd > > > --- /dev/null > > > +++ b/tests/qtest/migration/colo-tests.c > > > @@ -0,0 +1,198 @@ > > > +/* > > > + * SPDX-License-Identifier: GPL-2.0-or-later > > > + * > > > + * QTest testcases for COLO migration > > > + * > > > + * Copyright (c) 2025 Lukas Straub > > > + * > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > > > + * See the COPYING file in the top-level directory. 
> > > + * > > > + */ > > > + > > > +#include "qemu/osdep.h" > > > +#include "libqtest.h" > > > +#include "migration/framework.h" > > > +#include "migration/migration-qmp.h" > > > +#include "migration/migration-util.h" > > > +#include "qemu/module.h" > > > + > > > +static int test_colo_common(MigrateCommon *args, > > > + bool failover_during_checkpoint, > > > + bool primary_failover) > > > +{ > > > + QTestState *from, *to; > > > + void *data_hook = NULL; > > > + > > > + /* > > > + * For the COLO test, both VMs will run in parallel. Thus both VMs want to > > > + * open the image read/write at the same time. Using read-only=on is not > > > + * possible here, because ide-hd does not support read-only backing image. > > > + * > > > + * So use -snapshot, where each qemu instance creates its own writable > > > + * snapshot internally while leaving the real image read-only. > > > + */ > > > + args->start.opts_source = "-snapshot"; > > > + args->start.opts_target = "-snapshot"; > > > + > > > + /* > > > + * COLO migration code logs many errors when the migration socket > > > + * is shut down, these are expected so we hide them here. > > > + */ > > > + args->start.hide_stderr = true; > > > + > > > + /* > > > + * Test with yank with out of band capability since that is how it is > > > + * used in production. 
> > > + */ > > > + args->start.oob = true; > > > + args->start.caps[MIGRATION_CAPABILITY_X_COLO] = true; > > > + > > > + if (migrate_start(&from, &to, args->listen_uri, &args->start)) { > > > + return -1; > > > + } > > > + > > > + migrate_set_parameter_int(from, "x-checkpoint-delay", 300); > > > + > > > + if (args->start_hook) { > > > + data_hook = args->start_hook(from, to); > > > + } > > > + > > > + migrate_ensure_converge(from); > > > + wait_for_serial("src_serial"); > > > + > > > + migrate_qmp(from, to, args->connect_uri, NULL, "{}"); > > > + > > > + wait_for_migration_status(from, "colo", NULL); > > > + wait_for_resume(to, get_dst()); > > > + > > > + wait_for_serial("src_serial"); > > > + wait_for_serial("dest_serial"); > > > + > > > + /* wait for 3 checkpoints */ > > > + for (int i = 0; i < 3; i++) { > > > + qtest_qmp_eventwait(to, "RESUME"); > > > + wait_for_serial("src_serial"); > > > + wait_for_serial("dest_serial"); > > > + } > > > + > > > + if (failover_during_checkpoint) { > > > + qtest_qmp_eventwait(to, "STOP"); > > > + } > > > + if (primary_failover) { > > > + qtest_qmp_assert_success(from, "{'exec-oob': 'yank', 'id': 'yank-cmd', " > > > + "'arguments': {'instances':" > > > + "[{'type': 'migration'}]}}"); > > > + qtest_qmp_assert_success(from, "{'execute': 'x-colo-lost-heartbeat'}"); > > > + wait_for_serial("src_serial"); > > > + } else { > > > + qtest_qmp_assert_success(to, "{'exec-oob': 'yank', 'id': 'yank-cmd', " > > > + "'arguments': {'instances':" > > > + "[{'type': 'migration'}]}}"); > > > + qtest_qmp_assert_success(to, "{'execute': 'x-colo-lost-heartbeat'}"); > > > + wait_for_serial("dest_serial"); > > > + } > > > + > > > + if (args->end_hook) { > > > + args->end_hook(from, to, data_hook); > > > + } > > > + > > > + migrate_end(from, to, !primary_failover); > > > + > > > + return 0; > > > +} > > > + > > > +static void test_colo_plain_common(MigrateCommon *args, > > > + bool failover_during_checkpoint, > > > + bool primary_failover) > > > +{ > > 
> + args->listen_uri = "tcp:127.0.0.1:0"; > > > + test_colo_common(args, failover_during_checkpoint, primary_failover); > > > +} > > > + > > > +static void *hook_start_multifd(QTestState *from, QTestState *to) > > > +{ > > > + return migrate_hook_start_precopy_tcp_multifd_common(from, to, "none"); > > > +} > > > + > > > +static void test_colo_multifd_common(MigrateCommon *args, > > > + bool failover_during_checkpoint, > > > + bool primary_failover) > > > +{ > > > + args->listen_uri = "defer"; > > > + args->start_hook = hook_start_multifd; > > > + args->start.caps[MIGRATION_CAPABILITY_MULTIFD] = true; > > > + test_colo_common(args, failover_during_checkpoint, primary_failover); > > > +} > > > + > > > +static void test_colo_plain_primary_failover(char *name, MigrateCommon *args) > > > +{ > > > + test_colo_plain_common(args, false, true); > > > +} > > > + > > > +static void test_colo_plain_secondary_failover(char *name, MigrateCommon *args) > > > +{ > > > + test_colo_plain_common(args, false, false); > > > +} > > > + > > > +static void test_colo_multifd_primary_failover(char *name, MigrateCommon *args) > > > +{ > > > + test_colo_multifd_common(args, false, true); > > > +} > > > + > > > +static void test_colo_multifd_secondary_failover(char *name, > > > + MigrateCommon *args) > > > +{ > > > + test_colo_multifd_common(args, false, false); > > > +} > > > + > > > +static void test_colo_plain_primary_failover_checkpoint(char *name, > > > + MigrateCommon *args) > > > +{ > > > + test_colo_plain_common(args, true, true); > > > +} > > > + > > > +static void test_colo_plain_secondary_failover_checkpoint(char *name, > > > + MigrateCommon *args) > > > +{ > > > + test_colo_plain_common(args, true, false); > > > +} > > > + > > > +static void test_colo_multifd_primary_failover_checkpoint(char *name, > > > + MigrateCommon *args) > > > +{ > > > + test_colo_multifd_common(args, true, true); > > > +} > > > + > > > +static void test_colo_multifd_secondary_failover_checkpoint(char *name, 
> > > + MigrateCommon *args) > > > +{ > > > + test_colo_multifd_common(args, true, false); > > > +} > > > + > > > +void migration_test_add_colo(MigrationTestEnv *env) > > > +{ > > > + if (!env->full_set) { > > > + return; > > > + } > > > + > > > + migration_test_add("/migration/colo/plain/primary_failover", > > > + test_colo_plain_primary_failover); > > > + migration_test_add("/migration/colo/plain/secondary_failover", > > > + test_colo_plain_secondary_failover); > > > + > > > + migration_test_add("/migration/colo/multifd/primary_failover", > > > + test_colo_multifd_primary_failover); > > > + migration_test_add("/migration/colo/multifd/secondary_failover", > > > + test_colo_multifd_secondary_failover); > > > + > > > + migration_test_add("/migration/colo/plain/primary_failover_checkpoint", > > > + test_colo_plain_primary_failover_checkpoint); > > > + migration_test_add("/migration/colo/plain/secondary_failover_checkpoint", > > > + test_colo_plain_secondary_failover_checkpoint); > > > + > > > + migration_test_add("/migration/colo/multifd/primary_failover_checkpoint", > > > + test_colo_multifd_primary_failover_checkpoint); > > > + migration_test_add("/migration/colo/multifd/secondary_failover_checkpoint", > > > + test_colo_multifd_secondary_failover_checkpoint); > > > +} > > > diff --git a/tests/qtest/migration/framework.h b/tests/qtest/migration/framework.h > > > index 40984d04930da2d181326d9f6a742bde49018103..80eef758932ce9c301ed6c0f6383d18756144870 100644 > > > --- a/tests/qtest/migration/framework.h > > > +++ b/tests/qtest/migration/framework.h > > > @@ -264,5 +264,10 @@ void migration_test_add_file(MigrationTestEnv *env); > > > void migration_test_add_precopy(MigrationTestEnv *env); > > > void migration_test_add_cpr(MigrationTestEnv *env); > > > void migration_test_add_misc(MigrationTestEnv *env); > > > +#ifdef CONFIG_REPLICATION > > > +void migration_test_add_colo(MigrationTestEnv *env); > > > +#else > > > +static inline void 
migration_test_add_colo(MigrationTestEnv *env) {}; > > > +#endif > > > > > > #endif /* TEST_FRAMEWORK_H */ > > > > > > -- > > > 2.39.5 > > > > > > > > > > I was running the qtests locally, and I encountered a timeout error. > > > > Command run: mkdir -p build ; cd build ; make check-qtest-x86_64; > > > > Following is the output: > > ====== > > 67/67 qtest+qtest-x86_64 - qemu:qtest-x86_64/migration-test TIMEOUT 480.05s killed by signal 15 SIGTERM > > >>> QTEST_QEMU_IMG=./qemu-img LD_LIBRARY_PATH=/home/arun/workdir/new/devel/upstream/qemu-priv/build/subprojects/slirp RUST_BACKTRACE=1 QTEST_QEMU_BINARY=./qemu-system-x86_64 UBSAN_OPTIONS=halt_on_error=1:abort_on_error=1:print_summary=1:print_stacktrace=1 G_TEST_DBUS_DAEMON=/home/arun/workdir/new/devel/upstream/qemu-priv/tests/dbus-vmstate-daemon.sh ASAN_OPTIONS=halt_on_error=1:abort_on_error=1:print_summary=1 PYTHON=/home/arun/workdir/new/devel/upstream/qemu-priv/build/pyvenv/bin/python3 MESON_TEST_ITERATION=1 MALLOC_PERTURB_=53 QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon MSAN_OPTIONS=halt_on_error=1:abort_on_error=1:print_summary=1:print_stacktrace=1 /home/arun/workdir/new/devel/upstream/qemu-priv/build/tests/qtest/migration-test --tap -k --full > > stderr: > > > > TAP parsing error: Too few tests run (expected 52, got 47) > > > > Summary of Failures: > > 67/67 qtest+qtest-x86_64 - qemu:qtest-x86_64/migration-test TIMEOUT 480.05s killed by signal 15 SIGTERM > > Ok: 64 > > Fail: 0 > > Skipped: 2 > > Timeout: 1 > > ====== > > > > It seems that the test runner is stuck waiting for some input. > > Following is the stack trace > > ====== > > > ps afx > > 127267 pts/0 S+ 0:00 | | \_ make check-qtest-x86_64 -j8 > > 128245 pts/0 S+ 0:01 | | \_ /home/arun/workdir/new/devel/upstream/qemu-priv/build/pyvenv/bin/python3 /home/arun/workdir > > 128276 ? Ssl 0:07 | | \_ /home/arun/workdir/new/devel/upstream/qemu-priv/build/tests/qtest/migration-test --tap > > 134107 ? 
Sl 0:20 | | \_ ./qemu-system-x86_64 -qtest unix:/tmp/qtest-128276.sock -qtest-log /dev/null -chard > > 134115 ? Sl 0:22 | | \_ ./qemu-system-x86_64 -qtest unix:/tmp/qtest-128276.sock -qtest-log /dev/null -chard > > 5610 pts/2 Ss 0:01 | \_ /usr/bin/bash > > > > ====== > > gstack 128276 > > Thread 2 (Thread 0x7fdd090716c0 (LWP 128279) "call_rcu"): > > #0 0x00007fdd0921434d in syscall () from /lib64/libc.so.6 > > #1 0x0000557fd604563a in qemu_futex_wait (f=0x557fd60a0190 , val=4294967295) at /home/arun/workdir/new/devel/upstream/qemu-priv/include/qemu/futex.h:47 > > #2 0x0000557fd604584e in qemu_event_wait (ev=0x557fd60a0190 ) at ../util/event.c:162 > > #3 0x0000557fd6045fde in call_rcu_thread (opaque=0x0) at ../util/rcu.c:304 > > #4 0x0000557fd600e8fb in qemu_thread_start (args=0x557fd6beec70) at ../util/qemu-thread-posix.c:414 > > #5 0x00007fdd09193464 in start_thread () from /lib64/libc.so.6 > > #6 0x00007fdd092165ac in __clone3 () from /lib64/libc.so.6 > > > > Thread 1 (Thread 0x7fdd09073240 (LWP 128276) "migration-test"): > > #0 0x00007fdd0919b982 in __syscall_cancel_arch () from /lib64/libc.so.6 > > #1 0x00007fdd0918fc3c in __internal_syscall_cancel () from /lib64/libc.so.6 > > #2 0x00007fdd091dfb62 in clock_nanosleep@GLIBC_2.2.5 () from /lib64/libc.so.6 > > #3 0x00007fdd091ebb37 in nanosleep () from /lib64/libc.so.6 > > #4 0x00007fdd0921613a in usleep () from /lib64/libc.so.6 > > #5 0x0000557fd5fd99cd in wait_for_serial (side=0x557fd6065f08 "dest_serial") at ../tests/qtest/migration/framework.c:82 > > #6 0x0000557fd5fe5865 in test_colo_common (args=0x557fd6bfdf50, failover_during_checkpoint=false, primary_failover=true) at ../tests/qtest/migration/colo-tests.c:66 > > 60 migrate_qmp(from, to, args->connect_uri, NULL, "{}"); > 61 > 62 wait_for_migration_status(from, "colo", NULL); > 63 wait_for_resume(to, get_dst()); > 64 > 65 wait_for_serial("src_serial"); > 66 wait_for_serial("dest_serial"); > > Interesting, so the secondary guest is stuck/crashed after 
entering > colo state despite having resumed. > > It works fine here on master. And before the merge I have looped the > colo tests for a whole day on my machine without any failures. > > How often does this happen? What is the commit you are on, host, ASAN, > MSAN, UBSAN, configure options? With kvm or without? I have started the run on a fedora-43 VM. I am on commit de61484ec39f418e5c0d4603017695f9ffb8fe24 on the master branch. The configuration command I used was: ../configure --target-list=x86_64-softmmu --enable-debug --enable-trace-backends=log --enable-slirp Targets and accelerators KVM support : YES > > Can you try the following and show me the log: > (cd build && QTEST_QEMU_IMG=./qemu-img QTEST_QEMU_BINARY=./qemu-system-x86_64 QTEST_LOG=- tests/qtest/migration-test --full -p /x86_64/migration/colo/plain) > I have attached the log file, qemu.log, to this email. It is stuck at the last line: {"timestamp": {"seconds": 1773164370, "microseconds": 609791}, "event": "RESUME"} It is probably a configuration mistake on my end. Thank you for looking into it. 
> > #7 0x0000557fd5fe5a0f in test_colo_plain_common (args=0x557fd6bfdf50, failover_during_checkpoint=false, primary_failover=true) at ../tests/qtest/migration/colo-tests.c:106 > > #8 0x0000557fd5fe5ad7 in test_colo_plain_primary_failover (name=0x557fd6bfd050 "/migration/colo/plain/primary_failover", args=0x557fd6bfdf50) at ../tests/qtest/migration/colo-tests.c:126 > > #9 0x0000557fd5fdeff8 in migration_test_wrapper (data=0x557fd6bfd320) at ../tests/qtest/migration/migration-util.c:258 > > #10 0x00007fdd0947bf3e in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 > > #11 0x00007fdd0947beb3 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 > > #12 0x00007fdd0947beb3 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 > > #13 0x00007fdd0947beb3 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 > > #14 0x00007fdd0947beb3 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 > > #15 0x00007fdd0947c46a in g_test_run_suite () from /lib64/libglib-2.0.so.0 > > #16 0x00007fdd0947c500 in g_test_run () from /lib64/libglib-2.0.so.0 > > #17 0x0000557fd5fd9490 in main (argc=1, argv=0x7fff33b51908) at ../tests/qtest/migration-test.c:60 > > > > > > Is there something that I am missing? Can you please look into this? > > > > > > Regards, > > Arun Menon > > > Regards, Arun Menon