* [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd"
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-09 17:36 ` Peter Xu
2024-12-02 13:19 ` [PATCH V4 02/19] physmem: fd-based shared memory Steve Sistare
` (17 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Let's factor it out so we can reuse it.
Signed-off-by: David Hildenbrand <david@redhat.com>
---
backends/hostmem-shm.c | 45 ++++-----------------------------------
include/qemu/osdep.h | 2 ++
meson.build | 8 +++++--
util/oslib-posix.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
util/oslib-win32.c | 11 ++++++++++
5 files changed, 81 insertions(+), 43 deletions(-)
diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c
index 374edc3..837b9f1 100644
--- a/backends/hostmem-shm.c
+++ b/backends/hostmem-shm.c
@@ -25,11 +25,9 @@ struct HostMemoryBackendShm {
static bool
shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
- g_autoptr(GString) shm_name = g_string_new(NULL);
g_autofree char *backend_name = NULL;
uint32_t ram_flags;
- int fd, oflag;
- mode_t mode;
+ int fd;
if (!backend->size) {
error_setg(errp, "can't create shm backend with size 0");
@@ -41,48 +39,13 @@ shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
return false;
}
- /*
- * Let's use `mode = 0` because we don't want other processes to open our
- * memory unless we share the file descriptor with them.
- */
- mode = 0;
- oflag = O_RDWR | O_CREAT | O_EXCL;
- backend_name = host_memory_backend_get_name(backend);
-
- /*
- * Some operating systems allow creating anonymous POSIX shared memory
- * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not
- * defined by POSIX, so let's create a unique name.
- *
- * From Linux's shm_open(3) man-page:
- * For portable use, a shared memory object should be identified
- * by a name of the form /somename;"
- */
- g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%s", getpid(),
- backend_name);
-
- fd = shm_open(shm_name->str, oflag, mode);
+ fd = qemu_shm_alloc(backend->size, errp);
if (fd < 0) {
- error_setg_errno(errp, errno,
- "failed to create POSIX shared memory");
- return false;
- }
-
- /*
- * We have the file descriptor, so we no longer need to expose the
- * POSIX shared memory object. However it will remain allocated as long as
- * there are file descriptors pointing to it.
- */
- shm_unlink(shm_name->str);
-
- if (ftruncate(fd, backend->size) == -1) {
- error_setg_errno(errp, errno,
- "failed to resize POSIX shared memory to %" PRIu64,
- backend->size);
- close(fd);
return false;
}
+ /* Let's do the same as memory-backend-ram,share=on would do. */
+ backend_name = host_memory_backend_get_name(backend);
ram_flags = RAM_SHARED;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index fdff07f..e2a0e15 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -509,6 +509,8 @@ int qemu_daemon(int nochdir, int noclose);
void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared,
bool noreserve);
void qemu_anon_ram_free(void *ptr, size_t size);
+int qemu_shm_alloc(size_t size, Error **errp);
+bool qemu_shm_available(void);
#ifdef _WIN32
#define HAVE_CHARDEV_SERIAL 1
diff --git a/meson.build b/meson.build
index b09bfb1..4f58783 100644
--- a/meson.build
+++ b/meson.build
@@ -3706,9 +3706,13 @@ libqemuutil = static_library('qemuutil',
build_by_default: false,
sources: util_ss.sources() + stub_ss.sources() + genh,
dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc])
+qemuutil_deps = [event_loop_base]
+if host_os != 'windows'
+ qemuutil_deps += [rt]
+endif
qemuutil = declare_dependency(link_with: libqemuutil,
sources: genh + version_res,
- dependencies: [event_loop_base])
+ dependencies: qemuutil_deps)
if have_system or have_user
decodetree = generator(find_program('scripts/decodetree.py'),
@@ -4361,7 +4365,7 @@ if have_tools
subdir('contrib/elf2dmp')
executable('qemu-edid', files('qemu-edid.c', 'hw/display/edid-generate.c'),
- dependencies: qemuutil,
+ dependencies: [qemuutil, rt],
install: true)
if have_vhost_user
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 11b35e4..f8c3724 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -931,3 +931,61 @@ void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
qemu_close_all_open_fd_fallback(skip, nskip, open_max);
}
}
+
+bool qemu_shm_available(void)
+{
+ return true; /* qemu_shm_alloc() below is implemented with shm_open() */
+}
+
+/*
+ * Create a POSIX shared memory object of @size bytes and return a file
+ * descriptor for it.  The object is created under a unique name and
+ * unlinked right away, so afterwards it is reachable only through the fd.
+ *
+ * Returns the fd on success.  On failure, sets @errp and returns -1.
+ */
+int qemu_shm_alloc(size_t size, Error **errp)
+{
+    g_autoptr(GString) shm_name = g_string_new(NULL);
+    int fd, oflag, cur_sequence;
+    static int sequence;
+    mode_t mode;
+
+    /* Each call takes the next value, so names differ within this process. */
+    cur_sequence = qatomic_fetch_inc(&sequence);
+
+    /*
+     * Let's use `mode = 0` because we don't want other processes to open our
+     * memory unless we share the file descriptor with them.
+     */
+    mode = 0;
+    oflag = O_RDWR | O_CREAT | O_EXCL;
+
+    /*
+     * Some operating systems allow creating anonymous POSIX shared memory
+     * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not
+     * defined by POSIX, so let's create a unique name.
+     *
+     * From Linux's shm_open(3) man-page:
+     *   "For portable use, a shared memory object should be identified
+     *    by a name of the form /somename"
+     */
+    g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%d", getpid(),
+                    cur_sequence);
+
+    fd = shm_open(shm_name->str, oflag, mode);
+    if (fd < 0) {
+        error_setg_errno(errp, errno,
+                         "failed to create POSIX shared memory");
+        return -1;
+    }
+
+    /*
+     * We have the file descriptor, so we no longer need to expose the
+     * POSIX shared memory object. However it will remain allocated as long as
+     * there are file descriptors pointing to it.
+     */
+    shm_unlink(shm_name->str);
+
+    if (ftruncate(fd, size) == -1) {
+        /* @size is a size_t, so it must be printed with %zu, not PRIu64. */
+        error_setg_errno(errp, errno,
+                         "failed to resize POSIX shared memory to %zu",
+                         size);
+        close(fd);
+        return -1;
+    }
+
+    return fd;
+}
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index b623830..aef5779 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -877,3 +877,14 @@ void qemu_win32_map_free(void *ptr, HANDLE h, Error **errp)
}
CloseHandle(h);
}
+
+bool qemu_shm_available(void)
+{
+ return false; /* the Windows qemu_shm_alloc() below only reports an error */
+}
+
+/*
+ * Windows variant: no shared memory object is created.  Always fails,
+ * setting @errp and returning -1; @size is unused.
+ */
+int qemu_shm_alloc(size_t size, Error **errp)
+{
+    /* error_setg() takes the Error ** destination as its first argument. */
+    error_setg(errp, "Shared memory is not supported.");
+    return -1;
+}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd"
2024-12-02 13:19 ` [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd" Steve Sistare
@ 2024-12-09 17:36 ` Peter Xu
2024-12-12 20:37 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-09 17:36 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:19:53AM -0800, Steve Sistare wrote:
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index b623830..aef5779 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -877,3 +877,14 @@ void qemu_win32_map_free(void *ptr, HANDLE h, Error **errp)
> }
> CloseHandle(h);
> }
> +
> +bool qemu_shm_available(void)
> +{
> + return false;
> +}
> +
> +int qemu_shm_alloc(size_t size, Error **errp)
> +{
> + error_setg("Shared memory is not supported.");
May need a fixup here to make build pass.
> + return -1;
> +}
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd"
2024-12-09 17:36 ` Peter Xu
@ 2024-12-12 20:37 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-12 20:37 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/9/2024 12:36 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:19:53AM -0800, Steve Sistare wrote:
>> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
>> index b623830..aef5779 100644
>> --- a/util/oslib-win32.c
>> +++ b/util/oslib-win32.c
>> @@ -877,3 +877,14 @@ void qemu_win32_map_free(void *ptr, HANDLE h, Error **errp)
>> }
>> CloseHandle(h);
>> }
>> +
>> +bool qemu_shm_available(void)
>> +{
>> + return false;
>> +}
>> +
>> +int qemu_shm_alloc(size_t size, Error **errp)
>> +{
>> + error_setg("Shared memory is not supported.");
>
> May need a fixup here to make build pass.
Yup!! Will fix.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
2024-12-02 13:19 ` [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd" Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-09 19:42 ` Peter Xu
2024-12-02 13:19 ` [PATCH V4 03/19] memory: add RAM_PRIVATE Steve Sistare
` (16 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, this may fail if the shm mount size is too small,
even if the system has free memory, so for backwards compatibility fall
back to qemu_anon_ram_alloc/MAP_ANON on shm_open failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Exclude Xen. Xen ignores RAM_SHARED and does its own allocation.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
system/physmem.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++----
system/trace-events | 1 +
2 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/system/physmem.c b/system/physmem.c
index dc1db3a..b0c4b22 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -47,6 +47,7 @@
#include "qemu/qemu-print.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
+#include "qemu/memfd.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
@@ -2057,6 +2058,70 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
}
#endif
+/*
+ * Return the result of qemu_memfd_check(0).  The answer is remembered in a
+ * function-local static, so the check itself is made on the first call only.
+ */
+static bool qemu_memfd_available(void)
+{
+    static int cached = -1;     /* negative: not probed yet */
+
+    if (cached >= 0) {
+        return cached;
+    }
+
+    cached = qemu_memfd_check(0);
+    return cached;
+}
+
+/*
+ * We want anonymous shared memory, similar to MAP_SHARED|MAP_ANON, but
+ * some users want the fd. Allocate shm explicitly to get an fd.
+ *
+ * Returns true either when new_block->host and new_block->fd have been set
+ * up from a new fd, or when no fd was obtained and new_block->host is left
+ * untouched for the caller to allocate some other way. Returns false when
+ * qemu_memfd_create() or file_ram_alloc() fails; both are passed @errp.
+ */
+static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
+{
+ size_t max_length = new_block->max_length;
+ MemoryRegion *mr = new_block->mr;
+ const char *name = memory_region_name(mr);
+ int fd;
+
+ if (qemu_memfd_available()) {
+ /*
+ * NOTE(review): the memfd is sized max_length + mr->align, while the
+ * shm path below uses max_length alone and mr->align is only assigned
+ * further down -- confirm this asymmetry is intended.
+ */
+ fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
+ if (fd < 0) {
+ return false;
+ }
+ } else if (!qemu_shm_available()) {
+ /*
+ * Backwards compatibility for Windows. The user may specify a
+ * memory backend with share=on, and Windows ignores share.
+ * Fall back to qemu_anon_ram_alloc.
+ */
+ return true;
+ } else {
+ Error *local_err = NULL;
+
+ fd = qemu_shm_alloc(max_length, &local_err);
+ if (fd < 0) {
+ /*
+ * Backwards compatibility in case the shm mount size is too small.
+ * Previous QEMU versions called qemu_anon_ram_alloc for anonymous
+ * shared memory, which could succeed.
+ */
+ error_prepend(&local_err,
+ "Retrying using MAP_ANON|MAP_SHARED because: ");
+ warn_report_err(local_err);
+ return true;
+ }
+ }
+
+ /* From here on we hold a valid fd from memfd or shm; map it. */
+ new_block->mr->align = QEMU_VMALLOC_ALIGN;
+ new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
+
+ if (new_block->host) {
+ qemu_set_cloexec(fd);
+ new_block->fd = fd;
+ trace_qemu_ram_alloc_shared(name, max_length, fd, new_block->host);
+ return true;
+ }
+
+ /* Mapping failed: the fd is not recorded in new_block, so release it. */
+ close(fd);
+ return false;
+}
+
static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
void (*resized)(const char*,
@@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
new_block->page_size = qemu_real_host_page_size();
new_block->host = host;
new_block->flags = ram_flags;
+
+ if (!host && !xen_enabled()) {
+ if ((new_block->flags & RAM_SHARED) &&
+ !qemu_ram_alloc_shared(new_block, &local_err)) {
+ goto err;
+ }
+ }
+
ram_block_add(new_block, &local_err);
- if (local_err) {
- g_free(new_block);
- error_propagate(errp, local_err);
- return NULL;
+ if (!local_err) {
+ return new_block;
}
- return new_block;
+
+err:
+ g_free(new_block);
+ error_propagate(errp, local_err);
+ return NULL;
}
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
diff --git a/system/trace-events b/system/trace-events
index 5bbc3fb..831a60c 100644
--- a/system/trace-events
+++ b/system/trace-events
@@ -33,6 +33,7 @@ address_space_map(void *as, uint64_t addr, uint64_t len, bool is_write, uint32_t
find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64
find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
+qemu_ram_alloc_shared(const char *name, size_t max_length, int fd, void *host) "%s size %zu fd %d host %p"
# cpus.c
vm_stop_flush_all(int ret) "ret %d"
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-02 13:19 ` [PATCH V4 02/19] physmem: fd-based shared memory Steve Sistare
@ 2024-12-09 19:42 ` Peter Xu
2024-12-12 20:38 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-09 19:42 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> new_block->page_size = qemu_real_host_page_size();
> new_block->host = host;
> new_block->flags = ram_flags;
> +
> + if (!host && !xen_enabled()) {
Adding one more xen check should be unnecessary. The fact that this patch
needs it suggests that the patch can be refactored, because we already have
xen checks in both ram_block_add() and the fd allocation path.
In the meantime, see:
qemu_ram_alloc_from_fd():
if (kvm_enabled() && !kvm_has_sync_mmu()) {
error_setg(errp,
"host lacks kvm mmu notifiers, -mem-path unsupported");
return NULL;
}
I don't think any decent kernel could hit this, but that could be another
sign that this patch duplicated some file allocations.
> + if ((new_block->flags & RAM_SHARED) &&
> + !qemu_ram_alloc_shared(new_block, &local_err)) {
> + goto err;
> + }
> + }
> +
> ram_block_add(new_block, &local_err);
> - if (local_err) {
> - g_free(new_block);
> - error_propagate(errp, local_err);
> - return NULL;
> + if (!local_err) {
> + return new_block;
> }
> - return new_block;
> +
> +err:
> + g_free(new_block);
> + error_propagate(errp, local_err);
> + return NULL;
> }
IIUC we only need to conditionally convert an anon-allocation into an
fd-allocation, and then we don't need to mostly duplicate
qemu_ram_alloc_from_fd(), instead we reuse it.
I do have a few other comments elsewhere, which came up while I was trying
to comment. E.g., we either shouldn't need to bother caching
qemu_memfd_check() results, or should do it in qemu_memfd_check() directly,
and some more.
So I think it's easier if I provide a patch, which also shows that the same
thing can be done with smaller changes and with everything fixed up (e.g.
addressing the missing mmu notifier check mentioned above). What do you
think of the patch below?
===8<===
From a90119131a972b0b4f15770fe0b431770456e447 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 9 Dec 2024 13:38:06 -0500
Subject: [PATCH] physmem: Try to always allocate anon and shared memory with
fd
qemu_ram_alloc_internal() is the memory API QEMU uses to allocate anonymous
memory. It allows RAM_SHARED too on top of anonymous.
It is generally beneficial to allocate memory with an fd attached whenever
possible, because an fd is normally more flexible than the virtual mapping
alone. For example, CPR can pass fds between processes to share memory,
which is especially useful when the memory can be pinned.
Since there is no harm in doing so when it is possible, do it
unconditionally for all anonymous & shared memory allocations where the
memory still has to be allocated. Fall back to the old path when it fails,
e.g., when no fd-backed memory mechanism is available.
Two extra ERRP_GUARD()s are needed in the functions used here: we do not
care about the error even if one occurs, so it is easier to allow passing
NULL into them.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
system/physmem.c | 38 ++++++++++++++++++++++++++++++++++++++
util/memfd.c | 2 ++
util/oslib-posix.c | 2 ++
3 files changed, 42 insertions(+)
diff --git a/system/physmem.c b/system/physmem.c
index dc1db3a384..4e795aefa0 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -47,6 +47,7 @@
#include "qemu/qemu-print.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
+#include "qemu/memfd.h"
#include "exec/memory.h"
#include "exec/ioport.h"
#include "sysemu/dma.h"
@@ -2057,6 +2058,24 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
}
#endif
+/*
+ * Obtain a zero-sized anonymous fd to back a shared memory allocation.
+ * Returns the fd (>= 0) on success, or a negative value if none could be
+ * created.
+ *
+ * memfd is tried first, because it is not subject to the /dev/shm size
+ * limitation that applies to POSIX shm_open().
+ */
+static int qemu_ram_alloc_anonymous_fd(void)
+{
+    if (qemu_memfd_check(0)) {
+        return qemu_memfd_create("anon-memfd", 0, 0, 0, 0, NULL);
+    }
+
+    if (qemu_shm_available()) {
+        return qemu_shm_alloc(0, NULL);
+    }
+
+    /* Neither mechanism is usable on this host. */
+    return -1;
+}
+
static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
void (*resized)(const char*,
@@ -2073,6 +2092,25 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
assert(!host ^ (ram_flags & RAM_PREALLOC));
+ /*
+ * Try to use fd-based allocation for anonymous and shared memory,
+ * because fd is normally more flexible (e.g. on memory sharing between
+ * processes). We can still fallback to old ways if it fails.
+ */
+ if (!host && (ram_flags & RAM_SHARED)) {
+ int fd = qemu_ram_alloc_anonymous_fd();
+
+ if (fd >= 0) {
+ new_block = qemu_ram_alloc_from_fd(size, mr, ram_flags,
+ fd, 0, errp);
+ if (new_block) {
+ return new_block;
+ }
+ close(fd);
+ }
+ /* Either fd or ramblock allocation failed, fallback */
+ }
+
align = qemu_real_host_page_size();
align = MAX(align, TARGET_PAGE_SIZE);
size = ROUND_UP(size, align);
diff --git a/util/memfd.c b/util/memfd.c
index 8a2e906962..0dc15b2f44 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -52,6 +52,8 @@ int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
{
int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;
+ ERRP_GUARD();
+
if (htsize && 1ULL << htsize != hugetlbsize) {
error_setg(errp, "Hugepage size must be a power of 2");
return -1;
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index f8c3724e68..6ca3e994fc 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -944,6 +944,8 @@ int qemu_shm_alloc(size_t size, Error **errp)
static int sequence;
mode_t mode;
+ ERRP_GUARD();
+
cur_sequence = qatomic_fetch_inc(&sequence);
/*
--
2.47.0
--
Peter Xu
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-09 19:42 ` Peter Xu
@ 2024-12-12 20:38 ` Steven Sistare
2024-12-12 21:22 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-12 20:38 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/9/2024 2:42 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
>> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>> new_block->page_size = qemu_real_host_page_size();
>> new_block->host = host;
>> new_block->flags = ram_flags;
>> +
>> + if (!host && !xen_enabled()) {
>
> Adding one more xen check is unnecessary. This patch needed it could mean
> that the patch can be refactored.. because we have xen checks in both
> ram_block_add() and also in the fd allocation path.
>
> At the meantime, see:
>
> qemu_ram_alloc_from_fd():
> if (kvm_enabled() && !kvm_has_sync_mmu()) {
> error_setg(errp,
> "host lacks kvm mmu notifiers, -mem-path unsupported");
> return NULL;
> }
>
> I don't think any decent kernel could hit this, but that could be another
> sign that this patch duplicated some file allocations.
>
>> + if ((new_block->flags & RAM_SHARED) &&
>> + !qemu_ram_alloc_shared(new_block, &local_err)) {
>> + goto err;
>> + }
>> + }
>> +
>> ram_block_add(new_block, &local_err);
>> - if (local_err) {
>> - g_free(new_block);
>> - error_propagate(errp, local_err);
>> - return NULL;
>> + if (!local_err) {
>> + return new_block;
>> }
>> - return new_block;
>> +
>> +err:
>> + g_free(new_block);
>> + error_propagate(errp, local_err);
>> + return NULL;
>> }
>
> IIUC we only need to conditionally convert an anon-allocation into an
> fd-allocation, and then we don't need to mostly duplicate
> qemu_ram_alloc_from_fd(), instead we reuse it.
>
> I do have a few other comments elsewhere, but when I was trying to comment.
> E.g., we either shouldn't need to bother caching qemu_memfd_check()
> results, or do it in qemu_memfd_check() directly.. and some more.
Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
and qemu_memfd_check will be called more often. I'll cache the result inside
qemu_memfd_check for the special case of flags=0.
> Then I think it's easier I provide a patch, and also show that it can be
> also smaller changes to do the same thing, with everything fixed up
> (e.g. addressing above mmu notifier missing issue). What do you think as
> below?
The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
which buys the xen and kvm checks for free. Sounds good, I will do that in the
context of my patch.
Here are some other changes in your patch, and my responses:
I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
and not fall back, because something unexpected went wrong. David said the same.
Thus we still need to pass errp to qemu_memfd_create().
I will push the qemu_shm_alloc ERRP_GUARD back to patch
"factor out allocation of anonymous shared memory"
- Steve
>
> ===8<===
> From a90119131a972b0b4f15770fe0b431770456e447 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Mon, 9 Dec 2024 13:38:06 -0500
> Subject: [PATCH] physmem: Try to always allocate anon and shared memory with
> fd
>
> qemu_ram_alloc_internal() is the memory API QEMU uses to allocate anonymous
> memory. It allows RAM_SHARED too on top of anonymous.
>
> It might be always beneficial to allocate memory with fd attached whenever
> possible because fd is normally more flexible comparing to the virtual
> mapping alone. For example, CPR can use it to pass over fds between
> processes to share memory, especially useful when the memory can be pinned.
>
> Since there's no harm when it's possible, do it unconditionally for all
> such anonymous & shared memory allocations where the memory is to be
> allocated. Provide fallbacks when it can fail, e.g., when none of the
> memory attached fd is available.
>
> Two extra ERRP_GUARD()s are needed in the used functions, as we will not
> care about error even if it happened, so it's easier to allow passing NULL
> into them.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> system/physmem.c | 38 ++++++++++++++++++++++++++++++++++++++
> util/memfd.c | 2 ++
> util/oslib-posix.c | 2 ++
> 3 files changed, 42 insertions(+)
>
> diff --git a/system/physmem.c b/system/physmem.c
> index dc1db3a384..4e795aefa0 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -47,6 +47,7 @@
> #include "qemu/qemu-print.h"
> #include "qemu/log.h"
> #include "qemu/memalign.h"
> +#include "qemu/memfd.h"
> #include "exec/memory.h"
> #include "exec/ioport.h"
> #include "sysemu/dma.h"
> @@ -2057,6 +2058,24 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
> }
> #endif
>
> +/*
> + * Try to allocate a zero-sized anonymous fd for shared memory allocations.
> + * Returns >=0 if succeeded, <0 otherwise.
> + *
> + * Prioritize memfd, as it doesn't have the same /dev/shm size limitation
> + * v.s. POSIX shm_open().
> + */
> +static int qemu_ram_alloc_anonymous_fd(void)
> +{
> + if (qemu_memfd_check(0)) {
> + return qemu_memfd_create("anon-memfd", 0, 0, 0, 0, NULL);
> + } else if (qemu_shm_available()) {
> + return qemu_shm_alloc(0, NULL);
> + } else {
> + return -1;
> + }
> +}
> +
> static
> RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> void (*resized)(const char*,
> @@ -2073,6 +2092,25 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
> assert(!host ^ (ram_flags & RAM_PREALLOC));
>
> + /*
> + * Try to use fd-based allocation for anonymous and shared memory,
> + * because fd is normally more flexible (e.g. on memory sharing between
> + * processes). We can still fallback to old ways if it fails.
> + */
> + if (!host && (ram_flags & RAM_SHARED)) {
> + int fd = qemu_ram_alloc_anonymous_fd();
> +
> + if (fd >= 0) {
> + new_block = qemu_ram_alloc_from_fd(size, mr, ram_flags,
> + fd, 0, errp);
> + if (new_block) {
> + return new_block;
> + }
> + close(fd);
> + }
> + /* Either fd or ramblock allocation failed, fallback */
> + }
> +
> align = qemu_real_host_page_size();
> align = MAX(align, TARGET_PAGE_SIZE);
> size = ROUND_UP(size, align);
> diff --git a/util/memfd.c b/util/memfd.c
> index 8a2e906962..0dc15b2f44 100644
> --- a/util/memfd.c
> +++ b/util/memfd.c
> @@ -52,6 +52,8 @@ int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
> {
> int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;
>
> + ERRP_GUARD();
> +
> if (htsize && 1ULL << htsize != hugetlbsize) {
> error_setg(errp, "Hugepage size must be a power of 2");
> return -1;
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index f8c3724e68..6ca3e994fc 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -944,6 +944,8 @@ int qemu_shm_alloc(size_t size, Error **errp)
> static int sequence;
> mode_t mode;
>
> + ERRP_GUARD();
> +
> cur_sequence = qatomic_fetch_inc(&sequence);
>
> /*
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-12 20:38 ` Steven Sistare
@ 2024-12-12 21:22 ` Peter Xu
2024-12-13 16:41 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-12 21:22 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
> On 12/9/2024 2:42 PM, Peter Xu wrote:
> > On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
> > > @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> > > new_block->page_size = qemu_real_host_page_size();
> > > new_block->host = host;
> > > new_block->flags = ram_flags;
> > > +
> > > + if (!host && !xen_enabled()) {
> >
> > Adding one more xen check is unnecessary. This patch needed it could mean
> > that the patch can be refactored.. because we have xen checks in both
> > ram_block_add() and also in the fd allocation path.
> >
> > At the meantime, see:
> >
> > qemu_ram_alloc_from_fd():
> > if (kvm_enabled() && !kvm_has_sync_mmu()) {
> > error_setg(errp,
> > "host lacks kvm mmu notifiers, -mem-path unsupported");
> > return NULL;
> > }
> >
> > I don't think any decent kernel could hit this, but that could be another
> > sign that this patch duplicated some file allocations.
> >
> > > + if ((new_block->flags & RAM_SHARED) &&
> > > + !qemu_ram_alloc_shared(new_block, &local_err)) {
> > > + goto err;
> > > + }
> > > + }
> > > +
> > > ram_block_add(new_block, &local_err);
> > > - if (local_err) {
> > > - g_free(new_block);
> > > - error_propagate(errp, local_err);
> > > - return NULL;
> > > + if (!local_err) {
> > > + return new_block;
> > > }
> > > - return new_block;
> > > +
> > > +err:
> > > + g_free(new_block);
> > > + error_propagate(errp, local_err);
> > > + return NULL;
> > > }
> >
> > IIUC we only need to conditionally convert an anon-allocation into an
> > fd-allocation, and then we don't need to mostly duplicate
> > qemu_ram_alloc_from_fd(), instead we reuse it.
> >
> > I do have a few other comments elsewhere, but when I was trying to comment.
> > E.g., we either shouldn't need to bother caching qemu_memfd_check()
> > results, or do it in qemu_memfd_check() directly.. and some more.
>
> Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
> and qemu_memfd_check will be called more often. I'll cache the result inside
> qemu_memfd_check for the special case of flags=0.
OK.
>
> > Then I think it's easier I provide a patch, and also show that it can be
> > also smaller changes to do the same thing, with everything fixed up
> > (e.g. addressing above mmu notifier missing issue). What do you think as
> > below?
>
> The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
> which buys the xen and kvm checks for free. Sounds good, I will do that in the
> context of my patch.
>
> Here are some other changes in your patch, and my responses:
>
> I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
>
> However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
I guess no huge deal on these, however since we're talking.. Is that
QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
A quick dig tells me that it used to be for anon THPs:
commit 36b586284e678da28df3af9fd0907d2b16f9311c
Author: Avi Kivity <avi@redhat.com>
Date: Mon Sep 5 11:07:05 2011 +0300
qemu_vmalloc: align properly for transparent hugepages and KVM
And I'm guessing that at that time it was also mainly for guest ram.
Considering that this path won't make an effect until the new aux mem
option is on, I'd think it better to stick without anything special like
QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
to explicitly mention this in that commit message:
Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
to avoid fragmentation.
And this is exactly mostly small regions when it's AUX.. probably except
VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
it'll be totally different things.
So I won't worry on that 2M alignment, and I will try to not carry over
that, because then trying to remove it will be harder.. even when we want.
For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
For 3rd, tracepoint would definitely be fine whenever you feel necessary.
>
> Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
> and not fall back, because something unexpected went wrong. David said the same.
Why? I was trying to rely on such fallback to make it work on e.g. Xen.
In that case, Xen fails there and fallback to xen_ram_alloc() inside the
later call to ram_block_add(), no?
> Thus we still need to pass errp to qemu_memfd_create().
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-12 21:22 ` Peter Xu
@ 2024-12-13 16:41 ` Steven Sistare
2024-12-13 17:05 ` Steven Sistare
2024-12-16 18:19 ` Peter Xu
0 siblings, 2 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-13 16:41 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/12/2024 4:22 PM, Peter Xu wrote:
> On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
>> On 12/9/2024 2:42 PM, Peter Xu wrote:
>>> On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
>>>> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>>>> new_block->page_size = qemu_real_host_page_size();
>>>> new_block->host = host;
>>>> new_block->flags = ram_flags;
>>>> +
>>>> + if (!host && !xen_enabled()) {
>>>
>>> Adding one more xen check is unnecessary. This patch needed it could mean
>>> that the patch can be refactored.. because we have xen checks in both
>>> ram_block_add() and also in the fd allocation path.
>>>
>>> At the meantime, see:
>>>
>>> qemu_ram_alloc_from_fd():
>>> if (kvm_enabled() && !kvm_has_sync_mmu()) {
>>> error_setg(errp,
>>> "host lacks kvm mmu notifiers, -mem-path unsupported");
>>> return NULL;
>>> }
>>>
>>> I don't think any decent kernel could hit this, but that could be another
>>> sign that this patch duplicated some file allocations.
>>>
>>>> + if ((new_block->flags & RAM_SHARED) &&
>>>> + !qemu_ram_alloc_shared(new_block, &local_err)) {
>>>> + goto err;
>>>> + }
>>>> + }
>>>> +
>>>> ram_block_add(new_block, &local_err);
>>>> - if (local_err) {
>>>> - g_free(new_block);
>>>> - error_propagate(errp, local_err);
>>>> - return NULL;
>>>> + if (!local_err) {
>>>> + return new_block;
>>>> }
>>>> - return new_block;
>>>> +
>>>> +err:
>>>> + g_free(new_block);
>>>> + error_propagate(errp, local_err);
>>>> + return NULL;
>>>> }
>>>
>>> IIUC we only need to conditionally convert an anon-allocation into an
>>> fd-allocation, and then we don't need to mostly duplicate
>>> qemu_ram_alloc_from_fd(), instead we reuse it.
>>>
>>> I do have a few other comments elsewhere, but when I was trying to comment.
>>> E.g., we either shouldn't need to bother caching qemu_memfd_check()
>>> results, or do it in qemu_memfd_check() directly.. and some more.
>>
>> Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
>> and qemu_memfd_check will be called more often. I'll cache the result inside
>> qemu_memfd_check for the special case of flags=0.
>
> OK.
>
>>
>>> Then I think it's easier I provide a patch, and also show that it can be
>>> also smaller changes to do the same thing, with everything fixed up
>>> (e.g. addressing above mmu notifier missing issue). What do you think as
>>> below?
>>
>> The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
>> which buys the xen and kvm checks for free. Sounds good, I will do that in the
>> context of my patch.
>>
>> Here are some other changes in your patch, and my responses:
>>
>> I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
>>
>> However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
>
> I guess no huge deal on these, however since we're talking.. Is that
> QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
>
> A quick dig tells me that it was used to be for anon THPs..
>
> commit 36b586284e678da28df3af9fd0907d2b16f9311c
> Author: Avi Kivity <avi@redhat.com>
> Date: Mon Sep 5 11:07:05 2011 +0300
>
> qemu_vmalloc: align properly for transparent hugepages and KVM
>
> And I'm guessing if at that time was also majorly for guest ram.
>
> Considering that this path won't make an effect until the new aux mem
> option is on, I'd think it better to stick without anything special like
> QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
> to explicitly mention this in that commit message:
>
> Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
> to avoid fragmentation.
>
> And this is exactly mostly small regions when it's AUX.. probably except
> VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
> it'll be totally different things.
>
> So I won't worry on that 2M alignment, and I will try to not carry over
> that, because then trying to remove it will be harder.. even when we want.
Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
no performance regression, as some of them are larger than 2M and would
benefit from using huge pages. The VA fragmentation is trivial for this small
number of aux blocks in a 64-bit address space, and is no different than it was
for qemu_anon_ram_alloc.
> For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
qemu sets cloexec for all descriptors it opens to prevent them from accidentally
being leaked to another process via fork+exec.
> For 3rd, tracepoint would definitely be fine whenever you feel necessary.
>
>> Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
>> and not fall back, because something unexpected went wrong. David said the same.
>
> Why? I was trying to rely on such fallback to make it work on e.g. Xen.
> In that case, Xen fails there and fallback to xen_ram_alloc() inside the
> later call to ram_block_add(), no?
Why -- because something went wrong that should have worked, and we should report the
first fault so its cause can be fixed and cpr can be used.
However, to do the above, but still quietly fall back if qemu_ram_alloc_from_fd
fails because of xen or kvm, I would need to return different error codes from
qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
qemu_ram_alloc_from_fd.
And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
to modify the call site in the patch accordingly.
Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-13 16:41 ` Steven Sistare
@ 2024-12-13 17:05 ` Steven Sistare
2024-12-16 18:19 ` Peter Xu
1 sibling, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-13 17:05 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/13/2024 11:41 AM, Steven Sistare wrote:
> On 12/12/2024 4:22 PM, Peter Xu wrote:
>> On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
>>> On 12/9/2024 2:42 PM, Peter Xu wrote:
>>>> On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
>>>>> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>>>>> new_block->page_size = qemu_real_host_page_size();
>>>>> new_block->host = host;
>>>>> new_block->flags = ram_flags;
>>>>> +
>>>>> + if (!host && !xen_enabled()) {
>>>>
>>>> Adding one more xen check is unnecessary. This patch needed it could mean
>>>> that the patch can be refactored.. because we have xen checks in both
>>>> ram_block_add() and also in the fd allocation path.
>>>>
>>>> At the meantime, see:
>>>>
>>>> qemu_ram_alloc_from_fd():
>>>> if (kvm_enabled() && !kvm_has_sync_mmu()) {
>>>> error_setg(errp,
>>>> "host lacks kvm mmu notifiers, -mem-path unsupported");
>>>> return NULL;
>>>> }
>>>>
>>>> I don't think any decent kernel could hit this, but that could be another
>>>> sign that this patch duplicated some file allocations.
>>>>
>>>>> + if ((new_block->flags & RAM_SHARED) &&
>>>>> + !qemu_ram_alloc_shared(new_block, &local_err)) {
>>>>> + goto err;
>>>>> + }
>>>>> + }
>>>>> +
>>>>> ram_block_add(new_block, &local_err);
>>>>> - if (local_err) {
>>>>> - g_free(new_block);
>>>>> - error_propagate(errp, local_err);
>>>>> - return NULL;
>>>>> + if (!local_err) {
>>>>> + return new_block;
>>>>> }
>>>>> - return new_block;
>>>>> +
>>>>> +err:
>>>>> + g_free(new_block);
>>>>> + error_propagate(errp, local_err);
>>>>> + return NULL;
>>>>> }
>>>>
>>>> IIUC we only need to conditionally convert an anon-allocation into an
>>>> fd-allocation, and then we don't need to mostly duplicate
>>>> qemu_ram_alloc_from_fd(), instead we reuse it.
>>>>
>>>> I do have a few other comments elsewhere, but when I was trying to comment.
>>>> E.g., we either shouldn't need to bother caching qemu_memfd_check()
>>>> results, or do it in qemu_memfd_check() directly.. and some more.
>>>
>>> Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
>>> and qemu_memfd_check will be called more often. I'll cache the result inside
>>> qemu_memfd_check for the special case of flags=0.
>>
>> OK.
>>
>>>
>>>> Then I think it's easier I provide a patch, and also show that it can be
>>>> also smaller changes to do the same thing, with everything fixed up
>>>> (e.g. addressing above mmu notifier missing issue). What do you think as
>>>> below?
>>>
>>> The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
>>> which buys the xen and kvm checks for free. Sounds good, I will do that in the
>>> context of my patch.
>>>
>>> Here are some other changes in your patch, and my responses:
>>>
>>> I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
>>>
>>> However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
>>
>> I guess no huge deal on these, however since we're talking.. Is that
>> QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
>>
>> A quick dig tells me that it was used to be for anon THPs..
>>
>> commit 36b586284e678da28df3af9fd0907d2b16f9311c
>> Author: Avi Kivity <avi@redhat.com>
>> Date: Mon Sep 5 11:07:05 2011 +0300
>>
>> qemu_vmalloc: align properly for transparent hugepages and KVM
>>
>> And I'm guessing if at that time was also majorly for guest ram.
>>
>> Considering that this path won't make an effect until the new aux mem
>> option is on, I'd think it better to stick without anything special like
>> QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
>> to explicitly mention this in that commit message:
>>
>> Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
>> to avoid fragmentation.
>>
>> And this is exactly mostly small regions when it's AUX.. probably except
>> VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
>> it'll be totally different things.
>>
>> So I won't worry on that 2M alignment, and I will try to not carry over
>> that, because then trying to remove it will be harder.. even when we want.
>
> Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
> qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
> no performance regression, as some of them are larger than 2M and would
> benefit from using huge pages. The VA fragmentation is trivial for this small
> number of aux blocks in a 64-bit address space, and is no different than it was
> for qemu_anon_ram_alloc.
>
>> For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
>
> qemu sets cloexec for all descriptors it opens to prevent them from accidentally
> being leaked to another process via fork+exec.
>
>> For 3rd, tracepoint would definitely be fine whenever you feel necessary.
>>
>>> Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
>>> and not fall back, because something unexpected went wrong. David said the same.
>>
>> Why? I was trying to rely on such fallback to make it work on e.g. Xen.
>> In that case, Xen fails there and fallback to xen_ram_alloc() inside the
>> later call to ram_block_add(), no?
>
> Why -- because something went wrong that should have worked, and we should report the
> first fault so its cause can be fixed and cpr can be used.
>
> However, to do the above, but still quietly fallback if qemu_ram_alloc_from_fd
> fails because of xen or kvm, I would need to return different error codes from
> qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
> qemu_ram_alloc_from_fd.
>
> And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
> to modify the call site in the patch accordingly.
And qemu_ram_alloc_from_fd does not accept RAM_RESIZEABLE. More tweaking required.
- Steve
> Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
> is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
> kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
>
> - Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-13 16:41 ` Steven Sistare
2024-12-13 17:05 ` Steven Sistare
@ 2024-12-16 18:19 ` Peter Xu
2024-12-17 21:54 ` Steven Sistare
1 sibling, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-16 18:19 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Fri, Dec 13, 2024 at 11:41:45AM -0500, Steven Sistare wrote:
> On 12/12/2024 4:22 PM, Peter Xu wrote:
> > On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
> > > On 12/9/2024 2:42 PM, Peter Xu wrote:
> > > > On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
> > > > > @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> > > > > new_block->page_size = qemu_real_host_page_size();
> > > > > new_block->host = host;
> > > > > new_block->flags = ram_flags;
> > > > > +
> > > > > + if (!host && !xen_enabled()) {
> > > >
> > > > Adding one more xen check is unnecessary. This patch needed it could mean
> > > > that the patch can be refactored.. because we have xen checks in both
> > > > ram_block_add() and also in the fd allocation path.
> > > >
> > > > At the meantime, see:
> > > >
> > > > qemu_ram_alloc_from_fd():
> > > > if (kvm_enabled() && !kvm_has_sync_mmu()) {
> > > > error_setg(errp,
> > > > "host lacks kvm mmu notifiers, -mem-path unsupported");
> > > > return NULL;
> > > > }
> > > >
> > > > I don't think any decent kernel could hit this, but that could be another
> > > > sign that this patch duplicated some file allocations.
> > > >
> > > > > + if ((new_block->flags & RAM_SHARED) &&
> > > > > + !qemu_ram_alloc_shared(new_block, &local_err)) {
> > > > > + goto err;
> > > > > + }
> > > > > + }
> > > > > +
> > > > > ram_block_add(new_block, &local_err);
> > > > > - if (local_err) {
> > > > > - g_free(new_block);
> > > > > - error_propagate(errp, local_err);
> > > > > - return NULL;
> > > > > + if (!local_err) {
> > > > > + return new_block;
> > > > > }
> > > > > - return new_block;
> > > > > +
> > > > > +err:
> > > > > + g_free(new_block);
> > > > > + error_propagate(errp, local_err);
> > > > > + return NULL;
> > > > > }
> > > >
> > > > IIUC we only need to conditionally convert an anon-allocation into an
> > > > fd-allocation, and then we don't need to mostly duplicate
> > > > qemu_ram_alloc_from_fd(), instead we reuse it.
> > > >
> > > > I do have a few other comments elsewhere, but when I was trying to comment.
> > > > E.g., we either shouldn't need to bother caching qemu_memfd_check()
> > > > results, or do it in qemu_memfd_check() directly.. and some more.
> > >
> > > Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
> > > and qemu_memfd_check will be called more often. I'll cache the result inside
> > > qemu_memfd_check for the special case of flags=0.
> >
> > OK.
> >
> > >
> > > > Then I think it's easier I provide a patch, and also show that it can be
> > > > also smaller changes to do the same thing, with everything fixed up
> > > > (e.g. addressing above mmu notifier missing issue). What do you think as
> > > > below?
> > >
> > > The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
> > > which buys the xen and kvm checks for free. Sounds good, I will do that in the
> > > context of my patch.
> > >
> > > Here are some other changes in your patch, and my responses:
> > >
> > > I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
> > >
> > > However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
> >
> > I guess no huge deal on these, however since we're talking.. Is that
> > QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
> >
> > A quick dig tells me that it was used to be for anon THPs..
> >
> > commit 36b586284e678da28df3af9fd0907d2b16f9311c
> > Author: Avi Kivity <avi@redhat.com>
> > Date: Mon Sep 5 11:07:05 2011 +0300
> >
> > qemu_vmalloc: align properly for transparent hugepages and KVM
> >
> > And I'm guessing if at that time was also majorly for guest ram.
> >
> > Considering that this path won't make an effect until the new aux mem
> > option is on, I'd think it better to stick without anything special like
> > QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
> > to explicitly mention this in that commit message:
> >
> > Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
> > to avoid fragmentation.
> >
> > And this is exactly mostly small regions when it's AUX.. probably except
> > VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
> > it'll be totally different things.
> >
> > So I won't worry on that 2M alignment, and I will try to not carry over
> > that, because then trying to remove it will be harder.. even when we want.
>
> Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
> qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
> no performance regression,
I don't know how we could guarantee that at all - anon and shmem use
different knobs to enable/disable THPs after all.. For example:
$ ls /sys/kernel/mm/transparent_hugepage/*enabled
/sys/kernel/mm/transparent_hugepage/enabled
/sys/kernel/mm/transparent_hugepage/shmem_enabled
And their default values normally differ too... it means that after switching
to fd-based memory we do face the possibility that THP is gone, at least on the
1st 2MB.
When I was suggesting it, I was hoping THP doesn't really matter that much
on aux mem, even for VGA.
Btw, I don't even think the alignment will affect THP allocations for the
whole vma, anyway? I mean, it's only about the initial 2MB portion.. IOW,
when not aligned, I think the worst case is we have <2MB at start address
that is not using THP, but later on when it starts to align with 2MB, THPs
will be allocated again.
The challenge is more on the "fd-based" side, where shmem on most distros
will disable THP completely.
> as some of them are larger than 2M and would
> benefit from using huge pages. The VA fragmentation is trivial for this small
> number of aux blocks in a 64-bit address space, and is no different than it was
> for qemu_anon_ram_alloc.
>
> > For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
>
> qemu sets cloexec for all descriptors it opens to prevent them from accidentally
> being leaked to another process via fork+exec.
But my question is why this is special? For example, we don't do that for
"-object memory-backend-memfd", am I right?
>
> > For 3rd, tracepoint would definitely be fine whenever you feel necessary.
> >
> > > Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
> > > and not fall back, because something unexpected went wrong. David said the same.
> >
> > Why? I was trying to rely on such fallback to make it work on e.g. Xen.
> > In that case, Xen fails there and fallback to xen_ram_alloc() inside the
> > later call to ram_block_add(), no?
>
> Why -- because something went wrong that should have worked, and we should report the
> first fault so its cause can be fixed and cpr can be used.
Ahh, so it's only about the corner cases where CPR could raise an error?
Can we rely on the later failure of the "migrate" command to tell which
ramblock doesn't support it, so the user could be aware as well?
>
> However, to do the above, but still quietly fallback if qemu_ram_alloc_from_fd
> fails because of xen or kvm, I would need to return different error codes from
> qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
> qemu_ram_alloc_from_fd.
>
> And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
> to modify the call site in the patch accordingly.
Yep, I was thinking maybe qemu_ram_alloc_from_fd() had a stub function, but
indeed it looks like it doesn't.. The "allocating the fd" part definitely has
one, which I remember checking..
>
> Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
> is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
> kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
As long as you don't need to duplicate these two checks (or duplicate any
such check..) I'm ok.
Reusing qemu_ram_alloc_from_fd() still sounds like the easiest way to go. Yes,
we'll need to teach it about resize(), used_length etc., but they all
look sane to me. We didn't have those simply because we had no use for
them; now that we want resizable fd-based mem, supporting that on fd
allocations is the right thing to do.
OTOH, duplicating xen/mmu checks isn't sane to me.. :( It will make the
code harder to maintain because the 3rd qemu_ram_alloc_from_fd() in the
future will need to duplicate it once more (or worse, forget it again until
xen / old kernels report a failure)..
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-16 18:19 ` Peter Xu
@ 2024-12-17 21:54 ` Steven Sistare
2024-12-17 22:46 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-17 21:54 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/16/2024 1:19 PM, Peter Xu wrote:
> On Fri, Dec 13, 2024 at 11:41:45AM -0500, Steven Sistare wrote:
>> On 12/12/2024 4:22 PM, Peter Xu wrote:
>>> On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
>>>> On 12/9/2024 2:42 PM, Peter Xu wrote:
>>>>> On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
>>>>>> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>>>>>> new_block->page_size = qemu_real_host_page_size();
>>>>>> new_block->host = host;
>>>>>> new_block->flags = ram_flags;
>>>>>> +
>>>>>> + if (!host && !xen_enabled()) {
>>>>>
>>>>> Adding one more xen check is unnecessary. This patch needed it could mean
>>>>> that the patch can be refactored.. because we have xen checks in both
>>>>> ram_block_add() and also in the fd allocation path.
>>>>>
>>>>> At the meantime, see:
>>>>>
>>>>> qemu_ram_alloc_from_fd():
>>>>> if (kvm_enabled() && !kvm_has_sync_mmu()) {
>>>>> error_setg(errp,
>>>>> "host lacks kvm mmu notifiers, -mem-path unsupported");
>>>>> return NULL;
>>>>> }
>>>>>
>>>>> I don't think any decent kernel could hit this, but that could be another
>>>>> sign that this patch duplicated some file allocations.
>>>>>
>>>>>> + if ((new_block->flags & RAM_SHARED) &&
>>>>>> + !qemu_ram_alloc_shared(new_block, &local_err)) {
>>>>>> + goto err;
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> ram_block_add(new_block, &local_err);
>>>>>> - if (local_err) {
>>>>>> - g_free(new_block);
>>>>>> - error_propagate(errp, local_err);
>>>>>> - return NULL;
>>>>>> + if (!local_err) {
>>>>>> + return new_block;
>>>>>> }
>>>>>> - return new_block;
>>>>>> +
>>>>>> +err:
>>>>>> + g_free(new_block);
>>>>>> + error_propagate(errp, local_err);
>>>>>> + return NULL;
>>>>>> }
>>>>>
>>>>> IIUC we only need to conditionally convert an anon-allocation into an
>>>>> fd-allocation, and then we don't need to mostly duplicate
>>>>> qemu_ram_alloc_from_fd(), instead we reuse it.
>>>>>
>>>>> I do have a few other comments elsewhere, but when I was trying to comment.
>>>>> E.g., we either shouldn't need to bother caching qemu_memfd_check()
>>>>> results, or do it in qemu_memfd_check() directly.. and some more.
>>>>
>>>> Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
>>>> and qemu_memfd_check will be called more often. I'll cache the result inside
>>>> qemu_memfd_check for the special case of flags=0.
>>>
>>> OK.
>>>
>>>>
>>>>> Then I think it's easier I provide a patch, and also show that it can be
>>>>> also smaller changes to do the same thing, with everything fixed up
>>>>> (e.g. addressing above mmu notifier missing issue). What do you think as
>>>>> below?
>>>>
>>>> The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
>>>> which buys the xen and kvm checks for free. Sounds good, I will do that in the
>>>> context of my patch.
>>>>
>>>> Here are some other changes in your patch, and my responses:
>>>>
>>>> I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
>>>>
>>>> However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
>>>
>>> I guess no huge deal on these, however since we're talking.. Is that
>>> QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
>>>
>>> A quick dig tells me that it was used to be for anon THPs..
>>>
>>> commit 36b586284e678da28df3af9fd0907d2b16f9311c
>>> Author: Avi Kivity <avi@redhat.com>
>>> Date: Mon Sep 5 11:07:05 2011 +0300
>>>
>>> qemu_vmalloc: align properly for transparent hugepages and KVM
>>>
>>> And I'm guessing if at that time was also majorly for guest ram.
>>>
>>> Considering that this path won't make an effect until the new aux mem
>>> option is on, I'd think it better to stick without anything special like
>>> QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
>>> to explicitly mention this in that commit message:
>>>
>>> Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
>>> to avoid fragmentation.
>>>
>>> And this is exactly mostly small regions when it's AUX.. probably except
>>> VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
>>> it'll be totally different things.
>>>
>>> So I won't worry on that 2M alignment, and I will try to not carry over
>>> that, because then trying to remove it will be harder.. even when we want.
>>
>> Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
>> qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
>> no performance regression,
>
> I don't know how we could guarantee that at all - anon and shmem uses
> different knobs to enable/disable THPs after all.. For example:
>
> $ ls /sys/kernel/mm/transparent_hugepage/*enabled
> /sys/kernel/mm/transparent_hugepage/enabled
> /sys/kernel/mm/transparent_hugepage/shmem_enabled
Yes, but at least shmem_enabled is something the end user can fix. If
we bake a poor alignment into qemu, the user has no recourse. By setting
it to QEMU_VMALLOC_ALIGN, I eliminate alignment as a potential performance
issue. There is no practical downside. We should just do it, especially if
you believe "no huge deal on these" as written above :)
> And their default values normally differ too... it means after switching to
> fd based we do face the possibility that thp can be gone at least on the
> 1st 2mb.
>
> When I was suggesting it, I was hoping thp doesn't really matter that lot
> on aux mem, even for VGA.
>
> Btw, I don't even think the alignment will affect THP allocations for the
> whole vma, anyway? I mean, it's only about the initial 2MB portion.. IOW,
> when not aligned, I think the worst case is we have <2MB at start address
> that is not using THP, but later on when it starts to align with 2MB, THPs
> will be allocated again.
It depends on the kernel version/implementation. In 6.13, it is not that
clever for memfd_create + mmap. An unaligned start means no huge pages anywhere
in the allocation, as shown by the page-types utility. Add QEMU_VMALLOC_ALIGN,
and I get huge pages.
> The challenge is more on the "fd-based" side, where shmem on most distros
> will disable THP completely.
>
>> as some of them are larger than 2M and would
>> benefit from using huge pages. The VA fragmentation is trivial for this small
>> number of aux blocks in a 64-bit address space, and is no different than it was
>> for qemu_anon_ram_alloc.
>>
>>> For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
>>
>> qemu sets cloexec for all descriptors it opens to prevent them from accidentally
>> being leaked to another process via fork+exec.
>
> But my question is why this is special? For example, we don't do that for
> "-object memory-backend-memfd", am I right?
We should, the backends also need to set cloexec when they use a cpr fd.
I'll delete the call here and push it into cpr_find_fd.
>>> For 3rd, tracepoint would definitely be fine whenever you feel necessary.
>>>
>>>> Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
>>>> and not fall back, because something unexpected went wrong. David said the same.
>>>
>>> Why? I was trying to rely on such fallback to make it work on e.g. Xen.
>>> In that case, Xen fails there and fallback to xen_ram_alloc() inside the
>>> later call to ram_block_add(), no?
>>
>> Why -- because something went wrong that should have worked, and we should report the
>> first fault so its cause can be fixed and cpr can be used.
>
> Ahh so it's only about the corner cases where CPR could raise an error?
> Can we rely on the failure later on "migrate" command to tell which
> ramblock doesn't support it, so the user could be aware as well?
The ramblock migration blocker will indeed tell us which block is a problem.
But, we are throwing away potentially useful information by dropping the
first error message on the floor. We should only fall back for expected
failures. Unexpected failures mean there is something to fix.
I can compromise and fail on errors from these:
qemu_memfd_create(name, 0, 0, 0, 0, errp);
qemu_shm_alloc(0, errp);
but ignore errors from the subsequent call to qemu_ram_alloc_from_fd,
and fall back. That keeps the code simple.
>> However, to do the above, but still quietly fallback if qemu_ram_alloc_from_fd
>> fails because of xen or kvm, I would need to return different error codes from
>> qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
>> qemu_ram_alloc_from_fd.
>>
>> And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
>> to modify the call site in the patch accordingly.
>
> Yep, I was thinking maybe qemu_ram_alloc_from_fd() had a stub function,
> indeed looks not.. "allocating the fd" part definitely has, which I
> remember I checked..
>
>> Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
>> is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
>> kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
>
> As long as you don't need to duplicate these two checks (or duplicate any
> such check..) I'm ok.
>
> Reusing qemu_ram_alloc_from_fd() still sounds like the easiest to go. Yes
> we'll need to teach it about resize(), used_length etc. to it, but they all
> look sane to me. We didn't have those simply because we don't have use of
> them, now we want to have resizable fd-based mem, that's the right thing to
> do to support that on fd allocations.
>
> OTOH, duplicating xen/mmu checks isn't sane to me.. :( It will make the
> code harder to maintain because the 3rd qemu_ram_alloc_from_fd() in the
> future will need to duplicate it once more (or worse, forget it again until
> xen / old kernels reports a failure)..
I'll make the necessary changes to use qemu_ram_alloc_from_fd.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-17 21:54 ` Steven Sistare
@ 2024-12-17 22:46 ` Peter Xu
2024-12-18 16:34 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-17 22:46 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Tue, Dec 17, 2024 at 04:54:43PM -0500, Steven Sistare wrote:
> On 12/16/2024 1:19 PM, Peter Xu wrote:
> > On Fri, Dec 13, 2024 at 11:41:45AM -0500, Steven Sistare wrote:
> > > On 12/12/2024 4:22 PM, Peter Xu wrote:
> > > > On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
> > > > > On 12/9/2024 2:42 PM, Peter Xu wrote:
> > > > > > On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
> > > > > > > @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> > > > > > > new_block->page_size = qemu_real_host_page_size();
> > > > > > > new_block->host = host;
> > > > > > > new_block->flags = ram_flags;
> > > > > > > +
> > > > > > > + if (!host && !xen_enabled()) {
> > > > > >
> > > > > > Adding one more xen check is unnecessary. This patch needed it could mean
> > > > > > that the patch can be refactored.. because we have xen checks in both
> > > > > > ram_block_add() and also in the fd allocation path.
> > > > > >
> > > > > > At the meantime, see:
> > > > > >
> > > > > > qemu_ram_alloc_from_fd():
> > > > > > if (kvm_enabled() && !kvm_has_sync_mmu()) {
> > > > > > error_setg(errp,
> > > > > > "host lacks kvm mmu notifiers, -mem-path unsupported");
> > > > > > return NULL;
> > > > > > }
> > > > > >
> > > > > > I don't think any decent kernel could hit this, but that could be another
> > > > > > sign that this patch duplicated some file allocations.
> > > > > >
> > > > > > > + if ((new_block->flags & RAM_SHARED) &&
> > > > > > > + !qemu_ram_alloc_shared(new_block, &local_err)) {
> > > > > > > + goto err;
> > > > > > > + }
> > > > > > > + }
> > > > > > > +
> > > > > > > ram_block_add(new_block, &local_err);
> > > > > > > - if (local_err) {
> > > > > > > - g_free(new_block);
> > > > > > > - error_propagate(errp, local_err);
> > > > > > > - return NULL;
> > > > > > > + if (!local_err) {
> > > > > > > + return new_block;
> > > > > > > }
> > > > > > > - return new_block;
> > > > > > > +
> > > > > > > +err:
> > > > > > > + g_free(new_block);
> > > > > > > + error_propagate(errp, local_err);
> > > > > > > + return NULL;
> > > > > > > }
> > > > > >
> > > > > > IIUC we only need to conditionally convert an anon-allocation into an
> > > > > > fd-allocation, and then we don't need to mostly duplicate
> > > > > > qemu_ram_alloc_from_fd(), instead we reuse it.
> > > > > >
> > > > > > I do have a few other comments elsewhere, but when I was trying to comment.
> > > > > > E.g., we either shouldn't need to bother caching qemu_memfd_check()
> > > > > > results, or do it in qemu_memfd_check() directly.. and some more.
> > > > >
> > > > > Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
> > > > > and qemu_memfd_check will be called more often. I'll cache the result inside
> > > > > qemu_memfd_check for the special case of flags=0.
> > > >
> > > > OK.
> > > >
> > > > >
> > > > > > Then I think it's easier I provide a patch, and also show that it can be
> > > > > > also smaller changes to do the same thing, with everything fixed up
> > > > > > (e.g. addressing above mmu notifier missing issue). What do you think as
> > > > > > below?
> > > > >
> > > > > The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
> > > > > which buys the xen and kvm checks for free. Sounds good, I will do that in the
> > > > > context of my patch.
> > > > >
> > > > > Here are some other changes in your patch, and my responses:
> > > > >
> > > > > I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
> > > > >
> > > > > However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
> > > >
> > > > I guess no huge deal on these, however since we're talking.. Is that
> > > > QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
> > > >
> > > > A quick dig tells me that it was used to be for anon THPs..
> > > >
> > > > commit 36b586284e678da28df3af9fd0907d2b16f9311c
> > > > Author: Avi Kivity <avi@redhat.com>
> > > > Date: Mon Sep 5 11:07:05 2011 +0300
> > > >
> > > > qemu_vmalloc: align properly for transparent hugepages and KVM
> > > >
> > > > And I'm guessing if at that time was also majorly for guest ram.
> > > >
> > > > Considering that this path won't make an effect until the new aux mem
> > > > option is on, I'd think it better to stick without anything special like
> > > > QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
> > > > to explicitly mention this in that commit message:
> > > >
> > > > Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
> > > > to avoid fragmentation.
> > > >
> > > > And this is exactly mostly small regions when it's AUX.. probably except
> > > > VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
> > > > it'll be totally different things.
> > > >
> > > > So I won't worry on that 2M alignment, and I will try to not carry over
> > > > that, because then trying to remove it will be harder.. even when we want.
> > >
> > > Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
> > > qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
> > > no performance regression,
> >
> > I don't know how we could guarantee that at all - anon and shmem uses
> > different knobs to enable/disable THPs after all.. For example:
> >
> > $ ls /sys/kernel/mm/transparent_hugepage/*enabled
> > /sys/kernel/mm/transparent_hugepage/enabled
> > /sys/kernel/mm/transparent_hugepage/shmem_enabled
>
> Yes, but at least shmem_enabled is something the end user can fix. If
> we bake a poor alignment into qemu, the user has no recourse. By setting
> it to QEMU_VMALLOC_ALIGN, I eliminate alignment as a potential performance
> issue. There is no practical downside. We should just do it, especially if
> you believe "no huge deal on these" as written above :)
I'd wager nobody will be able to notice the anon/shmem difference at all,
so if it really regressed nobody will be able to fix it. :)
Not to mention it's a global knob, and IMHO it doesn't make a lot of sense
to change it for an aux mem not aligned.. while changing a global knob
could OTOH break other things.
But sure, if you do prefer having that I'm ok. Please still consider adding
a comment then explaining where it came from..
>
> > And their default values normally differ too... it means after switching to
> > fd based we do face the possibility that thp can be gone at least on the
> > 1st 2mb.
> >
> > When I was suggesting it, I was hoping thp doesn't really matter that lot
> > on aux mem, even for VGA.
> >
> > Btw, I don't even think the alignment will affect THP allocations for the
> > whole vma, anyway? I mean, it's only about the initial 2MB portion.. IOW,
> > when not aligned, I think the worst case is we have <2MB at start address
> > that is not using THP, but later on when it starts to align with 2MB, THPs
> > will be allocated again.
>
> It depends on the kernel version/implementation. In 6.13, it is not that
> clever for memfd_create + mmap. An unaligned start means no huge pages anywhere
> in the allocation, as shown by the page-types utility. Add QEMU_VMALLOC_ALIGN,
> and I get huge pages.
>
> > The challenge is more on the "fd-based" side, where shmem on most distros
> > will disable THP completely.
> >
> > > as some of them are larger than 2M and would
> > > benefit from using huge pages. The VA fragmentation is trivial for this small
> > > number of aux blocks in a 64-bit address space, and is no different than it was
> > > for qemu_anon_ram_alloc.
> > >
> > > > For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
> > >
> > > qemu sets cloexec for all descriptors it opens to prevent them from accidentally
> > > being leaked to another process via fork+exec.
> >
> > But my question is why this is special? For example, we don't do that for
> > "-object memory-backend-memfd", am I right?
>
> We should, the backends also need to set cloexec when they use a cpr fd.
> I'll delete the call here and push it into cpr_find_fd.
Maybe we already have that? As CPR receives fds from iochannels. I am
looking at qio_channel_socket_copy_fds(), where we have:
#ifndef MSG_CMSG_CLOEXEC
qemu_set_cloexec(fd);
#endif
>
> > > > For 3rd, tracepoint would definitely be fine whenever you feel necessary.
> > > >
> > > > > Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
> > > > > and not fall back, because something unexpected went wrong. David said the same.
> > > >
> > > > Why? I was trying to rely on such fallback to make it work on e.g. Xen.
> > > > In that case, Xen fails there and fallback to xen_ram_alloc() inside the
> > > > later call to ram_block_add(), no?
> > >
> > > Why -- because something went wrong that should have worked, and we should report the
> > > first fault so its cause can be fixed and cpr can be used.
> >
> > Ahh so it's only about the corner cases where CPR could raise an error?
> > Can we rely on the failure later on "migrate" command to tell which
> > ramblock doesn't support it, so the user could be aware as well?
>
> The ramblock migration blocker will indeed tell us which block is a problem.
>
> But, we are throwing away potentially useful information by dropping the
> first error message on the floor. We should only fall back for expected
> failures. Unexpected failures mean there is something to fix.
>
> I can compromise and fail on errors from these:
> qemu_memfd_create(name, 0, 0, 0, 0, errp);
> qemu_shm_alloc(0, errp);
How are we going to be sure all existing systems using RAM_SHARED ramblocks
will always succeed on either memfd or sysv shm? IOW, what if there's a
system that can only support mmap(MAP_SHARED) but none of the two?
That's my major concern, on start failing some systems where it used to
work, even if they're corner cases.
>
> but ignore errors from the subsequent call to qemu_ram_alloc_from_fd,
> and fall back. That keeps the code simple.
>
> > > However, to do the above, but still quietly fallback if qemu_ram_alloc_from_fd
> > > fails because of xen or kvm, I would need to return different error codes from
> > > qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
> > > qemu_ram_alloc_from_fd.
> > >
> > > And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
> > > to modify the call site in the patch accordingly.
> >
> > Yep, I was thinking maybe qemu_ram_alloc_from_fd() had a stub function,
> > indeed looks not.. "allocating the fd" part definitely has, which I
> > remember I checked..
> >
> > > Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
> > > is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
> > > kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
> >
> > As long as you don't need to duplicate these two checks (or duplicate any
> > such check..) I'm ok.
> >
> > Reusing qemu_ram_alloc_from_fd() still sounds like the easiest to go. Yes
> > we'll need to teach it about resize(), used_length etc. to it, but they all
> > look sane to me. We didn't have those simply because we don't have use of
> > them, now we want to have resizable fd-based mem, that's the right thing to
> > do to support that on fd allocations.
> >
> > OTOH, duplicating xen/mmu checks isn't sane to me.. :( It will make the
> > code harder to maintain because the 3rd qemu_ram_alloc_from_fd() in the
> > future will need to duplicate it once more (or worse, forget it again until
> > xen / old kernels reports a failure)..
>
> I'll make the necessary changes to use qemu_ram_alloc_from_fd.
Thanks.
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 02/19] physmem: fd-based shared memory
2024-12-17 22:46 ` Peter Xu
@ 2024-12-18 16:34 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 16:34 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/17/2024 5:46 PM, Peter Xu wrote:
> On Tue, Dec 17, 2024 at 04:54:43PM -0500, Steven Sistare wrote:
>> On 12/16/2024 1:19 PM, Peter Xu wrote:
>>> On Fri, Dec 13, 2024 at 11:41:45AM -0500, Steven Sistare wrote:
>>>> On 12/12/2024 4:22 PM, Peter Xu wrote:
>>>>> On Thu, Dec 12, 2024 at 03:38:00PM -0500, Steven Sistare wrote:
>>>>>> On 12/9/2024 2:42 PM, Peter Xu wrote:
>>>>>>> On Mon, Dec 02, 2024 at 05:19:54AM -0800, Steve Sistare wrote:
>>>>>>>> @@ -2089,13 +2154,23 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>>>>>>>> new_block->page_size = qemu_real_host_page_size();
>>>>>>>> new_block->host = host;
>>>>>>>> new_block->flags = ram_flags;
>>>>>>>> +
>>>>>>>> + if (!host && !xen_enabled()) {
>>>>>>>
>>>>>>> Adding one more xen check is unnecessary. This patch needed it could mean
>>>>>>> that the patch can be refactored.. because we have xen checks in both
>>>>>>> ram_block_add() and also in the fd allocation path.
>>>>>>>
>>>>>>> At the meantime, see:
>>>>>>>
>>>>>>> qemu_ram_alloc_from_fd():
>>>>>>> if (kvm_enabled() && !kvm_has_sync_mmu()) {
>>>>>>> error_setg(errp,
>>>>>>> "host lacks kvm mmu notifiers, -mem-path unsupported");
>>>>>>> return NULL;
>>>>>>> }
>>>>>>>
>>>>>>> I don't think any decent kernel could hit this, but that could be another
>>>>>>> sign that this patch duplicated some file allocations.
>>>>>>>
>>>>>>>> + if ((new_block->flags & RAM_SHARED) &&
>>>>>>>> + !qemu_ram_alloc_shared(new_block, &local_err)) {
>>>>>>>> + goto err;
>>>>>>>> + }
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> ram_block_add(new_block, &local_err);
>>>>>>>> - if (local_err) {
>>>>>>>> - g_free(new_block);
>>>>>>>> - error_propagate(errp, local_err);
>>>>>>>> - return NULL;
>>>>>>>> + if (!local_err) {
>>>>>>>> + return new_block;
>>>>>>>> }
>>>>>>>> - return new_block;
>>>>>>>> +
>>>>>>>> +err:
>>>>>>>> + g_free(new_block);
>>>>>>>> + error_propagate(errp, local_err);
>>>>>>>> + return NULL;
>>>>>>>> }
>>>>>>>
>>>>>>> IIUC we only need to conditionally convert an anon-allocation into an
>>>>>>> fd-allocation, and then we don't need to mostly duplicate
>>>>>>> qemu_ram_alloc_from_fd(), instead we reuse it.
>>>>>>>
>>>>>>> I do have a few other comments elsewhere, but when I was trying to comment.
>>>>>>> E.g., we either shouldn't need to bother caching qemu_memfd_check()
>>>>>>> results, or do it in qemu_memfd_check() directly.. and some more.
>>>>>>
>>>>>> Someone thought it a good idea to cache the result of qemu_memfd_alloc_check,
>>>>>> and qemu_memfd_check will be called more often. I'll cache the result inside
>>>>>> qemu_memfd_check for the special case of flags=0.
>>>>>
>>>>> OK.
>>>>>
>>>>>>
>>>>>>> Then I think it's easier I provide a patch, and also show that it can be
>>>>>>> also smaller changes to do the same thing, with everything fixed up
>>>>>>> (e.g. addressing above mmu notifier missing issue). What do you think as
>>>>>>> below?
>>>>>>
>>>>>> The key change you make is calling qemu_ram_alloc_from_fd instead of file_ram_alloc,
>>>>>> which buys the xen and kvm checks for free. Sounds good, I will do that in the
>>>>>> context of my patch.
>>>>>>
>>>>>> Here are some other changes in your patch, and my responses:
>>>>>>
>>>>>> I will drop the "Retrying using MAP_ANON|MAP_SHARED" message, as you did.
>>>>>>
>>>>>> However, I am keeping QEMU_VMALLOC_ALIGN, qemu_set_cloexec, and trace_qemu_ram_alloc_shared.
>>>>>
>>>>> I guess no huge deal on these, however since we're talking.. Is that
>>>>> QEMU_VMALLOC_ALIGN from qemu_anon_ram_alloc()?
>>>>>
>>>>> A quick dig tells me that it was used to be for anon THPs..
>>>>>
>>>>> commit 36b586284e678da28df3af9fd0907d2b16f9311c
>>>>> Author: Avi Kivity <avi@redhat.com>
>>>>> Date: Mon Sep 5 11:07:05 2011 +0300
>>>>>
>>>>> qemu_vmalloc: align properly for transparent hugepages and KVM
>>>>>
>>>>> And I'm guessing if at that time was also majorly for guest ram.
>>>>>
>>>>> Considering that this path won't make an effect until the new aux mem
>>>>> option is on, I'd think it better to stick without anything special like
>>>>> QEMU_VMALLOC_ALIGN, until it's justified to be worthwhile. E.g., Avi used
>>>>> to explicitly mention this in that commit message:
>>>>>
>>>>> Adjust qemu_vmalloc() to honor that requirement. Ignore it for small regions
>>>>> to avoid fragmentation.
>>>>>
>>>>> And this is exactly mostly small regions when it's AUX.. probably except
>>>>> VGA, but it'll be SHARED on top of shmem not PRIVATE on anon anyway... so
>>>>> it'll be totally different things.
>>>>>
>>>>> So I won't worry on that 2M alignment, and I will try to not carry over
>>>>> that, because then trying to remove it will be harder.. even when we want.
>>>>
>>>> Yes, currently the aux allocations get QEMU_VMALLOC_ALIGN alignment in
>>>> qemu_anon_ram_alloc. I do the same for the shared fd mappings to guarantee
>>>> no performance regression,
>>>
>>> I don't know how we could guarantee that at all - anon and shmem uses
>>> different knobs to enable/disable THPs after all.. For example:
>>>
>>> $ ls /sys/kernel/mm/transparent_hugepage/*enabled
>>> /sys/kernel/mm/transparent_hugepage/enabled
>>> /sys/kernel/mm/transparent_hugepage/shmem_enabled
>>
>> Yes, but at least shmem_enabled is something the end user can fix. If
>> we bake a poor alignment into qemu, the user has no recourse. By setting
>> it to QEMU_VMALLOC_ALIGN, I eliminate alignment as a potential performance
>> issue. There is no practical downside. We should just do it, especially if
>> you believe "no huge deal on these" as written above :)
>
> I'd wager nobody will be able to notice the anon/shmem difference at all,
> so if it really regressed nobody will be able to fix it. :)
>
> Not to mention it's a global knob, and IMHO it doesn't make a lot of sense
> to change it for an aux mem not aligned.. while changing a global knob
> could OTOH break other things.
>
> But sure, if you do prefer having that I'm ok. Please still consider adding
> a comment then explaining where it came from..
>
>>
>>> And their default values normally differ too... it means after switching to
>>> fd based we do face the possibility that thp can be gone at least on the
>>> 1st 2mb.
>>>
>>> When I was suggesting it, I was hoping thp doesn't really matter that lot
>>> on aux mem, even for VGA.
>>>
>>> Btw, I don't even think the alignment will affect THP allocations for the
>>> whole vma, anyway? I mean, it's only about the initial 2MB portion.. IOW,
>>> when not aligned, I think the worst case is we have <2MB at start address
>>> that is not using THP, but later on when it starts to align with 2MB, THPs
>>> will be allocated again.
>>
>> It depends on the kernel version/implementation. In 6.13, it is not that
>> clever for memfd_create + mmap. An unaligned start means no huge pages anywhere
>> in the allocation, as shown by the page-types utility. Add QEMU_VMALLOC_ALIGN,
>> and I get huge pages.
>>
>>> The challenge is more on the "fd-based" side, where shmem on most distros
>>> will disable THP completely.
>>>
>>>> as some of them are larger than 2M and would
>>>> benefit from using huge pages. The VA fragmentation is trivial for this small
>>>> number of aux blocks in a 64-bit address space, and is no different than it was
>>>> for qemu_anon_ram_alloc.
>>>>
>>>>> For the 2nd.. Any quick answer on why explicit qemu_set_cloexec() needed?
>>>>
>>>> qemu sets cloexec for all descriptors it opens to prevent them from accidentally
>>>> being leaked to another process via fork+exec.
>>>
>>> But my question is why this is special? For example, we don't do that for
>>> "-object memory-backend-memfd", am I right?
>>
>> We should, the backends also need to set cloexec when they use a cpr fd.
>> I'll delete the call here and push it into cpr_find_fd.
>
> Maybe we already have that? As CPR receives fds from iochannels. I am
> looking at qio_channel_socket_copy_fds(), where we have:
>
> #ifndef MSG_CMSG_CLOEXEC
> qemu_set_cloexec(fd);
> #endif
Oh, interesting. qio_channel_socket_readv sets MSG_CMSG_CLOEXEC, and this
is the fallback if MSG_CMSG_CLOEXEC is not defined. QEMU does a good job of
setting cloexec everywhere. I'll remove my extra call.
>>>>> For 3rd, tracepoint would definitely be fine whenever you feel necessary.
>>>>>
>>>>>> Also, when qemu_memfd_create + qemu_ram_alloc_from_fd fails, qemu should fail and exit,
>>>>>> and not fall back, because something unexpected went wrong. David said the same.
>>>>>
>>>>> Why? I was trying to rely on such fallback to make it work on e.g. Xen.
>>>>> In that case, Xen fails there and fallback to xen_ram_alloc() inside the
>>>>> later call to ram_block_add(), no?
>>>>
>>>> Why -- because something went wrong that should have worked, and we should report the
>>>> first fault so its cause can be fixed and cpr can be used.
>>>
>>> Ahh so it's only about the corner cases where CPR could raise an error?
>>> Can we rely on the failure later on "migrate" command to tell which
>>> ramblock doesn't support it, so the user could be aware as well?
>>
>> The ramblock migration blocker will indeed tell us which block is a problem.
>>
>> But, we are throwing away potentially useful information by dropping the
>> first error message on the floor. We should only fall back for expected
>> failures. Unexpected failures mean there is something to fix.
>>
>> I can compromise and fail on errors from these:
>> qemu_memfd_create(name, 0, 0, 0, 0, errp);
>> qemu_shm_alloc(0, errp);
>
> How are we going to be sure all existing systems using RAM_SHARED ramblocks
> will always succeed on either memfd or sysv shm? IOW, what if there's a
> system that can only support mmap(MAP_SHARED) but none of the two?
Non-POSIX will still quietly use mmap(MAP_SHARED) via ifdefs.
I'll post V5 soon and we can further refine if necessary.
- Steve
> That's my major concern, on start failing some systems where it used to
> work, even if they're corner cases.
>
>>
>> but ignore errors from the subsequent call to qemu_ram_alloc_from_fd,
>> and fall back. That keeps the code simple.
>>
>>>> However, to do the above, but still quietly fallback if qemu_ram_alloc_from_fd
>>>> fails because of xen or kvm, I would need to return different error codes from
>>>> qemu_ram_alloc_from_fd. Doable, but requires tweaks to all occurrences of
>>>> qemu_ram_alloc_from_fd.
>>>>
>>>> And BTW, qemu_ram_alloc_from_fd is defined for CONFIG_POSIX only. I need
>>>> to modify the call site in the patch accordingly.
>>>
>>> Yep, I was thinking maybe qemu_ram_alloc_from_fd() had a stub function,
>>> indeed looks not.. "allocating the fd" part definitely has, which I
>>> remember I checked..
>>>
>>>> Overall, I am not convinced that using qemu_ram_alloc_from_fd in this patch
>>>> is better/simpler than my V4 patch using file_ram_alloc, plus adding xen and
>>>> kvm_has_sync_mmu checks in qemu_ram_alloc_internal.
>>>
>>> As long as you don't need to duplicate these two checks (or duplicate any
>>> such check..) I'm ok.
>>>
>>> Reusing qemu_ram_alloc_from_fd() still sounds like the easiest to go. Yes
>>> we'll need to teach it about resize(), used_length etc. to it, but they all
>>> look sane to me. We didn't have those simply because we don't have use of
>>> them, now we want to have resizable fd-based mem, that's the right thing to
>>> do to support that on fd allocations.
>>>
>>> OTOH, duplicating xen/mmu checks isn't sane to me.. :( It will make the
>>> code harder to maintain because the 3rd qemu_ram_alloc_from_fd() in the
>>> future will need to duplicate it once more (or worse, forget it again until
>>> xen / old kernels reports a failure)..
>>
>> I'll make the necessary changes to use qemu_ram_alloc_from_fd.
>
> Thanks.
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 03/19] memory: add RAM_PRIVATE
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
2024-12-02 13:19 ` [PATCH V4 01/19] backends/hostmem-shm: factor out allocation of "anonymous shared memory with an fd" Steve Sistare
2024-12-02 13:19 ` [PATCH V4 02/19] physmem: fd-based shared memory Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-09 19:45 ` Peter Xu
2024-12-02 13:19 ` [PATCH V4 04/19] machine: aux-ram-share option Steve Sistare
` (15 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Define the RAM_PRIVATE flag.
In RAMBlock creation functions, if RAM_SHARED is 0 in the flags parameter,
in a subsequent patch the implementation may still create a shared mapping
if other conditions require it. Callers who specifically want a private
mapping, eg for objects specified by the user, must pass RAM_PRIVATE.
After RAMBlock creation, RAM_SHARED in the block's flags indicates whether
the block is shared or private, and RAM_PRIVATE is omitted.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
backends/hostmem-epc.c | 2 +-
backends/hostmem-file.c | 2 +-
backends/hostmem-memfd.c | 2 +-
backends/hostmem-ram.c | 2 +-
include/exec/memory.h | 10 ++++++++++
system/physmem.c | 15 ++++++++++++---
6 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/backends/hostmem-epc.c b/backends/hostmem-epc.c
index 6c024d6..3148ffa 100644
--- a/backends/hostmem-epc.c
+++ b/backends/hostmem-epc.c
@@ -36,7 +36,7 @@ sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
backend->aligned = true;
name = object_get_canonical_path(OBJECT(backend));
- ram_flags = (backend->share ? RAM_SHARED : 0) | RAM_PROTECTED;
+ ram_flags = (backend->share ? RAM_SHARED : RAM_PRIVATE) | RAM_PROTECTED;
return memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name,
backend->size, ram_flags, fd, 0, errp);
}
diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 7e5072e..8cc10cb 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -82,7 +82,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
backend->aligned = true;
name = host_memory_backend_get_name(backend);
- ram_flags = backend->share ? RAM_SHARED : 0;
+ ram_flags = backend->share ? RAM_SHARED : RAM_PRIVATE;
ram_flags |= fb->readonly ? RAM_READONLY_FD : 0;
ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index 9f890a8..1bcae4b 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -52,7 +52,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
backend->aligned = true;
name = host_memory_backend_get_name(backend);
- ram_flags = backend->share ? RAM_SHARED : 0;
+ ram_flags = backend->share ? RAM_SHARED : RAM_PRIVATE;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
return memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name,
diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
index f7d81af..b380563 100644
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -28,7 +28,7 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
}
name = host_memory_backend_get_name(backend);
- ram_flags = backend->share ? RAM_SHARED : 0;
+ ram_flags = backend->share ? RAM_SHARED : RAM_PRIVATE;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
return memory_region_init_ram_flags_nomigrate(&backend->mr, OBJECT(backend),
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 9458e28..0ac21cc 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -246,6 +246,16 @@ typedef struct IOMMUTLBEvent {
/* RAM can be private that has kvm guest memfd backend */
#define RAM_GUEST_MEMFD (1 << 12)
+/*
+ * In RAMBlock creation functions, if RAM_SHARED is 0 in the flags parameter,
+ * the implementation may still create a shared mapping if other conditions
+ * require it. Callers who specifically want a private mapping, eg objects
+ * specified by the user, must pass RAM_PRIVATE.
+ * After RAMBlock creation, RAM_SHARED in the block's flags indicates whether
+ * the block is shared or private, and RAM_PRIVATE is omitted.
+ */
+#define RAM_PRIVATE (1 << 13)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
diff --git a/system/physmem.c b/system/physmem.c
index b0c4b22..36f0811 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1949,7 +1949,11 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
{
RAMBlock *new_block;
Error *local_err = NULL;
- int64_t file_size, file_align;
+ int64_t file_size, file_align, share_flags;
+
+ share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
+ assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
+ ram_flags &= ~RAM_PRIVATE;
/* Just support these ram flags by now. */
assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
@@ -2132,7 +2136,11 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
{
RAMBlock *new_block;
Error *local_err = NULL;
- int align;
+ int align, share_flags;
+
+ share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
+ assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
+ ram_flags &= ~RAM_PRIVATE;
assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
@@ -2183,7 +2191,8 @@ RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
MemoryRegion *mr, Error **errp)
{
- assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
+ assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD |
+ RAM_PRIVATE)) == 0);
return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 03/19] memory: add RAM_PRIVATE
2024-12-02 13:19 ` [PATCH V4 03/19] memory: add RAM_PRIVATE Steve Sistare
@ 2024-12-09 19:45 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-09 19:45 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:19:55AM -0800, Steve Sistare wrote:
> Define the RAM_PRIVATE flag.
>
> In RAMBlock creation functions, if MAP_SHARED is 0 in the flags parameter,
> in a subsequent patch the implementation may still create a shared mapping
> if other conditions require it. Callers who specifically want a private
> mapping, eg for objects specified by the user, must pass RAM_PRIVATE.
>
> After RAMBlock creation, MAP_SHARED in the block's flags indicates whether
> the block is shared or private, and MAP_PRIVATE is omitted.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 04/19] machine: aux-ram-share option
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (2 preceding siblings ...)
2024-12-02 13:19 ` [PATCH V4 03/19] memory: add RAM_PRIVATE Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-05 8:25 ` Markus Armbruster
` (2 more replies)
2024-12-02 13:19 ` [PATCH V4 05/19] migration: cpr-state Steve Sistare
` (14 subsequent siblings)
18 siblings, 3 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Allocate auxiliary guest RAM as an anonymous file that is shareable
with an external process. This option applies to memory allocated as
a side effect of creating various devices. It does not apply to
memory-backend-objects, whether explicitly specified on the command
line, or implicitly created by the -m command line option.
This option is intended to support new migration modes, in which the
memory region can be transferred in place to a new QEMU process, by sending
the memfd file descriptor to the process. Memory contents are preserved,
and if the mode also transfers device descriptors, then pages that are
locked in memory for DMA remain locked. This behavior is a pre-requisite
for supporting vfio, vdpa, and iommufd devices with the new modes.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
hw/core/machine.c | 18 ++++++++++++++++++
include/hw/boards.h | 1 +
qemu-options.hx | 15 +++++++++++++++
system/physmem.c | 3 +++
4 files changed, 37 insertions(+)
diff --git a/hw/core/machine.c b/hw/core/machine.c
index a35c4a8..b299b40 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -437,6 +437,20 @@ static void machine_set_mem_merge(Object *obj, bool value, Error **errp)
ms->mem_merge = value;
}
+static bool machine_get_aux_ram_share(Object *obj, Error **errp)
+{
+ MachineState *ms = MACHINE(obj);
+
+ return ms->aux_ram_share;
+}
+
+static void machine_set_aux_ram_share(Object *obj, bool value, Error **errp)
+{
+ MachineState *ms = MACHINE(obj);
+
+ ms->aux_ram_share = value;
+}
+
static bool machine_get_usb(Object *obj, Error **errp)
{
MachineState *ms = MACHINE(obj);
@@ -1129,6 +1143,10 @@ static void machine_class_init(ObjectClass *oc, void *data)
object_class_property_set_description(oc, "mem-merge",
"Enable/disable memory merge support");
+ object_class_property_add_bool(oc, "aux-ram-share",
+ machine_get_aux_ram_share,
+ machine_set_aux_ram_share);
+
object_class_property_add_bool(oc, "usb",
machine_get_usb, machine_set_usb);
object_class_property_set_description(oc, "usb",
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 36fbb9b..922ecd4 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -410,6 +410,7 @@ struct MachineState {
bool enable_graphics;
ConfidentialGuestSupport *cgs;
HostMemoryBackend *memdev;
+ bool aux_ram_share;
/*
* convenience alias to ram_memdev_id backend memory region
* or to numa container memory region
diff --git a/qemu-options.hx b/qemu-options.hx
index dacc979..02b9118 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
" nvdimm=on|off controls NVDIMM support (default=off)\n"
" memory-encryption=@var{} memory encryption object to use (default=none)\n"
" hmat=on|off controls ACPI HMAT support (default=off)\n"
+#ifdef CONFIG_POSIX
+ " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
+#endif
" memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
" cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
QEMU_ARCH_ALL)
@@ -101,6 +104,18 @@ SRST
Enables or disables ACPI Heterogeneous Memory Attribute Table
(HMAT) support. The default is off.
+#ifdef CONFIG_POSIX
+ ``aux-ram-share=on|off``
+ Allocate auxiliary guest RAM as an anonymous file that is
+ shareable with an external process. This option applies to
+ memory allocated as a side effect of creating various devices.
+ It does not apply to memory-backend-objects, whether explicitly
+ specified on the command line, or implicitly created by the -m
+ command line option.
+
+ Some migration modes require aux-ram-share=on.
+#endif
+
``memory-backend='id'``
An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options.
Allows to use a memory backend as main RAM.
diff --git a/system/physmem.c b/system/physmem.c
index 36f0811..0bcb2cc 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2164,6 +2164,9 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
new_block->flags = ram_flags;
if (!host && !xen_enabled()) {
+ if (!share_flags && current_machine->aux_ram_share) {
+ new_block->flags |= RAM_SHARED;
+ }
if ((new_block->flags & RAM_SHARED) &&
!qemu_ram_alloc_shared(new_block, &local_err)) {
goto err;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-02 13:19 ` [PATCH V4 04/19] machine: aux-ram-share option Steve Sistare
@ 2024-12-05 8:25 ` Markus Armbruster
2024-12-05 14:24 ` Steven Sistare
2024-12-05 12:08 ` Markus Armbruster
2024-12-09 19:54 ` Peter Xu
2 siblings, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-05 8:25 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steve Sistare <steven.sistare@oracle.com> writes:
> Allocate auxilliary guest RAM as an anonymous file that is shareable
> with an external process. This option applies to memory allocated as
> a side effect of creating various devices. It does not apply to
> memory-backend-objects, whether explicitly specified on the command
> line, or implicitly created by the -m command line option.
>
> This option is intended to support new migration modes, in which the
> memory region can be transferred in place to a new QEMU process, by sending
> the memfd file descriptor to the process. Memory contents are preserved,
> and if the mode also transfers device descriptors, then pages that are
> locked in memory for DMA remain locked. This behavior is a pre-requisite
> for supporting vfio, vdpa, and iommufd devices with the new modes.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
> diff --git a/qemu-options.hx b/qemu-options.hx
> index dacc979..02b9118 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
> " nvdimm=on|off controls NVDIMM support (default=off)\n"
> " memory-encryption=@var{} memory encryption object to use (default=none)\n"
> " hmat=on|off controls ACPI HMAT support (default=off)\n"
> +#ifdef CONFIG_POSIX
> + " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
> +#endif
> " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
> " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
> QEMU_ARCH_ALL)
> @@ -101,6 +104,18 @@ SRST
> Enables or disables ACPI Heterogeneous Memory Attribute Table
> (HMAT) support. The default is off.
>
> +#ifdef CONFIG_POSIX
> + ``aux-ram-share=on|off``
> + Allocate auxiliary guest RAM as an anonymous file that is
> + shareable with an external process. This option applies to
> + memory allocated as a side effect of creating various devices.
> + It does not apply to memory-backend-objects, whether explicitly
> + specified on the command line, or implicitly created by the -m
> + command line option.
> +
> + Some migration modes require aux-ram-share=on.
This leaves the one thing users really need to know unsaid: when exactly
should users enable it.
"Some migration modes require aux-ram-share=on": do they enable it by
default, or is that left to the user? If the latter, why?
Please document the default, whatever it is.
> +#endif
> +
> ``memory-backend='id'``
> An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options.
> Allows to use a memory backend as main RAM.
[...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-05 8:25 ` Markus Armbruster
@ 2024-12-05 14:24 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-05 14:24 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/5/2024 3:25 AM, Markus Armbruster wrote:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Allocate auxilliary guest RAM as an anonymous file that is shareable
>> with an external process. This option applies to memory allocated as
>> a side effect of creating various devices. It does not apply to
>> memory-backend-objects, whether explicitly specified on the command
>> line, or implicitly created by the -m command line option.
>>
>> This option is intended to support new migration modes, in which the
>> memory region can be transferred in place to a new QEMU process, by sending
>> the memfd file descriptor to the process. Memory contents are preserved,
>> and if the mode also transfers device descriptors, then pages that are
>> locked in memory for DMA remain locked. This behavior is a pre-requisite
>> for supporting vfio, vdpa, and iommufd devices with the new modes.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> [...]
>
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index dacc979..02b9118 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>> " nvdimm=on|off controls NVDIMM support (default=off)\n"
>> " memory-encryption=@var{} memory encryption object to use (default=none)\n"
>> " hmat=on|off controls ACPI HMAT support (default=off)\n"
>> +#ifdef CONFIG_POSIX
>> + " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
>> +#endif
>> " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
>> " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
>> QEMU_ARCH_ALL)
>> @@ -101,6 +104,18 @@ SRST
>> Enables or disables ACPI Heterogeneous Memory Attribute Table
>> (HMAT) support. The default is off.
>>
>> +#ifdef CONFIG_POSIX
>> + ``aux-ram-share=on|off``
>> + Allocate auxiliary guest RAM as an anonymous file that is
>> + shareable with an external process. This option applies to
>> + memory allocated as a side effect of creating various devices.
>> + It does not apply to memory-backend-objects, whether explicitly
>> + specified on the command line, or implicitly created by the -m
>> + command line option.
>> +
>> + Some migration modes require aux-ram-share=on.
>
> This leaves the one thing users really need to know unsaid: when exactly
> should users enable it.
>
> "Some migration modes require aux-ram-share=on": do they enable it by
> default, or is that left to the user? If the latter, why?
>
> Please document the default, whatever it is.
How about:
``aux-ram-share=on|off``
...
command line option. The default is off.
To use the cpr-transfer migration mode, you must set aux-ram-share=on.
cpr-transfer is a forward reference at this point in the series, so I will
move that last line to the "cpr-transfer mode" patch.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-02 13:19 ` [PATCH V4 04/19] machine: aux-ram-share option Steve Sistare
2024-12-05 8:25 ` Markus Armbruster
@ 2024-12-05 12:08 ` Markus Armbruster
2024-12-05 12:19 ` Markus Armbruster
2024-12-09 19:54 ` Peter Xu
2 siblings, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-05 12:08 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange, Markus Armbruster
Steve Sistare <steven.sistare@oracle.com> writes:
> Allocate auxilliary guest RAM as an anonymous file that is shareable
> with an external process. This option applies to memory allocated as
> a side effect of creating various devices. It does not apply to
> memory-backend-objects, whether explicitly specified on the command
> line, or implicitly created by the -m command line option.
>
> This option is intended to support new migration modes, in which the
> memory region can be transferred in place to a new QEMU process, by sending
> the memfd file descriptor to the process. Memory contents are preserved,
> and if the mode also transfers device descriptors, then pages that are
> locked in memory for DMA remain locked. This behavior is a pre-requisite
> for supporting vfio, vdpa, and iommufd devices with the new modes.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
> diff --git a/qemu-options.hx b/qemu-options.hx
> index dacc979..02b9118 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
> " nvdimm=on|off controls NVDIMM support (default=off)\n"
> " memory-encryption=@var{} memory encryption object to use (default=none)\n"
> " hmat=on|off controls ACPI HMAT support (default=off)\n"
> +#ifdef CONFIG_POSIX
> + " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
> +#endif
> " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
> " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
> QEMU_ARCH_ALL)
> @@ -101,6 +104,18 @@ SRST
> Enables or disables ACPI Heterogeneous Memory Attribute Table
> (HMAT) support. The default is off.
>
> +#ifdef CONFIG_POSIX
> + ``aux-ram-share=on|off``
> + Allocate auxiliary guest RAM as an anonymous file that is
> + shareable with an external process. This option applies to
> + memory allocated as a side effect of creating various devices.
> + It does not apply to memory-backend-objects, whether explicitly
> + specified on the command line, or implicitly created by the -m
> + command line option.
> +
> + Some migration modes require aux-ram-share=on.
> +#endif
> +
I get
Warning, treated as error:
.../qemu-options.hx:117:Definition list ends without a blank line; unexpected unindent.
Putting the blank line before #endif works for me.
> ``memory-backend='id'``
> An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options.
> Allows to use a memory backend as main RAM.
[...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-05 12:08 ` Markus Armbruster
@ 2024-12-05 12:19 ` Markus Armbruster
2024-12-05 14:24 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-05 12:19 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Markus Armbruster <armbru@redhat.com> writes:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Allocate auxilliary guest RAM as an anonymous file that is shareable
>> with an external process. This option applies to memory allocated as
>> a side effect of creating various devices. It does not apply to
>> memory-backend-objects, whether explicitly specified on the command
>> line, or implicitly created by the -m command line option.
>>
>> This option is intended to support new migration modes, in which the
>> memory region can be transferred in place to a new QEMU process, by sending
>> the memfd file descriptor to the process. Memory contents are preserved,
>> and if the mode also transfers device descriptors, then pages that are
>> locked in memory for DMA remain locked. This behavior is a pre-requisite
>> for supporting vfio, vdpa, and iommufd devices with the new modes.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> [...]
>
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index dacc979..02b9118 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>> " nvdimm=on|off controls NVDIMM support (default=off)\n"
>> " memory-encryption=@var{} memory encryption object to use (default=none)\n"
>> " hmat=on|off controls ACPI HMAT support (default=off)\n"
>> +#ifdef CONFIG_POSIX
>> + " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
>> +#endif
>> " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
>> " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
>> QEMU_ARCH_ALL)
>> @@ -101,6 +104,18 @@ SRST
>> Enables or disables ACPI Heterogeneous Memory Attribute Table
>> (HMAT) support. The default is off.
>>
>> +#ifdef CONFIG_POSIX
>> + ``aux-ram-share=on|off``
>> + Allocate auxiliary guest RAM as an anonymous file that is
>> + shareable with an external process. This option applies to
>> + memory allocated as a side effect of creating various devices.
>> + It does not apply to memory-backend-objects, whether explicitly
>> + specified on the command line, or implicitly created by the -m
>> + command line option.
>> +
>> + Some migration modes require aux-ram-share=on.
>> +#endif
>> +
>
> I get
>
> Warning, treated as error:
> .../qemu-options.hx:117:Definition list ends without a blank line; unexpected unindent.
>
> Putting the blank line before #endif works for me.
Actually, #ifdef does not work within SRST ... ERST.
Elsewhere, we document build-time optional features unconditionally.
Simply drop the #ifdef here.
>> ``memory-backend='id'``
>> An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options.
>> Allows to use a memory backend as main RAM.
>
> [...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-05 12:19 ` Markus Armbruster
@ 2024-12-05 14:24 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-05 14:24 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/5/2024 7:19 AM, Markus Armbruster wrote:
> Markus Armbruster <armbru@redhat.com> writes:
>
>> Steve Sistare <steven.sistare@oracle.com> writes:
>>
>>> Allocate auxilliary guest RAM as an anonymous file that is shareable
>>> with an external process. This option applies to memory allocated as
>>> a side effect of creating various devices. It does not apply to
>>> memory-backend-objects, whether explicitly specified on the command
>>> line, or implicitly created by the -m command line option.
>>>
>>> This option is intended to support new migration modes, in which the
>>> memory region can be transferred in place to a new QEMU process, by sending
>>> the memfd file descriptor to the process. Memory contents are preserved,
>>> and if the mode also transfers device descriptors, then pages that are
>>> locked in memory for DMA remain locked. This behavior is a pre-requisite
>>> for supporting vfio, vdpa, and iommufd devices with the new modes.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>
>> [...]
>>
>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>> index dacc979..02b9118 100644
>>> --- a/qemu-options.hx
>>> +++ b/qemu-options.hx
>>> @@ -38,6 +38,9 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
>>> " nvdimm=on|off controls NVDIMM support (default=off)\n"
>>> " memory-encryption=@var{} memory encryption object to use (default=none)\n"
>>> " hmat=on|off controls ACPI HMAT support (default=off)\n"
>>> +#ifdef CONFIG_POSIX
>>> + " aux-ram-share=on|off allocate auxiliary guest RAM as shared (default: off)\n"
>>> +#endif
>>> " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n"
>>> " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n",
>>> QEMU_ARCH_ALL)
>>> @@ -101,6 +104,18 @@ SRST
>>> Enables or disables ACPI Heterogeneous Memory Attribute Table
>>> (HMAT) support. The default is off.
>>>
>>> +#ifdef CONFIG_POSIX
>>> + ``aux-ram-share=on|off``
>>> + Allocate auxiliary guest RAM as an anonymous file that is
>>> + shareable with an external process. This option applies to
>>> + memory allocated as a side effect of creating various devices.
>>> + It does not apply to memory-backend-objects, whether explicitly
>>> + specified on the command line, or implicitly created by the -m
>>> + command line option.
>>> +
>>> + Some migration modes require aux-ram-share=on.
>>> +#endif
>>> +
>>
>> I get
>>
>> Warning, treated as error:
>> .../qemu-options.hx:117:Definition list ends without a blank line; unexpected unindent.
>>
>> Putting the blank line before #endif works for me.
>
> Actually, #ifdef does not work within SRST ... ERST.
>
> Elsewhere, we document build-time optional features unconditionally.
> Simply drop the #ifdef here.
Thanks Markus. I see the "#ifdef" literal emitted in the man page. I'll delete it.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-02 13:19 ` [PATCH V4 04/19] machine: aux-ram-share option Steve Sistare
2024-12-05 8:25 ` Markus Armbruster
2024-12-05 12:08 ` Markus Armbruster
@ 2024-12-09 19:54 ` Peter Xu
2024-12-12 20:38 ` Steven Sistare
2 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-09 19:54 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:19:56AM -0800, Steve Sistare wrote:
> diff --git a/system/physmem.c b/system/physmem.c
> index 36f0811..0bcb2cc 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -2164,6 +2164,9 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> new_block->flags = ram_flags;
>
> if (!host && !xen_enabled()) {
> + if (!share_flags && current_machine->aux_ram_share) {
> + new_block->flags |= RAM_SHARED;
> + }
Just to mention that if you agree with what I said in patch 2, here it will
need some trivial rebase change. IOW, IMO we shouldn't special case xen
either here, so it should also apply to xen if one chose to, changing aux
alloc to RAM_SHARED.
Frankly I don't know whether xen respects RAM_SHARED at all for anonymous,
but it's a separate question to ask..
Basically what will happen later is in cpr-transfer migrate cmd, it'll fail
for xen properly seeing fd==-1. That'll be fine, IMHO.
> if ((new_block->flags & RAM_SHARED) &&
> !qemu_ram_alloc_shared(new_block, &local_err)) {
> goto err;
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-09 19:54 ` Peter Xu
@ 2024-12-12 20:38 ` Steven Sistare
2024-12-12 21:22 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-12 20:38 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/9/2024 2:54 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:19:56AM -0800, Steve Sistare wrote:
>> diff --git a/system/physmem.c b/system/physmem.c
>> index 36f0811..0bcb2cc 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -2164,6 +2164,9 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
>> new_block->flags = ram_flags;
>>
>> if (!host && !xen_enabled()) {
>> + if (!share_flags && current_machine->aux_ram_share) {
>> + new_block->flags |= RAM_SHARED;
>> + }
>
> Just to mention that if you agree with what I said in patch 2, here it will
> need some trivial rebase change. IOW, IMO we shouldn't special case xen
> either here, so it should also apply to xen if one chose to, changing aux
> alloc to RAM_SHARED.
OK.
So, if this only requires a trivial change, do I get your RB?
- Steve
>
> Frankly I don't know whether xen respects RAM_SHARED at all for anonymous,
> but it's a separate question to ask..
>
> Basically what will happen later is in cpr-transfer migrate cmd, it'll fail
> for xen properly seeing fd==-1. That'll be fine, IMHO.
>
>> if ((new_block->flags & RAM_SHARED) &&
>> !qemu_ram_alloc_shared(new_block, &local_err)) {
>> goto err;
>
>
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 04/19] machine: aux-ram-share option
2024-12-12 20:38 ` Steven Sistare
@ 2024-12-12 21:22 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-12 21:22 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Thu, Dec 12, 2024 at 03:38:07PM -0500, Steven Sistare wrote:
> On 12/9/2024 2:54 PM, Peter Xu wrote:
> > On Mon, Dec 02, 2024 at 05:19:56AM -0800, Steve Sistare wrote:
> > > diff --git a/system/physmem.c b/system/physmem.c
> > > index 36f0811..0bcb2cc 100644
> > > --- a/system/physmem.c
> > > +++ b/system/physmem.c
> > > @@ -2164,6 +2164,9 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
> > > new_block->flags = ram_flags;
> > > if (!host && !xen_enabled()) {
> > > + if (!share_flags && current_machine->aux_ram_share) {
> > > + new_block->flags |= RAM_SHARED;
> > > + }
> >
> > Just to mention that if you agree with what I said in patch 2, here it will
> > need some trivial rebase change. IOW, IMO we shouldn't special case xen
> > either here, so it should also apply to xen if one chose to, changing aux
> > alloc to RAM_SHARED.
>
> OK.
>
> So, if this only requires a trivial change, do I get your RB?
Yes please.
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 05/19] migration: cpr-state
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (3 preceding siblings ...)
2024-12-02 13:19 ` [PATCH V4 04/19] machine: aux-ram-share option Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-02 13:19 ` [PATCH V4 06/19] physmem: preserve ram blocks for cpr Steve Sistare
` (13 subsequent siblings)
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
CPR must save state that is needed after QEMU is restarted, when devices
are realized. Thus the extra state cannot be saved in the migration
channel, as objects must already exist before that channel can be loaded.
Instead, define auxiliary state structures and vmstate descriptions, not
associated with any registered object, and serialize the aux state to a
cpr-specific channel in cpr_state_save. Deserialize in cpr_state_load
after QEMU restarts, before devices are realized.
Provide accessors for clients to register file descriptors for saving.
The mechanism for passing the fd's to the new process will be specific
to each migration mode, and added in subsequent patches.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
include/migration/cpr.h | 25 ++++++
migration/cpr.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++
migration/meson.build | 1 +
migration/migration.c | 1 +
migration/trace-events | 7 ++
5 files changed, 232 insertions(+)
create mode 100644 include/migration/cpr.h
create mode 100644 migration/cpr.c
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
new file mode 100644
index 0000000..201d66d
--- /dev/null
+++ b/include/migration/cpr.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2021, 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef MIGRATION_CPR_H
+#define MIGRATION_CPR_H
+
+#include "qapi/qapi-types-migration.h"
+
+#define QEMU_CPR_FILE_MAGIC 0x51435052
+#define QEMU_CPR_FILE_VERSION 0x00000001
+
+void cpr_save_fd(const char *name, int id, int fd);
+void cpr_delete_fd(const char *name, int id);
+int cpr_find_fd(const char *name, int id);
+
+int cpr_state_save(MigrationChannel *channel, Error **errp);
+int cpr_state_load(Error **errp);
+void cpr_state_close(void);
+struct QIOChannel *cpr_state_ioc(void);
+
+#endif
diff --git a/migration/cpr.c b/migration/cpr.c
new file mode 100644
index 0000000..1e2878c
--- /dev/null
+++ b/migration/cpr.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021-2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "migration/cpr.h"
+#include "migration/misc.h"
+#include "migration/options.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/vmstate.h"
+#include "sysemu/runstate.h"
+#include "trace.h"
+
+/*************************************************************************/
+/* cpr state container for all information to be saved. */
+
+typedef QLIST_HEAD(CprFdList, CprFd) CprFdList;
+
+typedef struct CprState {
+ CprFdList fds;
+} CprState;
+
+static CprState cpr_state;
+
+/****************************************************************************/
+
+typedef struct CprFd {
+ char *name;
+ unsigned int namelen;
+ int id;
+ int fd;
+ QLIST_ENTRY(CprFd) next;
+} CprFd;
+
+static const VMStateDescription vmstate_cpr_fd = {
+ .name = "cpr fd",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(namelen, CprFd),
+ VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen),
+ VMSTATE_INT32(id, CprFd),
+ VMSTATE_INT32(fd, CprFd),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+void cpr_save_fd(const char *name, int id, int fd)
+{
+ CprFd *elem = g_new0(CprFd, 1);
+
+ trace_cpr_save_fd(name, id, fd);
+ elem->name = g_strdup(name);
+ elem->namelen = strlen(name) + 1;
+ elem->id = id;
+ elem->fd = fd;
+ QLIST_INSERT_HEAD(&cpr_state.fds, elem, next);
+}
+
+static CprFd *find_fd(CprFdList *head, const char *name, int id)
+{
+ CprFd *elem;
+
+ QLIST_FOREACH(elem, head, next) {
+ if (!strcmp(elem->name, name) && elem->id == id) {
+ return elem;
+ }
+ }
+ return NULL;
+}
+
+void cpr_delete_fd(const char *name, int id)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+
+ if (elem) {
+ QLIST_REMOVE(elem, next);
+ g_free(elem->name);
+ g_free(elem);
+ }
+
+ trace_cpr_delete_fd(name, id);
+}
+
+int cpr_find_fd(const char *name, int id)
+{
+ CprFd *elem = find_fd(&cpr_state.fds, name, id);
+ int fd = elem ? elem->fd : -1;
+
+ trace_cpr_find_fd(name, id, fd);
+ return fd;
+}
+/*************************************************************************/
+#define CPR_STATE "CprState"
+
+static const VMStateDescription vmstate_cpr_state = {
+ .name = CPR_STATE,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, next),
+ VMSTATE_END_OF_LIST()
+ }
+};
+/*************************************************************************/
+
+static QEMUFile *cpr_state_file;
+
+QIOChannel *cpr_state_ioc(void)
+{
+ return qemu_file_get_ioc(cpr_state_file);
+}
+
+int cpr_state_save(MigrationChannel *channel, Error **errp)
+{
+ int ret;
+ QEMUFile *f;
+ MigMode mode = migrate_mode();
+
+ trace_cpr_state_save(MigMode_str(mode));
+
+ /* set f based on mode in a later patch in this series */
+ return 0;
+
+ qemu_put_be32(f, QEMU_CPR_FILE_MAGIC);
+ qemu_put_be32(f, QEMU_CPR_FILE_VERSION);
+
+ ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0);
+ if (ret) {
+ error_setg(errp, "vmstate_save_state error %d", ret);
+ qemu_fclose(f);
+ return ret;
+ }
+
+ /*
+ * Close the socket only partially so we can later detect when the other
+ * end closes by getting a HUP event.
+ */
+ qemu_fflush(f);
+ qio_channel_shutdown(qemu_file_get_ioc(f), QIO_CHANNEL_SHUTDOWN_WRITE,
+ NULL);
+ cpr_state_file = f;
+ return 0;
+}
+
+int cpr_state_load(Error **errp)
+{
+ int ret;
+ uint32_t v;
+ QEMUFile *f;
+ MigMode mode = 0;
+
+ /* set f and mode based on other parameters later in this patch series */
+ return 0;
+
+ trace_cpr_state_load(MigMode_str(mode));
+
+ v = qemu_get_be32(f);
+ if (v != QEMU_CPR_FILE_MAGIC) {
+ error_setg(errp, "Not a migration stream (bad magic %x)", v);
+ qemu_fclose(f);
+ return -EINVAL;
+ }
+ v = qemu_get_be32(f);
+ if (v != QEMU_CPR_FILE_VERSION) {
+ error_setg(errp, "Unsupported migration stream version %d", v);
+ qemu_fclose(f);
+ return -ENOTSUP;
+ }
+
+ ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1);
+ if (ret) {
+ error_setg(errp, "vmstate_load_state error %d", ret);
+ qemu_fclose(f);
+ return ret;
+ }
+
+ /*
+ * Let the caller decide when to close the socket (and generate a HUP event
+ * for the sending side).
+ */
+ cpr_state_file = f;
+
+ return ret;
+}
+
+void cpr_state_close(void)
+{
+ if (cpr_state_file) {
+ qemu_fclose(cpr_state_file);
+ cpr_state_file = NULL;
+ }
+}
diff --git a/migration/meson.build b/migration/meson.build
index d53cf34..039f0f9 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -13,6 +13,7 @@ system_ss.add(files(
'block-dirty-bitmap.c',
'channel.c',
'channel-block.c',
+ 'cpr.c',
'cpu-throttle.c',
'dirtyrate.c',
'exec.c',
diff --git a/migration/migration.c b/migration/migration.c
index 8c5bd0a..83dabc7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -27,6 +27,7 @@
#include "sysemu/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
+#include "migration/cpr.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
diff --git a/migration/trace-events b/migration/trace-events
index bb0e0cc..89c0244 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -342,6 +342,13 @@ colo_receive_message(const char *msg) "Receive '%s' message"
# colo-failover.c
colo_failover_set_state(const char *new_state) "new state %s"
+# cpr.c
+cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d"
+cpr_delete_fd(const char *name, int id) "%s, id %d"
+cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d"
+cpr_state_save(const char *mode) "%s mode"
+cpr_state_load(const char *mode) "%s mode"
+
# block-dirty-bitmap.c
send_bitmap_header_enter(void) ""
send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (4 preceding siblings ...)
2024-12-02 13:19 ` [PATCH V4 05/19] migration: cpr-state Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-09 20:07 ` Peter Xu
2024-12-02 13:19 ` [PATCH V4 07/19] hostmem-memfd: preserve " Steve Sistare
` (12 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Save the memfd for ramblocks in CPR state, along with a name that
uniquely identifies it. The block's idstr is not yet set, so it
cannot be used for this purpose. Find the saved memfd in new QEMU when
creating a block. If the block size is larger in new QEMU, extend the
block using ftruncate, and the extra space will be usable after a guest
reset.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
1 file changed, 34 insertions(+), 2 deletions(-)
diff --git a/system/physmem.c b/system/physmem.c
index 0bcb2cc..aa095a3 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -70,6 +70,7 @@
#include "qemu/pmem.h"
+#include "migration/cpr.h"
#include "migration/vmstate.h"
#include "qemu/range.h"
@@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
}
}
+static char *cpr_name(RAMBlock *block)
+{
+ MemoryRegion *mr = block->mr;
+ const char *mr_name = memory_region_name(mr);
+ g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
+
+ if (id) {
+ return g_strdup_printf("%s/%s", id, mr_name);
+ } else {
+ return g_strdup(mr_name);
+ }
+}
+
size_t qemu_ram_pagesize(RAMBlock *rb)
{
return rb->page_size;
@@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
{
size_t max_length = new_block->max_length;
MemoryRegion *mr = new_block->mr;
- const char *name = memory_region_name(mr);
- int fd;
+ g_autofree char *name = cpr_name(new_block);
+ int fd = cpr_find_fd(name, 0);
+
+ if (fd >= 0) {
+ if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
+ error_setg_errno(errp, errno,
+ "cannot grow ram block %s fd %d to %ld bytes",
+ name, fd, max_length);
+ goto err;
+ }
+ goto have_fd;
+ }
if (qemu_memfd_available()) {
fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
@@ -2111,7 +2135,9 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
return true;
}
}
+ cpr_save_fd(name, 0, fd);
+have_fd:
new_block->mr->align = QEMU_VMALLOC_ALIGN;
new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
@@ -2122,6 +2148,8 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
return true;
}
+err:
+ cpr_delete_fd(name, 0);
close(fd);
return false;
}
@@ -2234,6 +2262,8 @@ static void reclaim_ramblock(RAMBlock *block)
void qemu_ram_free(RAMBlock *block)
{
+ g_autofree char *name = NULL;
+
if (!block) {
return;
}
@@ -2244,6 +2274,8 @@ void qemu_ram_free(RAMBlock *block)
}
qemu_mutex_lock_ramlist();
+ name = cpr_name(block);
+ cpr_delete_fd(name, 0);
QLIST_REMOVE_RCU(block, next);
ram_list.mru_block = NULL;
/* Write list before version */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-02 13:19 ` [PATCH V4 06/19] physmem: preserve ram blocks for cpr Steve Sistare
@ 2024-12-09 20:07 ` Peter Xu
2024-12-12 20:38 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-09 20:07 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> Save the memfd for ramblocks in CPR state, along with a name that
> uniquely identifies it. The block's idstr is not yet set, so it
> cannot be used for this purpose. Find the saved memfd in new QEMU when
> creating a block. If the block size is larger in new QEMU, extend the
> block using fallocate, and the extra space will be useable after a guest
> reset.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
> 1 file changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/system/physmem.c b/system/physmem.c
> index 0bcb2cc..aa095a3 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -70,6 +70,7 @@
>
> #include "qemu/pmem.h"
>
> +#include "migration/cpr.h"
> #include "migration/vmstate.h"
>
> #include "qemu/range.h"
> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> }
> }
>
> +static char *cpr_name(RAMBlock *block)
> +{
> + MemoryRegion *mr = block->mr;
> + const char *mr_name = memory_region_name(mr);
> + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> +
> + if (id) {
> + return g_strdup_printf("%s/%s", id, mr_name);
> + } else {
> + return g_strdup(mr_name);
> + }
> +}
> +
> size_t qemu_ram_pagesize(RAMBlock *rb)
> {
> return rb->page_size;
> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> {
> size_t max_length = new_block->max_length;
> MemoryRegion *mr = new_block->mr;
> - const char *name = memory_region_name(mr);
> - int fd;
> + g_autofree char *name = cpr_name(new_block);
> + int fd = cpr_find_fd(name, 0);
If we use the patch proposed in the reply to patch 2, this should be
able to move into qemu_ram_alloc_anonymous_fd(), IIUC.
> +
> + if (fd >= 0) {
> + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> + error_setg_errno(errp, errno,
> + "cannot grow ram block %s fd %d to %ld bytes",
> + name, fd, max_length);
> + goto err;
> + }
I remember we discussed something similar to this: do we need ftruncate()
at all? I think not.
This happens when booting QEMU, so I don't think the size used on src is
relevant yet, as this is dest.
It only becomes relevant when cpr migration starts on src: it sends the
ramblocks at the beginning, then parse_ramblock() will properly resize any
ramblock to whatever size it should use.
If the resize didn't happen, it can only mean that used_length already
matches on both sides.
So I don't see why a special truncate() call is needed yet..
> + goto have_fd;
> + }
>
> if (qemu_memfd_available()) {
> fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
> @@ -2111,7 +2135,9 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> return true;
> }
> }
> + cpr_save_fd(name, 0, fd);
>
> +have_fd:
> new_block->mr->align = QEMU_VMALLOC_ALIGN;
> new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
>
> @@ -2122,6 +2148,8 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> return true;
> }
>
> +err:
> + cpr_delete_fd(name, 0);
> close(fd);
> return false;
> }
> @@ -2234,6 +2262,8 @@ static void reclaim_ramblock(RAMBlock *block)
>
> void qemu_ram_free(RAMBlock *block)
> {
> + g_autofree char *name = NULL;
> +
> if (!block) {
> return;
> }
> @@ -2244,6 +2274,8 @@ void qemu_ram_free(RAMBlock *block)
> }
>
> qemu_mutex_lock_ramlist();
> + name = cpr_name(block);
> + cpr_delete_fd(name, 0);
> QLIST_REMOVE_RCU(block, next);
> ram_list.mru_block = NULL;
> /* Write list before version */
> --
> 1.8.3.1
>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-09 20:07 ` Peter Xu
@ 2024-12-12 20:38 ` Steven Sistare
2024-12-12 22:48 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-12 20:38 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/9/2024 3:07 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>> Save the memfd for ramblocks in CPR state, along with a name that
>> uniquely identifies it. The block's idstr is not yet set, so it
>> cannot be used for this purpose. Find the saved memfd in new QEMU when
>> creating a block. If the block size is larger in new QEMU, extend the
>> block using fallocate, and the extra space will be useable after a guest
>> reset.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>> 1 file changed, 34 insertions(+), 2 deletions(-)
>>
>> diff --git a/system/physmem.c b/system/physmem.c
>> index 0bcb2cc..aa095a3 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -70,6 +70,7 @@
>>
>> #include "qemu/pmem.h"
>>
>> +#include "migration/cpr.h"
>> #include "migration/vmstate.h"
>>
>> #include "qemu/range.h"
>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>> }
>> }
>>
>> +static char *cpr_name(RAMBlock *block)
>> +{
>> + MemoryRegion *mr = block->mr;
>> + const char *mr_name = memory_region_name(mr);
>> + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>> +
>> + if (id) {
>> + return g_strdup_printf("%s/%s", id, mr_name);
>> + } else {
>> + return g_strdup(mr_name);
>> + }
>> +}
>> +
>> size_t qemu_ram_pagesize(RAMBlock *rb)
>> {
>> return rb->page_size;
>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>> {
>> size_t max_length = new_block->max_length;
>> MemoryRegion *mr = new_block->mr;
>> - const char *name = memory_region_name(mr);
>> - int fd;
>> + g_autofree char *name = cpr_name(new_block);
>> + int fd = cpr_find_fd(name, 0);
>
> If to use the proposed patch in the reply of patch 2, here this should be
> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
>
>> +
>> + if (fd >= 0) {
>> + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>> + error_setg_errno(errp, errno,
>> + "cannot grow ram block %s fd %d to %ld bytes",
>> + name, fd, max_length);
>> + goto err;
>> + }
>
> I remember we discussed something similar to this, do we need ftruncate()
> at all? I think not.
>
> This happens when booting QEMU, so I don't think it's relevant yet to what
> size used in src, as this is dest.
>
> It starts to get relevant only when cpr migration starts on src, it sents
> ramblocks at the beginning, then parse_ramblock() will properly resize any
> ramblock to whatever size it should use.
>
> If the resize didn't happen it can only mean that used_length is correctly
> matched on both sides.
>
> So I don't see why a special truncate() call is needed yet..
You suggested truncate:
https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
"So after such system reset, QEMU might start to see new ROM code loaded
here (not the one that got migrated anymore, which will only match the
version installed on src QEMU). Here the problem is the new firmware can
be larger, so I _think_ we need to make sure max_length is not modified by
CPR to allow resizing happen here, while if we use truncate=true here it
should just work in all cases."
... but you suggested passing a truncate bool to the file_ram_alloc call after
cpr_find_fd. I could do that instead. However, if qemu_ram_alloc_shared uses
qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
then I will still call ftruncate here, because qemu_ram_alloc_from_fd does not
take a truncate argument.
- Steve
>> + goto have_fd;
>> + }
>>
>> if (qemu_memfd_available()) {
>> fd = qemu_memfd_create(name, max_length + mr->align, 0, 0, 0, errp);
>> @@ -2111,7 +2135,9 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>> return true;
>> }
>> }
>> + cpr_save_fd(name, 0, fd);
>>
>> +have_fd:
>> new_block->mr->align = QEMU_VMALLOC_ALIGN;
>> new_block->host = file_ram_alloc(new_block, max_length, fd, false, 0, errp);
>>
>> @@ -2122,6 +2148,8 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>> return true;
>> }
>>
>> +err:
>> + cpr_delete_fd(name, 0);
>> close(fd);
>> return false;
>> }
>> @@ -2234,6 +2262,8 @@ static void reclaim_ramblock(RAMBlock *block)
>>
>> void qemu_ram_free(RAMBlock *block)
>> {
>> + g_autofree char *name = NULL;
>> +
>> if (!block) {
>> return;
>> }
>> @@ -2244,6 +2274,8 @@ void qemu_ram_free(RAMBlock *block)
>> }
>>
>> qemu_mutex_lock_ramlist();
>> + name = cpr_name(block);
>> + cpr_delete_fd(name, 0);
>> QLIST_REMOVE_RCU(block, next);
>> ram_list.mru_block = NULL;
>> /* Write list before version */
>> --
>> 1.8.3.1
>>
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-12 20:38 ` Steven Sistare
@ 2024-12-12 22:48 ` Peter Xu
2024-12-13 15:21 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-12 22:48 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
> On 12/9/2024 3:07 PM, Peter Xu wrote:
> > On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> > > Save the memfd for ramblocks in CPR state, along with a name that
> > > uniquely identifies it. The block's idstr is not yet set, so it
> > > cannot be used for this purpose. Find the saved memfd in new QEMU when
> > > creating a block. If the block size is larger in new QEMU, extend the
> > > block using fallocate, and the extra space will be useable after a guest
> > > reset.
> > >
> > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > ---
> > > system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
> > > 1 file changed, 34 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/system/physmem.c b/system/physmem.c
> > > index 0bcb2cc..aa095a3 100644
> > > --- a/system/physmem.c
> > > +++ b/system/physmem.c
> > > @@ -70,6 +70,7 @@
> > > #include "qemu/pmem.h"
> > > +#include "migration/cpr.h"
> > > #include "migration/vmstate.h"
> > > #include "qemu/range.h"
> > > @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > > }
> > > }
> > > +static char *cpr_name(RAMBlock *block)
> > > +{
> > > + MemoryRegion *mr = block->mr;
> > > + const char *mr_name = memory_region_name(mr);
> > > + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > +
> > > + if (id) {
> > > + return g_strdup_printf("%s/%s", id, mr_name);
> > > + } else {
> > > + return g_strdup(mr_name);
> > > + }
> > > +}
> > > +
> > > size_t qemu_ram_pagesize(RAMBlock *rb)
> > > {
> > > return rb->page_size;
> > > @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> > > {
> > > size_t max_length = new_block->max_length;
> > > MemoryRegion *mr = new_block->mr;
> > > - const char *name = memory_region_name(mr);
> > > - int fd;
> > > + g_autofree char *name = cpr_name(new_block);
> > > + int fd = cpr_find_fd(name, 0);
> >
> > If to use the proposed patch in the reply of patch 2, here this should be
> > able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
> >
> > > +
> > > + if (fd >= 0) {
> > > + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> > > + error_setg_errno(errp, errno,
> > > + "cannot grow ram block %s fd %d to %ld bytes",
> > > + name, fd, max_length);
> > > + goto err;
> > > + }
> >
> > I remember we discussed something similar to this, do we need ftruncate()
> > at all? I think not.
> >
> > This happens when booting QEMU, so I don't think it's relevant yet to what
> > size used in src, as this is dest.
> >
> > It starts to get relevant only when cpr migration starts on src, it sents
> > ramblocks at the beginning, then parse_ramblock() will properly resize any
> > ramblock to whatever size it should use.
> >
> > If the resize didn't happen it can only mean that used_length is correctly
> > matched on both sides.
> >
> > So I don't see why a special truncate() call is needed yet..
>
> You suggested truncate:
>
> https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
>
> "So after such system reset, QEMU might start to see new ROM code loaded
> here (not the one that got migrated anymore, which will only match the
> version installed on src QEMU). Here the problem is the new firmware can
> be larger, so I _think_ we need to make sure max_length is not modified by
> CPR to allow resizing happen here, while if we use truncate=true here it
> should just work in all cases."
>
> ... but you suggested passing a truncate bool to the file_ram_alloc call after
> cpr_find_fd. I could do that instead. However, if qemu_ram_alloc_shared uses
> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
> then I will still call ftruncate here, because qemu_ram_alloc_from_fd does not
> take a truncate argument.
My memory was that, when reusing qemu_ram_alloc_from_fd() as suggested for
patch 2, we would only create a zero-length fd (with fsize=0) and leave all
the rest to qemu_ram_alloc_from_fd(). Then with that:
qemu_ram_alloc_from_fd:
new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
errp);
So that'll always have truncate==!file_size==1. Then truncate will be done
at file_ram_alloc() later, iiuc.
if (truncate && ftruncate(fd, offset + memory)) {
perror("ftruncate");
}
Would this work?
Meanwhile, this whole ram resize discussion reminded me that to reuse
qemu_ram_alloc_from_fd(), we may also want to make sure to pass the
->resized() hook from qemu_ram_alloc_internal() to the fd helper too.. IOW,
we may want qemu_ram_resize() to keep invoking those hooks, even after
switching to fd-based for aux mems.
Maybe the size / max_size also need to be passed over? For fd ramblocks,
used_length and max_length used to always be the same, but not anymore
once we switch aux mem to fd based. Please feel free to double check..
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-12 22:48 ` Peter Xu
@ 2024-12-13 15:21 ` Peter Xu
2024-12-13 15:30 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-13 15:21 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
> > On 12/9/2024 3:07 PM, Peter Xu wrote:
> > > On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
> > > > Save the memfd for ramblocks in CPR state, along with a name that
> > > > uniquely identifies it. The block's idstr is not yet set, so it
> > > > cannot be used for this purpose. Find the saved memfd in new QEMU when
> > > > creating a block. If the block size is larger in new QEMU, extend the
> > > > block using fallocate, and the extra space will be useable after a guest
> > > > reset.
> > > >
> > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > > ---
> > > > system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
> > > > 1 file changed, 34 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/system/physmem.c b/system/physmem.c
> > > > index 0bcb2cc..aa095a3 100644
> > > > --- a/system/physmem.c
> > > > +++ b/system/physmem.c
> > > > @@ -70,6 +70,7 @@
> > > > #include "qemu/pmem.h"
> > > > +#include "migration/cpr.h"
> > > > #include "migration/vmstate.h"
> > > > #include "qemu/range.h"
> > > > @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
> > > > }
> > > > }
> > > > +static char *cpr_name(RAMBlock *block)
> > > > +{
> > > > + MemoryRegion *mr = block->mr;
> > > > + const char *mr_name = memory_region_name(mr);
> > > > + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
> > > > +
> > > > + if (id) {
> > > > + return g_strdup_printf("%s/%s", id, mr_name);
> > > > + } else {
> > > > + return g_strdup(mr_name);
> > > > + }
> > > > +}
> > > > +
> > > > size_t qemu_ram_pagesize(RAMBlock *rb)
> > > > {
> > > > return rb->page_size;
> > > > @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
> > > > {
> > > > size_t max_length = new_block->max_length;
> > > > MemoryRegion *mr = new_block->mr;
> > > > - const char *name = memory_region_name(mr);
> > > > - int fd;
> > > > + g_autofree char *name = cpr_name(new_block);
> > > > + int fd = cpr_find_fd(name, 0);
> > >
> > > If to use the proposed patch in the reply of patch 2, here this should be
> > > able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
> > >
> > > > +
> > > > + if (fd >= 0) {
> > > > + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
> > > > + error_setg_errno(errp, errno,
> > > > + "cannot grow ram block %s fd %d to %ld bytes",
> > > > + name, fd, max_length);
> > > > + goto err;
> > > > + }
> > >
> > > I remember we discussed something similar to this, do we need ftruncate()
> > > at all? I think not.
> > >
> > > This happens when booting QEMU, so I don't think it's relevant yet to what
> > > size used in src, as this is dest.
> > >
> > > It starts to get relevant only when cpr migration starts on src, it sents
> > > ramblocks at the beginning, then parse_ramblock() will properly resize any
> > > ramblock to whatever size it should use.
> > >
> > > If the resize didn't happen it can only mean that used_length is correctly
> > > matched on both sides.
> > >
> > > So I don't see why a special truncate() call is needed yet..
> >
> > You suggested truncate:
> >
> > https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
> >
> > "So after such system reset, QEMU might start to see new ROM code loaded
> > here (not the one that got migrated anymore, which will only match the
> > version installed on src QEMU). Here the problem is the new firmware can
> > be larger, so I _think_ we need to make sure max_length is not modified by
> > CPR to allow resizing happen here, while if we use truncate=true here it
> > should just work in all cases."
> >
> > ... but you suggested passing a truncate bool to the file_ram_alloc call after
> > cpr_find_fd. I could do that instead. However, if qemu_ram_alloc_shared uses
> > qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
> > then I will still call ftruncate here, because qemu_ram_alloc_from_fd does not
> > take a truncate argument.
>
[begin]
> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
> patch 2, it will only create zero-length fd (with fsize=0) and leave all
> the rest to qemu_ram_alloc_from_fd(), then with that:
>
> qemu_ram_alloc_from_fd:
> new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
> errp);
>
> So that'll always have truncate==!file_size==1. Then truncate will be done
> at file_ram_alloc() later, iiuc.
>
> if (truncate && ftruncate(fd, offset + memory)) {
> perror("ftruncate");
> }
>
> Would this work?
[end]
Please feel free to ignore [begin]->[end].. I guess I didn't really answer
it.
Now after I re-read the question.. considering that we have been very
cautious on the fsize here:
file_size = get_file_size(fd);
if (file_size > offset && file_size < (offset + size)) {
error_setg(errp, "backing store size 0x%" PRIx64
" does not match 'size' option 0x" RAM_ADDR_FMT,
file_size, size);
return NULL;
}
I suppose your change makes sense. So please feel free to keep the
truncation change. I wish we could simply auto-enlarge the file size
there instead of failing, but I think I see why we're overly cautious
on this - we still want to provide some safety in case a wrong file path
is passed to QEMU, so that we do not easily corrupt the file when that
happens. So we assume the file must be pre-truncated to say this is the
right ram file.
Though if you wouldn't mind, I'd still request a comment explaining it,
because it probably isn't obvious..
AFAICT it's only relevant to resizable RAM, and to the fact that resizable
RAM can now be present in fd-based ramblocks. Maybe also mention that we
are cautious about changing file sizes for fd-based ramblocks, but that it
is unavoidable here to make resizing work for firmware. Any form of
comment would help.
OTOH, the comments below should still be worth checking.
>
> Meanwhile, this whole ram resize discussion reminded me that to reuse
> qemu_ram_alloc_from_fd(), we may also want to make sure to pass ->resized()
> hook from qemu_ram_alloc_internal() to the fd helper too.. IOW, we may
> want to keep qemu_ram_resize() invoke those hooks, even after switching to
> fd-based for aux mems.
>
> Maybe the size / max_size also need to be passed over? As for fd ramblock
> it used to be always the same on used_length/max_length, but not anymore
> when we switch aux mem to fd based. Please feel free to double check..
>
> --
> Peter Xu
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-13 15:21 ` Peter Xu
@ 2024-12-13 15:30 ` Steven Sistare
2024-12-18 16:34 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-13 15:30 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/13/2024 10:21 AM, Peter Xu wrote:
> On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
>> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
>>> On 12/9/2024 3:07 PM, Peter Xu wrote:
>>>> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>>>>> Save the memfd for ramblocks in CPR state, along with a name that
>>>>> uniquely identifies it. The block's idstr is not yet set, so it
>>>>> cannot be used for this purpose. Find the saved memfd in new QEMU when
>>>>> creating a block. If the block size is larger in new QEMU, extend the
>>>>> block using fallocate, and the extra space will be useable after a guest
>>>>> reset.
>>>>>
>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> ---
>>>>> system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>>>>> 1 file changed, 34 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>> index 0bcb2cc..aa095a3 100644
>>>>> --- a/system/physmem.c
>>>>> +++ b/system/physmem.c
>>>>> @@ -70,6 +70,7 @@
>>>>> #include "qemu/pmem.h"
>>>>> +#include "migration/cpr.h"
>>>>> #include "migration/vmstate.h"
>>>>> #include "qemu/range.h"
>>>>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>> }
>>>>> }
>>>>> +static char *cpr_name(RAMBlock *block)
>>>>> +{
>>>>> + MemoryRegion *mr = block->mr;
>>>>> + const char *mr_name = memory_region_name(mr);
>>>>> + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>> +
>>>>> + if (id) {
>>>>> + return g_strdup_printf("%s/%s", id, mr_name);
>>>>> + } else {
>>>>> + return g_strdup(mr_name);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>> {
>>>>> return rb->page_size;
>>>>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>>>> {
>>>>> size_t max_length = new_block->max_length;
>>>>> MemoryRegion *mr = new_block->mr;
>>>>> - const char *name = memory_region_name(mr);
>>>>> - int fd;
>>>>> + g_autofree char *name = cpr_name(new_block);
>>>>> + int fd = cpr_find_fd(name, 0);
>>>>
>>>> If to use the proposed patch in the reply of patch 2, here this should be
>>>> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
>>>>
>>>>> +
>>>>> + if (fd >= 0) {
>>>>> + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>>>>> + error_setg_errno(errp, errno,
>>>>> + "cannot grow ram block %s fd %d to %ld bytes",
>>>>> + name, fd, max_length);
>>>>> + goto err;
>>>>> + }
>>>>
>>>> I remember we discussed something similar to this, do we need ftruncate()
>>>> at all? I think not.
>>>>
>>>> This happens when booting QEMU, so I don't think it's relevant yet to what
>>>> size used in src, as this is dest.
>>>>
>>>> It starts to get relevant only when cpr migration starts on src, it sents
>>>> ramblocks at the beginning, then parse_ramblock() will properly resize any
>>>> ramblock to whatever size it should use.
>>>>
>>>> If the resize didn't happen it can only mean that used_length is correctly
>>>> matched on both sides.
>>>>
>>>> So I don't see why a special truncate() call is needed yet..
>>>
>>> You suggested truncate:
>>>
>>> https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
>>>
>>> "So after such system reset, QEMU might start to see new ROM code loaded
>>> here (not the one that got migrated anymore, which will only match the
>>> version installed on src QEMU). Here the problem is the new firmware can
>>> be larger, so I _think_ we need to make sure max_length is not modified by
>>> CPR to allow resizing happen here, while if we use truncate=true here it
>>> should just work in all cases."
>>>
>>> ... but you suggested passing a truncate bool to the file_ram_alloc call after
>>> cpr_find_fd. I could do that instead. However, if qemu_ram_alloc_shared uses
>>> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
>>> then I will still call ftruncate here, because qemu_ram_alloc_from_fd does not
>>> take a truncate argument.
>>
>
> [begin]
>
>> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
>> patch 2, it will only create zero-length fd (with fsize=0) and leave all
>> the rest to qemu_ram_alloc_from_fd(), then with that:
>>
>> qemu_ram_alloc_from_fd:
>> new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>> errp);
>>
>> So that'll always have truncate==!file_size==1. Then truncate will be done
>> at file_ram_alloc() later, iiuc.
>>
>> if (truncate && ftruncate(fd, offset + memory)) {
>> perror("ftruncate");
>> }
>>
>> Would this work?
>
> [end]
>
> Please feel free to ignore [begin]->[end].. I guess I didn't really answer
> it.
>
> Now after I re-read the question.. considering that we have been very
> cautious on the fsize here:
>
> file_size = get_file_size(fd);
> if (file_size > offset && file_size < (offset + size)) {
> error_setg(errp, "backing store size 0x%" PRIx64
> " does not match 'size' option 0x" RAM_ADDR_FMT,
> file_size, size);
> return NULL;
> }
>
> I suppose your change makes sense. So please feel free to keep the
> truncation change. I wished we could already auto-enlarge the file size
> there already instead of failing, but I think I see why we're over cautious
> on this - we want to still provide some safety in case some wrong file path
> passed over to QEMU, to not easily corrupt the file when that happens. So
> we assume the file must be pre-truncated to say this is the right ram file.
>
> Though if you wouldn't mind, I'd still request a comment explaining it,
> because it probably isn't obvious..
>
> AFAICT it's only relevant to resizable RAM and also the fact that it'll be
> able to present now in fd-based ramblocks. Maybe also mention the fact of
> our cautious on changing file sizes on fd-based, but not avoidable to do it
> here to make resizable work for firmwares. Any form of comment would help.
Perhaps more to the point, for the incoming cpr memfd, the file size is not 0, so
the logic in qemu_ram_alloc_from_fd does not right-size it. Calling ftruncate
prior does the right thing.
I will add comments.
> OTOH, below comments should still worth checking.
>>
>> Meanwhile, this whole ram resize discussion reminded me that to reuse
>> qemu_ram_alloc_from_fd(), we may also want to make sure to pass ->resized()
>> hook from qemu_ram_alloc_internal() to the fd helper too.. IOW, we may
>> want to keep qemu_ram_resize() invoke those hooks, even after switching to
>> fd-based for aux mems.
>>
>> Maybe the size / max_size also need to be passed over? As for fd ramblock
>> it used to be always the same on used_length/max_length, but not anymore
>> when we switch aux mem to fd based. Please feel free to double check..
Yes, I had already added the resize and size parameters to the helper function
prior to this email.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-13 15:30 ` Steven Sistare
@ 2024-12-18 16:34 ` Steven Sistare
2024-12-18 17:00 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 16:34 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/13/2024 10:30 AM, Steven Sistare wrote:
> On 12/13/2024 10:21 AM, Peter Xu wrote:
>> On Thu, Dec 12, 2024 at 05:48:03PM -0500, Peter Xu wrote:
>>> On Thu, Dec 12, 2024 at 03:38:14PM -0500, Steven Sistare wrote:
>>>> On 12/9/2024 3:07 PM, Peter Xu wrote:
>>>>> On Mon, Dec 02, 2024 at 05:19:58AM -0800, Steve Sistare wrote:
>>>>>> Save the memfd for ramblocks in CPR state, along with a name that
>>>>>> uniquely identifies it. The block's idstr is not yet set, so it
>>>>>> cannot be used for this purpose. Find the saved memfd in new QEMU when
>>>>>> creating a block. If the block size is larger in new QEMU, extend the
>>>>>> block using fallocate, and the extra space will be useable after a guest
>>>>>> reset.
>>>>>>
>>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>>> ---
>>>>>> system/physmem.c | 36 ++++++++++++++++++++++++++++++++++--
>>>>>> 1 file changed, 34 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/system/physmem.c b/system/physmem.c
>>>>>> index 0bcb2cc..aa095a3 100644
>>>>>> --- a/system/physmem.c
>>>>>> +++ b/system/physmem.c
>>>>>> @@ -70,6 +70,7 @@
>>>>>> #include "qemu/pmem.h"
>>>>>> +#include "migration/cpr.h"
>>>>>> #include "migration/vmstate.h"
>>>>>> #include "qemu/range.h"
>>>>>> @@ -1661,6 +1662,19 @@ void qemu_ram_unset_idstr(RAMBlock *block)
>>>>>> }
>>>>>> }
>>>>>> +static char *cpr_name(RAMBlock *block)
>>>>>> +{
>>>>>> + MemoryRegion *mr = block->mr;
>>>>>> + const char *mr_name = memory_region_name(mr);
>>>>>> + g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
>>>>>> +
>>>>>> + if (id) {
>>>>>> + return g_strdup_printf("%s/%s", id, mr_name);
>>>>>> + } else {
>>>>>> + return g_strdup(mr_name);
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> size_t qemu_ram_pagesize(RAMBlock *rb)
>>>>>> {
>>>>>> return rb->page_size;
>>>>>> @@ -2080,8 +2094,18 @@ static bool qemu_ram_alloc_shared(RAMBlock *new_block, Error **errp)
>>>>>> {
>>>>>> size_t max_length = new_block->max_length;
>>>>>> MemoryRegion *mr = new_block->mr;
>>>>>> - const char *name = memory_region_name(mr);
>>>>>> - int fd;
>>>>>> + g_autofree char *name = cpr_name(new_block);
>>>>>> + int fd = cpr_find_fd(name, 0);
>>>>>
>>>>> If to use the proposed patch in the reply of patch 2, here this should be
>>>>> able to be moved to qemu_ram_alloc_anonymous_fd(), IIUC.
>>>>>
>>>>>> +
>>>>>> + if (fd >= 0) {
>>>>>> + if (lseek(fd, 0, SEEK_END) < max_length && ftruncate(fd, max_length)) {
>>>>>> + error_setg_errno(errp, errno,
>>>>>> + "cannot grow ram block %s fd %d to %ld bytes",
>>>>>> + name, fd, max_length);
>>>>>> + goto err;
>>>>>> + }
>>>>>
>>>>> I remember we discussed something similar to this, do we need ftruncate()
>>>>> at all? I think not.
>>>>>
>>>>> This happens when booting QEMU, so I don't think it's relevant yet to what
>>>>> size used in src, as this is dest.
>>>>>
>>>>> It starts to get relevant only when cpr migration starts on src, it sents
>>>>> ramblocks at the beginning, then parse_ramblock() will properly resize any
>>>>> ramblock to whatever size it should use.
>>>>>
>>>>> If the resize didn't happen it can only mean that used_length is correctly
>>>>> matched on both sides.
>>>>>
>>>>> So I don't see why a special truncate() call is needed yet..
>>>>
>>>> You suggested truncate:
>>>>
>>>> https://lore.kernel.org/qemu-devel/47d6d984-7002-4086-bb10-b191168f141f@oracle.com/
>>>>
>>>> "So after such system reset, QEMU might start to see new ROM code loaded
>>>> here (not the one that got migrated anymore, which will only match the
>>>> version installed on src QEMU). Here the problem is the new firmware can
>>>> be larger, so I _think_ we need to make sure max_length is not modified by
>>>> CPR to allow resizing happen here, while if we use truncate=true here it
>>>> should just work in all cases."
>>>>
>>>> ... but you suggested passing a truncate bool to the file_ram_alloc call after
>>>> cpr_find_fd. I could do that instead. However, if qemu_ram_alloc_shared uses
>>>> qemu_ram_alloc_from_fd instead of file_ram_alloc, per your suggestion in patch 2,
>>>> then I will still call ftruncate here, because qemu_ram_alloc_from_fd does not
>>>> take a truncate argument.
>>>
>>
>> [begin]
>>
>>> My memory was when reuse qemu_ram_alloc_from_fd() in that suggestion of
>>> patch 2, it will only create zero-length fd (with fsize=0) and leave all
>>> the rest to qemu_ram_alloc_from_fd(), then with that:
>>>
>>> qemu_ram_alloc_from_fd:
>>> new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
>>> errp);
>>>
>>> So that'll always have truncate==!file_size==1. Then truncate will be done
>>> at file_ram_alloc() later, iiuc.
>>>
>>> if (truncate && ftruncate(fd, offset + memory)) {
>>> perror("ftruncate");
>>> }
>>>
>>> Would this work?
>>
>> [end]
>>
>> Please feel free to ignore [begin]->[end].. I guess I didn't really answer
>> it.
>>
>> Now after I re-read the question.. considering that we have been very
>> cautious on the fsize here:
>>
>> file_size = get_file_size(fd);
>> if (file_size > offset && file_size < (offset + size)) {
>> error_setg(errp, "backing store size 0x%" PRIx64
>> " does not match 'size' option 0x" RAM_ADDR_FMT,
>> file_size, size);
>> return NULL;
>> }
>>
>> I suppose your change makes sense. So please feel free to keep the
>> truncation change. I wished we could already auto-enlarge the file size
>> there already instead of failing, but I think I see why we're over cautious
>> on this - we want to still provide some safety in case some wrong file path
>> passed over to QEMU, to not easily corrupt the file when that happens. So
>> we assume the file must be pre-truncated to say this is the right ram file.
>>
>> Though if you wouldn't mind, I'd still request a comment explaining it,
>> because it probably isn't obvious..
>>
>> AFAICT it's only relevant to resizable RAM and also the fact that it'll be
>> able to present now in fd-based ramblocks. Maybe also mention the fact of
>> our cautious on changing file sizes on fd-based, but not avoidable to do it
>> here to make resizable work for firmwares. Any form of comment would help.
>
> Perhaps more to the point, for the incoming cpr memfd, the file size is not 0, so
> the logic in qemu_ram_alloc_from_fd does not right-size it. Calling ftruncate
> prior does the right thing.
>
> I will add comments.
After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
to grow the file while preserving error checking for the general case, and
delete the explicit ftruncate in its caller:
/*
* Allow file_ram_alloc to grow the file during CPR, if a resizable
* memory region wants a larger block than the incoming current size.
*/
file_size = get_file_size(fd);
if (file_size && file_size < offset + max_size && size == max_size &&
migrate_mode() != MIG_MODE_CPR_TRANSFER) {
error_setg(errp, "backing store size 0x%" PRIx64
" does not match 'size' option 0x" RAM_ADDR_FMT,
file_size, max_size);
return NULL;
}
...
new_block->host = file_ram_alloc(new_block, max_size, fd,
file_size < offset + max_size,
offset, errp);
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-18 16:34 ` Steven Sistare
@ 2024-12-18 17:00 ` Peter Xu
2024-12-18 20:22 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-18 17:00 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
> After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
> to grow the file while preserving error checking for the general case, and
> delete the explicit ftruncate in its caller:
>
> /*
> * Allow file_ram_alloc to grow the file during CPR, if a resizable
> * memory region wants a larger block than the incoming current size.
> */
> file_size = get_file_size(fd);
> if (file_size && file_size < offset + max_size && size == max_size &&
> migrate_mode() != MIG_MODE_CPR_TRANSFER) {
Firstly, this check is growing too long; it may already be worthwhile to
have a helper.
file_size_check():
// COMMENTS...
if (migrate_mode() == XXX) {
return true;
}
That said, I think it's better to also add a flag that enforces the
truncation only if cpr found an fd. E.g. we may want to keep the old
behavior if the user sets the migrate mode to CPR (even without a
migration happening at all) and then creates an fd ramblock.
> error_setg(errp, "backing store size 0x%" PRIx64
> " does not match 'size' option 0x" RAM_ADDR_FMT,
> file_size, max_size);
> return NULL;
> }
> ...
> new_block->host = file_ram_alloc(new_block, max_size, fd,
> file_size < offset + max_size,
> offset, errp);
>
> - Steve
>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-18 17:00 ` Peter Xu
@ 2024-12-18 20:22 ` Steven Sistare
2024-12-18 20:33 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 20:22 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/18/2024 12:00 PM, Peter Xu wrote:
> On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
>> After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
>> to grow the file while preserving error checking for the general case, and
>> delete the explicit ftruncate in its caller:
>>
>> /*
>> * Allow file_ram_alloc to grow the file during CPR, if a resizable
>> * memory region wants a larger block than the incoming current size.
>> */
>> file_size = get_file_size(fd);
>> if (file_size && file_size < offset + max_size && size == max_size &&
>> migrate_mode() != MIG_MODE_CPR_TRANSFER) {
>>[...]
>
> Firstly, this check is growing too long, maybe worthwhile to have a helper
> already.
>
> file_size_check():
> // COMMENTS...
> if (migrate_mode() == XXX) {
> return true;
> }
>
> Said that, I think it's better we also add the flag to enforce the
> truncation, only if cpr found a fd. E.g. we may want to keep the old
> behavior even if the user sets migrate mode to CPR (even without a
> migration happening at all), then create a fd ramblock.
That was my intent. Normally mode becomes TRANSFER only when outgoing migration
is about to start, or is incoming, but conceivably the source qemu user could
set mode early, before creating some object requiring aux memory.
I can add a grow parameter to qemu_ram_alloc_from_fd and pass true only
if the fd came from cpr_find_fd. Sound OK?
RAMBlock *qemu_ram_alloc_from_fd(..., bool grow)
if (file_size && file_size < offset + max_size && !grow) {
error_setg(...
...
new_block->host = file_ram_alloc(new_block, max_size, fd,
file_size < offset + max_size,
offset, errp);
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 06/19] physmem: preserve ram blocks for cpr
2024-12-18 20:22 ` Steven Sistare
@ 2024-12-18 20:33 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-18 20:33 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Wed, Dec 18, 2024 at 03:22:50PM -0500, Steven Sistare wrote:
> On 12/18/2024 12:00 PM, Peter Xu wrote:
> > On Wed, Dec 18, 2024 at 11:34:34AM -0500, Steven Sistare wrote:
> > > After adding resizable support to qemu_ram_alloc_from_fd, I can also tweak it
> > > to grow the file while preserving error checking for the general case, and
> > > delete the explicit ftruncate in its caller:
> > >
> > > /*
> > > * Allow file_ram_alloc to grow the file during CPR, if a resizable
> > > * memory region wants a larger block than the incoming current size.
> > > */
> > > file_size = get_file_size(fd);
> > > if (file_size && file_size < offset + max_size && size == max_size &&
> > > migrate_mode() != MIG_MODE_CPR_TRANSFER) {
> > > [...]
> >
> > Firstly, this check is growing too long, maybe worthwhile to have a helper
> > already.
> >
> > file_size_check():
> > // COMMENTS...
> > if (migrate_mode() == XXX) {
> > return true;
> > }
> >
> > Said that, I think it's better we also add the flag to enforce the
> > truncation, only if cpr found a fd. E.g. we may want to keep the old
> > behavior even if the user sets migrate mode to CPR (even without a
> > migration happening at all), then create a fd ramblock.
>
> That was my intent. Normally mode becomes TRANSFER only when outgoing migration
> is about to start, or is incoming, but conceivably the source qemu user could
> set mode early, before creating some object requiring aux memory.
Such ordering may not be wanted, and can be too trivial.
We discussed this with Dan before, and we wished that all migration
caps/params/modes would only be set in the QMP "migrate" command, rather
than being set separately. Actually we may start supporting that in the near
future, taking all migration setup from the QMP command 'migrate'. Then none
of the migration caps/params/modes will be global anymore; they will only be
taken from the QMP command. From that POV, better not to rely on the mode
being set globally.
>
> I can add a grow parameter to qemu_ram_alloc_from_fd and pass true only
> if the fd came from cpr_find_fd. Sound OK?
>
> RAMBlock *qemu_ram_alloc_from_fd(..., bool grow)
> if (file_size && file_size < offset + max_size && !grow) {
> error_setg(...
> ...
> new_block->host = file_ram_alloc(new_block, max_size, fd,
> file_size < offset + max_size,
> offset, errp);
Sounds good.
Thanks,
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 07/19] hostmem-memfd: preserve for cpr
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (5 preceding siblings ...)
2024-12-02 13:19 ` [PATCH V4 06/19] physmem: preserve ram blocks for cpr Steve Sistare
@ 2024-12-02 13:19 ` Steve Sistare
2024-12-18 19:53 ` Steven Sistare
2024-12-02 13:20 ` [PATCH V4 08/19] hostmem-shm: " Steve Sistare
` (11 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:19 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Preserve memory-backend-memfd memory objects during cpr-transfer.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Acked-by: Peter Xu <peterx@redhat.com>
---
backends/hostmem-memfd.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index 1bcae4b..497d63b 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -17,6 +17,7 @@
#include "qemu/module.h"
#include "qapi/error.h"
#include "qom/object.h"
+#include "migration/cpr.h"
OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendMemfd, MEMORY_BACKEND_MEMFD)
@@ -33,15 +34,19 @@ static bool
memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);
- g_autofree char *name = NULL;
+ g_autofree char *name = host_memory_backend_get_name(backend);
+ int fd = cpr_find_fd(name, 0);
uint32_t ram_flags;
- int fd;
if (!backend->size) {
error_setg(errp, "can't create backend with size 0");
return false;
}
+ if (fd >= 0) {
+ goto have_fd;
+ }
+
fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
m->hugetlb, m->hugetlbsize, m->seal ?
F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
@@ -49,9 +54,10 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
if (fd == -1) {
return false;
}
+ cpr_save_fd(name, 0, fd);
+have_fd:
backend->aligned = true;
- name = host_memory_backend_get_name(backend);
ram_flags = backend->share ? RAM_SHARED : RAM_PRIVATE;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 07/19] hostmem-memfd: preserve for cpr
2024-12-02 13:19 ` [PATCH V4 07/19] hostmem-memfd: preserve " Steve Sistare
@ 2024-12-18 19:53 ` Steven Sistare
2024-12-18 20:23 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 19:53 UTC (permalink / raw)
To: Peter Xu
Cc: Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, qemu-devel
Hi Peter, can we upgrade your Acked-by to an RB? You gave RB to the
similar patch "hostmem-shm: preserve for cpr"
- Steve
On 12/2/2024 8:19 AM, Steve Sistare wrote:
> Preserve memory-backend-memfd memory objects during cpr-transfer.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> Acked-by: Peter Xu <peterx@redhat.com>
> ---
> backends/hostmem-memfd.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
> index 1bcae4b..497d63b 100644
> --- a/backends/hostmem-memfd.c
> +++ b/backends/hostmem-memfd.c
> @@ -17,6 +17,7 @@
> #include "qemu/module.h"
> #include "qapi/error.h"
> #include "qom/object.h"
> +#include "migration/cpr.h"
>
> OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendMemfd, MEMORY_BACKEND_MEMFD)
>
> @@ -33,15 +34,19 @@ static bool
> memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
> {
> HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);
> - g_autofree char *name = NULL;
> + g_autofree char *name = host_memory_backend_get_name(backend);
> + int fd = cpr_find_fd(name, 0);
> uint32_t ram_flags;
> - int fd;
>
> if (!backend->size) {
> error_setg(errp, "can't create backend with size 0");
> return false;
> }
>
> + if (fd >= 0) {
> + goto have_fd;
> + }
> +
> fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
> m->hugetlb, m->hugetlbsize, m->seal ?
> F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
> @@ -49,9 +54,10 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
> if (fd == -1) {
> return false;
> }
> + cpr_save_fd(name, 0, fd);
>
> +have_fd:
> backend->aligned = true;
> - name = host_memory_backend_get_name(backend);
> ram_flags = backend->share ? RAM_SHARED : RAM_PRIVATE;
> ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
> ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 07/19] hostmem-memfd: preserve for cpr
2024-12-18 19:53 ` Steven Sistare
@ 2024-12-18 20:23 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-18 20:23 UTC (permalink / raw)
To: Steven Sistare
Cc: Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, qemu-devel
On Wed, Dec 18, 2024 at 02:53:03PM -0500, Steven Sistare wrote:
> Hi Peter, can we upgrade your Acked-by to an RB? You gave RB to the
> similar patch "hostmem-shm: preserve for cpr"
In this context I don't think there's a major difference, so sure.
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 08/19] hostmem-shm: preserve for cpr
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (6 preceding siblings ...)
2024-12-02 13:19 ` [PATCH V4 07/19] hostmem-memfd: preserve " Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-12 17:38 ` Peter Xu
2024-12-02 13:20 ` [PATCH V4 09/19] migration: incoming channel Steve Sistare
` (10 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Preserve memory-backend-shm memory objects during cpr-transfer.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
backends/hostmem-shm.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c
index 837b9f1..0946f2a 100644
--- a/backends/hostmem-shm.c
+++ b/backends/hostmem-shm.c
@@ -13,6 +13,7 @@
#include "qemu/osdep.h"
#include "sysemu/hostmem.h"
#include "qapi/error.h"
+#include "migration/cpr.h"
#define TYPE_MEMORY_BACKEND_SHM "memory-backend-shm"
@@ -25,9 +26,9 @@ struct HostMemoryBackendShm {
static bool
shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
- g_autofree char *backend_name = NULL;
+ g_autofree char *backend_name = host_memory_backend_get_name(backend);
uint32_t ram_flags;
- int fd;
+ int fd = cpr_find_fd(backend_name, 0);
if (!backend->size) {
error_setg(errp, "can't create shm backend with size 0");
@@ -39,13 +40,18 @@ shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
return false;
}
+ if (fd >= 0) {
+ goto have_fd;
+ }
+
fd = qemu_shm_alloc(backend->size, errp);
if (fd < 0) {
return false;
}
+ cpr_save_fd(backend_name, 0, fd);
+have_fd:
/* Let's do the same as memory-backend-ram,share=on would do. */
- backend_name = host_memory_backend_get_name(backend);
ram_flags = RAM_SHARED;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 08/19] hostmem-shm: preserve for cpr
2024-12-02 13:20 ` [PATCH V4 08/19] hostmem-shm: " Steve Sistare
@ 2024-12-12 17:38 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-12 17:38 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:20:00AM -0800, Steve Sistare wrote:
> Preserve memory-backend-shm memory objects during cpr-transfer.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 09/19] migration: incoming channel
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (7 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 08/19] hostmem-shm: " Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-05 15:23 ` Markus Armbruster
2024-12-02 13:20 ` [PATCH V4 10/19] migration: cpr channel Steve Sistare
` (9 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Extend the -incoming option to allow an @MigrationChannel to be specified.
This allows channels other than 'main' to be described on the command
line, which will be needed for CPR.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
include/migration/misc.h | 2 ++
migration/migration.c | 18 ++++++++++++++----
migration/migration.h | 2 --
qemu-options.hx | 17 +++++++++++++++++
system/vl.c | 35 +++++++++++++++++++++++++++++++++--
5 files changed, 66 insertions(+), 8 deletions(-)
diff --git a/include/migration/misc.h b/include/migration/misc.h
index 804eb23..259d4aa 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -106,4 +106,6 @@ bool migration_incoming_postcopy_advised(void);
/* True if background snapshot is active */
bool migration_in_bg_snapshot(void);
+bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
+ Error **errp);
#endif
diff --git a/migration/migration.c b/migration/migration.c
index 83dabc7..a5cf148 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2046,6 +2046,7 @@ void qmp_migrate(const char *uri, bool has_channels,
MigrationState *s = migrate_get_current();
g_autoptr(MigrationChannel) channel = NULL;
MigrationAddress *addr = NULL;
+ MigrationChannel *channelv[MIGRATION_CHANNEL_TYPE__MAX] = { NULL };
/*
* Having preliminary checks for uri and channel
@@ -2056,12 +2057,21 @@ void qmp_migrate(const char *uri, bool has_channels,
}
if (channels) {
- /* To verify that Migrate channel list has only item */
- if (channels->next) {
- error_setg(errp, "Channel list has more than one entries");
+ for ( ; channels; channels = channels->next) {
+ MigrationChannelType type = channels->value->channel_type;
+
+ if (channelv[type]) {
+ error_setg(errp, "Channel list has more than one %s entry",
+ MigrationChannelType_str(type));
+ return;
+ }
+ channelv[type] = channels->value;
+ }
+ addr = channelv[MIGRATION_CHANNEL_TYPE_MAIN]->addr;
+ if (!addr) {
+ error_setg(errp, "Channel list has no main entry");
return;
}
- addr = channels->value->addr;
}
if (uri) {
diff --git a/migration/migration.h b/migration/migration.h
index 0956e92..5cd0f29 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -522,8 +522,6 @@ bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm,
Error **errp);
void migrate_add_address(SocketAddress *address);
-bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
- Error **errp);
int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
#define qemu_ram_foreach_block \
diff --git a/qemu-options.hx b/qemu-options.hx
index 02b9118..fab50ce 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4937,10 +4937,17 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \
"-incoming exec:cmdline\n" \
" accept incoming migration on given file descriptor\n" \
" or from given external command\n" \
+ "-incoming @MigrationChannel\n" \
+ " accept incoming migration on the channel\n" \
"-incoming defer\n" \
" wait for the URI to be specified via migrate_incoming\n",
QEMU_ARCH_ALL)
SRST
+The -incoming option specifies the migration channel for an incoming
+migration. It may be used multiple times to specify multiple
+migration channel types. The channel type is specified in @MigrationChannel,
+and is 'main' for all other forms of -incoming.
+
``-incoming tcp:[host]:port[,to=maxport][,ipv4=on|off][,ipv6=on|off]``
\
``-incoming rdma:host:port[,ipv4=on|off][,ipv6=on|off]``
@@ -4960,6 +4967,16 @@ SRST
Accept incoming migration as an output from specified external
command.
+``-incoming @MigrationChannel``
+ Accept incoming migration on the channel. See the QAPI documentation
+ for the syntax of the @MigrationChannel data element. For example:
+ ::
+
+ -incoming '{"channel-type": "main",
+ "addr": { "transport": "socket",
+ "type": "unix",
+ "path": "my.sock" }}'
+
``-incoming defer``
Wait for the URI to be specified via migrate\_incoming. The monitor
can be used to change settings (such as migration parameters) prior
diff --git a/system/vl.c b/system/vl.c
index 4151a79..2c24c60 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -123,6 +123,7 @@
#include "qapi/qapi-visit-block-core.h"
#include "qapi/qapi-visit-compat.h"
#include "qapi/qapi-visit-machine.h"
+#include "qapi/qapi-visit-migration.h"
#include "qapi/qapi-visit-ui.h"
#include "qapi/qapi-commands-block-core.h"
#include "qapi/qapi-commands-migration.h"
@@ -159,6 +160,7 @@ typedef struct DeviceOption {
static const char *cpu_option;
static const char *mem_path;
static const char *incoming;
+static MigrationChannelList *incoming_channels;
static const char *loadvm;
static const char *accelerators;
static bool have_custom_ram_size;
@@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
QTAILQ_INSERT_TAIL(&object_opts, opt, next);
}
+static void incoming_option_parse(const char *str)
+{
+ MigrationChannel *channel;
+
+ if (str[0] == '{') {
+ QObject *obj = qobject_from_json(str, &error_fatal);
+ Visitor *v = qobject_input_visitor_new(obj);
+
+ qobject_unref(obj);
+ visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
+ visit_free(v);
+ } else if (!strcmp(str, "defer")) {
+ channel = NULL;
+ } else {
+ migrate_uri_parse(str, &channel, &error_fatal);
+ }
+
+ /* New incoming spec replaces the previous */
+
+ if (incoming_channels) {
+ qapi_free_MigrationChannelList(incoming_channels);
+ }
+ if (channel) {
+ incoming_channels = g_new0(MigrationChannelList, 1);
+ incoming_channels->value = channel;
+ }
+ incoming = str;
+}
+
static void object_option_parse(const char *str)
{
QemuOpts *opts;
@@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
if (incoming) {
Error *local_err = NULL;
if (strcmp(incoming, "defer") != 0) {
- qmp_migrate_incoming(incoming, false, NULL, true, true,
+ qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
&local_err);
if (local_err) {
error_reportf_err(local_err, "-incoming %s: ", incoming);
@@ -3477,7 +3508,7 @@ void qemu_init(int argc, char **argv)
if (!incoming) {
runstate_set(RUN_STATE_INMIGRATE);
}
- incoming = optarg;
+ incoming_option_parse(optarg);
break;
case QEMU_OPTION_only_migratable:
only_migratable = 1;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-02 13:20 ` [PATCH V4 09/19] migration: incoming channel Steve Sistare
@ 2024-12-05 15:23 ` Markus Armbruster
2024-12-05 20:45 ` Steven Sistare
2024-12-10 12:46 ` Markus Armbruster
0 siblings, 2 replies; 78+ messages in thread
From: Markus Armbruster @ 2024-12-05 15:23 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steve Sistare <steven.sistare@oracle.com> writes:
> Extend the -incoming option to allow an @MigrationChannel to be specified.
> This allows channels other than 'main' to be described on the command
> line, which will be needed for CPR.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 02b9118..fab50ce 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -4937,10 +4937,17 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \
> "-incoming exec:cmdline\n" \
> " accept incoming migration on given file descriptor\n" \
> " or from given external command\n" \
> + "-incoming @MigrationChannel\n" \
> + " accept incoming migration on the channel\n" \
> "-incoming defer\n" \
> " wait for the URI to be specified via migrate_incoming\n",
> QEMU_ARCH_ALL)
> SRST
> +The -incoming option specifies the migration channel for an incoming
> +migration. It may be used multiple times to specify multiple
> +migration channel types.
Really? If I understand the code below correctly, the last -incoming
wins, and any previous ones are silently ignored.
> The channel type is specified in @MigrationChannel,
> +and is 'main' for all other forms of -incoming.
> +
> ``-incoming tcp:[host]:port[,to=maxport][,ipv4=on|off][,ipv6=on|off]``
> \
> ``-incoming rdma:host:port[,ipv4=on|off][,ipv6=on|off]``
> @@ -4960,6 +4967,16 @@ SRST
> Accept incoming migration as an output from specified external
> command.
>
> +``-incoming @MigrationChannel``
> + Accept incoming migration on the channel. See the QAPI documentation
> + for the syntax of the @MigrationChannel data element. For example:
> + ::
I get what you're trying to express, but there's no precedent for
referring to QAPI types like @TypeName in option documentation. But
let's ignore this until after we nailed down the actual interface, on
which I have questions below.
> +
> + -incoming '{"channel-type": "main",
> + "addr": { "transport": "socket",
> + "type": "unix",
> + "path": "my.sock" }}'
> +
> ``-incoming defer``
> Wait for the URI to be specified via migrate\_incoming. The monitor
> can be used to change settings (such as migration parameters) prior
> diff --git a/system/vl.c b/system/vl.c
> index 4151a79..2c24c60 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -123,6 +123,7 @@
> #include "qapi/qapi-visit-block-core.h"
> #include "qapi/qapi-visit-compat.h"
> #include "qapi/qapi-visit-machine.h"
> +#include "qapi/qapi-visit-migration.h"
> #include "qapi/qapi-visit-ui.h"
> #include "qapi/qapi-commands-block-core.h"
> #include "qapi/qapi-commands-migration.h"
> @@ -159,6 +160,7 @@ typedef struct DeviceOption {
> static const char *cpu_option;
> static const char *mem_path;
> static const char *incoming;
> +static MigrationChannelList *incoming_channels;
> static const char *loadvm;
> static const char *accelerators;
> static bool have_custom_ram_size;
> @@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
> QTAILQ_INSERT_TAIL(&object_opts, opt, next);
> }
>
> +static void incoming_option_parse(const char *str)
> +{
> + MigrationChannel *channel;
> +
> + if (str[0] == '{') {
> + QObject *obj = qobject_from_json(str, &error_fatal);
> + Visitor *v = qobject_input_visitor_new(obj);
> +
> + qobject_unref(obj);
> + visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
> + visit_free(v);
> + } else if (!strcmp(str, "defer")) {
> + channel = NULL;
> + } else {
> + migrate_uri_parse(str, &channel, &error_fatal);
> + }
> +
> + /* New incoming spec replaces the previous */
> +
> + if (incoming_channels) {
> + qapi_free_MigrationChannelList(incoming_channels);
> + }
> + if (channel) {
> + incoming_channels = g_new0(MigrationChannelList, 1);
> + incoming_channels->value = channel;
> + }
> + incoming = str;
> +}
@incoming is set to @optarg.
@incoming_channels is set to a MigrationChannelList of exactly one
element, parsed from @incoming. Except when @incoming is "defer", then
@incoming_channels is set to null.
@incoming is only ever used as a flag. Turn it into a bool?
Oh, wait... see my comment on the next hunk.
Option -incoming resembles QMP command migrate-incoming. Differences:
* migrate-incoming keeps legacy URI and modern argument separate: there
are two named arguments, and exactly one of them must be passed.
-incoming overloads them: if @optarg starts with '{', it's modern,
else legacy URI.
Because of that, -incoming *only* supports JSON syntax for modern, not
dotted keys. Other JSON-capable arguments support both.
How can a management application detect that -incoming supports
modern?
Sure overloading -incoming this way is a good idea?
* migrate-incoming takes a list of channels, currently restricted to a
single channel. -incoming takes a channel. If we lift the
restriction, -incoming syntax will become even messier: we'll have to
additionally overload list of channel.
Should -incoming take a list from the start, like migrate-incoming
does?
> +
> static void object_option_parse(const char *str)
> {
> QemuOpts *opts;
> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
> if (incoming) {
> Error *local_err = NULL;
> if (strcmp(incoming, "defer") != 0) {
> - qmp_migrate_incoming(incoming, false, NULL, true, true,
> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
> &local_err);
You move the parsing of legacy URI from within qmp_migrate_incoming()
into incoming_option_parse().
The alternative is not to parse it in incoming_option_parse(), but pass
it to qmp_migrate_incoming() like this:
qmp_migrate_incoming(incoming, !incoming, incoming_channels,
true, true, &local_err);
> if (local_err) {
> error_reportf_err(local_err, "-incoming %s: ", incoming);
> @@ -3477,7 +3508,7 @@ void qemu_init(int argc, char **argv)
> if (!incoming) {
> runstate_set(RUN_STATE_INMIGRATE);
> }
> - incoming = optarg;
> + incoming_option_parse(optarg);
> break;
> case QEMU_OPTION_only_migratable:
> only_migratable = 1;
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-05 15:23 ` Markus Armbruster
@ 2024-12-05 20:45 ` Steven Sistare
2024-12-09 12:12 ` Markus Armbruster
2024-12-10 12:46 ` Markus Armbruster
1 sibling, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-05 20:45 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/5/2024 10:23 AM, Markus Armbruster wrote:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Extend the -incoming option to allow an @MigrationChannel to be specified.
>> This allows channels other than 'main' to be described on the command
>> line, which will be needed for CPR.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> [...]
>
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 02b9118..fab50ce 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -4937,10 +4937,17 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \
>> "-incoming exec:cmdline\n" \
>> " accept incoming migration on given file descriptor\n" \
>> " or from given external command\n" \
>> + "-incoming @MigrationChannel\n" \
>> + " accept incoming migration on the channel\n" \
>> "-incoming defer\n" \
>> " wait for the URI to be specified via migrate_incoming\n",
>> QEMU_ARCH_ALL)
>> SRST
>> +The -incoming option specifies the migration channel for an incoming
>> +migration. It may be used multiple times to specify multiple
>> +migration channel types.
>
> Really? If I understand the code below correctly, the last -incoming
> wins, and any previous ones are silently ignored.
See patch "cpr-channel", where the cpr channel is saved separately.
Last wins, per channel type.
I did this to preserve the current behavior of -incoming in which last wins.
qemu_start_incoming_migration will need modification if more types are added.
>> The channel type is specified in @MigrationChannel,
>> +and is 'main' for all other forms of -incoming.
>> +
>> ``-incoming tcp:[host]:port[,to=maxport][,ipv4=on|off][,ipv6=on|off]``
>> \
>> ``-incoming rdma:host:port[,ipv4=on|off][,ipv6=on|off]``
>> @@ -4960,6 +4967,16 @@ SRST
>> Accept incoming migration as an output from specified external
>> command.
>>
>> +``-incoming @MigrationChannel``
>> + Accept incoming migration on the channel. See the QAPI documentation
>> + for the syntax of the @MigrationChannel data element. For example:
>> + ::
>
> I get what you're trying to express, but there's no precedence for
> referring to QAPI types like @TypeName in option documentation. But
> let's ignore this until after we nailed down the actual interface, on
> which I have questions below.
Ack.
>> +
>> + -incoming '{"channel-type": "main",
>> + "addr": { "transport": "socket",
>> + "type": "unix",
>> + "path": "my.sock" }}'
>> +
>> ``-incoming defer``
>> Wait for the URI to be specified via migrate\_incoming. The monitor
>> can be used to change settings (such as migration parameters) prior
>> diff --git a/system/vl.c b/system/vl.c
>> index 4151a79..2c24c60 100644
>> --- a/system/vl.c
>> +++ b/system/vl.c
>> @@ -123,6 +123,7 @@
>> #include "qapi/qapi-visit-block-core.h"
>> #include "qapi/qapi-visit-compat.h"
>> #include "qapi/qapi-visit-machine.h"
>> +#include "qapi/qapi-visit-migration.h"
>> #include "qapi/qapi-visit-ui.h"
>> #include "qapi/qapi-commands-block-core.h"
>> #include "qapi/qapi-commands-migration.h"
>> @@ -159,6 +160,7 @@ typedef struct DeviceOption {
>> static const char *cpu_option;
>> static const char *mem_path;
>> static const char *incoming;
>> +static MigrationChannelList *incoming_channels;
>> static const char *loadvm;
>> static const char *accelerators;
>> static bool have_custom_ram_size;
>> @@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
>> QTAILQ_INSERT_TAIL(&object_opts, opt, next);
>> }
>>
>> +static void incoming_option_parse(const char *str)
>> +{
>> + MigrationChannel *channel;
>> +
>> + if (str[0] == '{') {
>> + QObject *obj = qobject_from_json(str, &error_fatal);
>> + Visitor *v = qobject_input_visitor_new(obj);
>> +
>> + qobject_unref(obj);
>> + visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
>> + visit_free(v);
>> + } else if (!strcmp(str, "defer")) {
>> + channel = NULL;
>> + } else {
>> + migrate_uri_parse(str, &channel, &error_fatal);
>> + }
>> +
>> + /* New incoming spec replaces the previous */
>> +
>> + if (incoming_channels) {
>> + qapi_free_MigrationChannelList(incoming_channels);
>> + }
>> + if (channel) {
>> + incoming_channels = g_new0(MigrationChannelList, 1);
>> + incoming_channels->value = channel;
>> + }
>> + incoming = str;
>> +}
>
> @incoming is set to @optarg.
>
> @incoming_channels is set to a MigrationChannelList of exactly one
> element, parsed from @incoming. Except when @incoming is "defer", then
> @incoming_channels is set to null.
>
> @incoming is only ever used as a flag. Turn it into a bool?
The remembered incoming specifier is also used in an error message in
qmp_x_exit_preconfig:
error_reportf_err(local_err, "-incoming %s: ", incoming);
> Oh, wait... see my comment on the next hunk.
>
> Option -incoming resembles QMP command migrate-incoming. Differences:
>
> * migrate-incoming keeps legacy URI and modern argument separate: there
> are two named arguments, and exactly one of them must be passed.
> -incoming overloads them: if @optarg starts with '{', it's modern,
> else legacy URI.
>
> Because of that, -incoming *only* supports JSON syntax for modern, not
> dotted keys. Other JSON-capable arguments support both.
Not sure I follow.
Could you give me a dotted key example for a JSON-capable argument?
Do we care about dotted key for incoming, given the user can specify
a simple legacy URI?
> How can a management application detect that -incoming supports
> modern?
How does mgmt detect when other arguments support JSON?
The presence of cpr-transfer mode implies -incoming JSON support, though
that is indirect.
We could add a feature to the migrate-incoming command, like json-cli
for device_add. Seems like overkill though. 'feature' is little used,
except for unstable and deprecated.
> Sure overloading -incoming this way is a good idea?
>
> * migrate-incoming takes a list of channels, currently restricted to a
> single channel. -incoming takes a channel. If we lift the
> restriction, -incoming syntax will become even messier: we'll have to
> additionally overload list of channel.
>
> Should -incoming take a list from the start, like migrate-incoming
> does?
That was my first try. However, to support the equivalent of '-incoming defer',
we need to add a 'defer' key to the channel, and when defer is true, the other
keys that are currently mandatory must be omitted. The tweaks to the implementation
and specification seemed not worth it.
If we want -incoming to also support a channel list in the future, we can simply
check for an initial '[' token.
>> +
>> static void object_option_parse(const char *str)
>> {
>> QemuOpts *opts;
>> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
>> if (incoming) {
>> Error *local_err = NULL;
>> if (strcmp(incoming, "defer") != 0) {
>> - qmp_migrate_incoming(incoming, false, NULL, true, true,
>> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
>> &local_err);
>
> You move the parsing of legacy URI from within qmp_migrate_incoming()
> into incoming_option_parse().
>
> The alternative is not to parse it in incoming_option_parse(), but pass
> it to qmp_migrate_incoming() like this:
>
> qmp_migrate_incoming(incoming, !incoming, incoming_channels,
> true, true, &local_err);
Sure, I can tweak that, but I need to define an additional incoming_uri variable:
qmp_migrate_incoming(incoming_uri, !!incoming_channels, incoming_channels, ...
Only one of incoming_uri and incoming_channels can be non-NULL (checked in
qemu_start_incoming_migration).
Would you prefer I continue down this path, or revert to the previous -cpr-uri
option? I made this change to make the incoming interface look more like the
V4 outgoing interface, in which the user adds a cpr channel to the migrate command
channels.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-05 20:45 ` Steven Sistare
@ 2024-12-09 12:12 ` Markus Armbruster
2024-12-09 16:36 ` Peter Xu
` (2 more replies)
0 siblings, 3 replies; 78+ messages in thread
From: Markus Armbruster @ 2024-12-09 12:12 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steven Sistare <steven.sistare@oracle.com> writes:
> On 12/5/2024 10:23 AM, Markus Armbruster wrote:
>> Steve Sistare <steven.sistare@oracle.com> writes:
>>
>>> Extend the -incoming option to allow an @MigrationChannel to be specified.
>>> This allows channels other than 'main' to be described on the command
>>> line, which will be needed for CPR.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> [...]
>>
>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>> index 02b9118..fab50ce 100644
>>> --- a/qemu-options.hx
>>> +++ b/qemu-options.hx
>>> @@ -4937,10 +4937,17 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \
>>> "-incoming exec:cmdline\n" \
>>> " accept incoming migration on given file descriptor\n" \
>>> " or from given external command\n" \
>>> + "-incoming @MigrationChannel\n" \
>>> + " accept incoming migration on the channel\n" \
>>> "-incoming defer\n" \
>>> " wait for the URI to be specified via migrate_incoming\n",
>>> QEMU_ARCH_ALL)
>>> SRST
>>> +The -incoming option specifies the migration channel for an incoming
>>> +migration. It may be used multiple times to specify multiple
>>> +migration channel types.
>>
>> Really? If I understand the code below correctly, the last -incoming
>> wins, and any previous ones are silently ignored.
>
> See patch "cpr-channel", where the cpr channel is saved separately.
> Last wins, per channel type.
> I did this to preserve the current behavior of -incoming in which last wins.
Documentation needs to be clarified then.
> qemu_start_incoming_migration will need modification if more types are added.
>
>>> The channel type is specified in @MigrationChannel,
>>> +and is 'main' for all other forms of -incoming.
>>> +
>>> ``-incoming tcp:[host]:port[,to=maxport][,ipv4=on|off][,ipv6=on|off]``
>>> \
>>> ``-incoming rdma:host:port[,ipv4=on|off][,ipv6=on|off]``
>>> @@ -4960,6 +4967,16 @@ SRST
>>> Accept incoming migration as an output from specified external
>>> command.
>>> +``-incoming @MigrationChannel``
>>> + Accept incoming migration on the channel. See the QAPI documentation
>>> + for the syntax of the @MigrationChannel data element. For example:
>>> + ::
>>
>> I get what you're trying to express, but there's no precedence for
>> referring to QAPI types like @TypeName in option documentation. But
>> let's ignore this until after we nailed down the actual interface, on
>> which I have questions below.
>
> Ack.
>
>>> +
>>> + -incoming '{"channel-type": "main",
>>> + "addr": { "transport": "socket",
>>> + "type": "unix",
>>> + "path": "my.sock" }}'
>>> +
>>> ``-incoming defer``
>>> Wait for the URI to be specified via migrate\_incoming. The monitor
>>> can be used to change settings (such as migration parameters) prior
>>> diff --git a/system/vl.c b/system/vl.c
>>> index 4151a79..2c24c60 100644
>>> --- a/system/vl.c
>>> +++ b/system/vl.c
>>> @@ -123,6 +123,7 @@
>>> #include "qapi/qapi-visit-block-core.h"
>>> #include "qapi/qapi-visit-compat.h"
>>> #include "qapi/qapi-visit-machine.h"
>>> +#include "qapi/qapi-visit-migration.h"
>>> #include "qapi/qapi-visit-ui.h"
>>> #include "qapi/qapi-commands-block-core.h"
>>> #include "qapi/qapi-commands-migration.h"
>>> @@ -159,6 +160,7 @@ typedef struct DeviceOption {
>>> static const char *cpu_option;
>>> static const char *mem_path;
>>> static const char *incoming;
>>> +static MigrationChannelList *incoming_channels;
>>> static const char *loadvm;
>>> static const char *accelerators;
>>> static bool have_custom_ram_size;
>>> @@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
>>> QTAILQ_INSERT_TAIL(&object_opts, opt, next);
>>> }
>>> +static void incoming_option_parse(const char *str)
>>> +{
>>> + MigrationChannel *channel;
>>> +
>>> + if (str[0] == '{') {
>>> + QObject *obj = qobject_from_json(str, &error_fatal);
>>> + Visitor *v = qobject_input_visitor_new(obj);
>>> +
>>> + qobject_unref(obj);
>>> + visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
>>> + visit_free(v);
>>> + } else if (!strcmp(str, "defer")) {
>>> + channel = NULL;
>>> + } else {
>>> + migrate_uri_parse(str, &channel, &error_fatal);
>>> + }
>>> +
>>> + /* New incoming spec replaces the previous */
>>> +
>>> + if (incoming_channels) {
>>> + qapi_free_MigrationChannelList(incoming_channels);
>>> + }
>>> + if (channel) {
>>> + incoming_channels = g_new0(MigrationChannelList, 1);
>>> + incoming_channels->value = channel;
>>> + }
>>> + incoming = str;
>>> +}
>>
>> @incoming is set to @optarg.
>>
>> @incoming_channels is set to a MigrationChannelList of exactly one
>> element, parsed from @incoming. Except when @incoming is "defer", then
>> @incoming_channels is set to null.
>>
>> @incoming is only ever used as a flag. Turn it into a bool?
>
> The remembered incoming specifier is also used in an error message in
> qmp_x_exit_preconfig:
> error_reportf_err(local_err, "-incoming %s: ", incoming);
>
>> Oh, wait... see my comment on the next hunk.
>>
>> Option -incoming resembles QMP command migrate-incoming. Differences:
>>
>> * migrate-incoming keeps legacy URI and modern argument separate: there
>> are two named arguments, and exactly one of them must be passed.
>> -incoming overloads them: if @optarg starts with '{', it's modern,
>> else legacy URI.
>>
>> Because of that, -incoming *only* supports JSON syntax for modern, not
>> dotted keys. Other JSON-capable arguments support both.
>
> Not sure I follow.
> Could you give me a dotted key example for a JSON-capable argument?
> Do we care about dotted key for incoming, given the user can specify
> a simple legacy URI?
A quick grep for the usual parser qobject_input_visitor_new() finds
-audiodev, -blockdev, -compat, -display, and -netdev. Beware, the
latter two come with backward compatibility gunk. There's also -device
and -object, also with backward compatibility gunk.
Simple example:
JSON -compat '{"deprecated-input": "reject", "deprecated-output": "hide"}'
dotted keys -compat deprecated-input=reject,deprecated-output=hide
Slightly more interesting:
JSON -audiodev '{"id": "audiodev0", "driver": "wav", "in": {"voices": 4}}'
dotted keys -audiodev id=audiodev0,driver=wav,in.voices=4
>> How can a management application detect that -incoming supports
>> modern?
>
> How does mgmt detect when other arguments support JSON?
Easy when an option supports it from the start: -audiodev, -blockdev,
-compat. Awkward when we extend an existing option to support it:
-display, -netdev, -device, -object.
As far as I can tell at a glance, libvirt
* Remains unaware of -display JSON arguments
* Assumes -netdev accepts JSON when QMP netdev-add supports backend type
"dgram", see commit 697e26fac66 (qemu: capabilities: Detect support
for JSON args for -netdev) v8.10.0
* Assumes -device accepts JSON when QMP device_add has feature
json-cli-hotplug, see commit 1a691fe1c84 (qemu: capabilities:
Re-enable JSON syntax for -device) v8.1.0
* Assumes -object accepts JSON when object-add supports object type
"secret", see commit f763b6e4390 (qemu: capabilities: Enable detection
of QEMU_CAPS_OBJECT_QAPIFIED) v7.2.0
In theory, such indirect probing can fall apart when somebody backports
JSON syntax *without* the thing libvirt probes for. They then get to
adjust libvirt's detection logic, too. Hasn't been an issue in practice
as far as I know.
> The presence of cpr-transfer mode implies -incoming JSON support, though
> that is indirect.
Might be good enough.
> We could add a feature to the migrate-incoming command, like json-cli
> for device_add. Seems like overkill though. 'feature' is little used,
> except for unstable and deprecated.
'feature' is best used sparingly. But when it's needed, using it is
*fine*.
>> Sure overloading -incoming this way is a good idea?
>>
>> * migrate-incoming takes a list of channels, currently restricted to a
>> single channel. -incoming takes a channel. If we lift the
>> restriction, -incoming syntax will become even messier: we'll have to
>> additionally overload list of channel.
>>
>> Should -incoming take a list from the start, like migrate-incoming
>> does?
>
> That was my first try. However, to support the equivalent of '-incoming deferred',
> we need to add an 'defer' key to the channel, and when defer is true, the other
> keys that are currently mandatory must be omitted. The tweaks to the implementation
> and specification seemed not worth worth it.
>
> If we want -incoming to also support a channel list in the future, we can simply
> check for an initial '[' token.
Yes, but it'll then have to support single channels both as list of one
channel object, and channel object, unlike migrate-incoming.
Syntactical differences between CLI and QMP for things that are
semantically identical add unnecessary complexity, don't you think?
>>> +
>>> static void object_option_parse(const char *str)
>>> {
>>> QemuOpts *opts;
>>> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
>>> if (incoming) {
>>> Error *local_err = NULL;
>>> if (strcmp(incoming, "defer") != 0) {
>>> - qmp_migrate_incoming(incoming, false, NULL, true, true,
>>> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
>>> &local_err);
>>
>> You move the parsing of legacy URI from within qmp_migrate_incoming()
>> into incoming_option_parse().
>>
>> The alternative is not to parse it in incoming_option_parse(), but pass
>> it to qmp_migrate_incoming() like this:
>>
>> qmp_migrate_incoming(incoming, !incoming, incoming_channels,
>> true, true, &local_err);
>
> Sure, I can tweak that, but I need to define an additional incoming_uri variable:
> qmp_migrate_incoming(incoming_uri, !!incoming_channels, incoming_channels, ...
>
> Only one of incoming_uri and incoming_channels can be non-NULL (checked in
> qemu_start_incoming_migration).
>
> Would you prefer I continue down this path, or revert to the previous -cpr-uri
> option? I made this change to make the incoming interface look more like the
> V4 outgoing interface, in which the user adds a cpr channel to the migrate command
> channels.
I'm not sure. Peter, what do you think?
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-09 12:12 ` Markus Armbruster
@ 2024-12-09 16:36 ` Peter Xu
2024-12-11 9:18 ` Markus Armbruster
2024-12-11 18:58 ` Steven Sistare
2 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-09 16:36 UTC (permalink / raw)
To: Markus Armbruster
Cc: Steven Sistare, qemu-devel, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On Mon, Dec 09, 2024 at 01:12:25PM +0100, Markus Armbruster wrote:
> >>> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
> >>> if (incoming) {
> >>> Error *local_err = NULL;
> >>> if (strcmp(incoming, "defer") != 0) {
> >>> - qmp_migrate_incoming(incoming, false, NULL, true, true,
> >>> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
> >>> &local_err);
> >>
> >> You move the parsing of legacy URI from within qmp_migrate_incoming()
> >> into incoming_option_parse().
> >>
> >> The alternative is not to parse it in incoming_option_parse(), but pass
> >> it to qmp_migrate_incoming() like this:
> >>
> >> qmp_migrate_incoming(incoming, !incoming, incoming_channels,
> >> true, true, &local_err);
> >
> > Sure, I can tweak that, but I need to define an additional incoming_uri variable:
> > qmp_migrate_incoming(incoming_uri, !!incoming_channels, incoming_channels, ...
> >
> > Only one of incoming_uri and incoming_channels can be non-NULL (checked in
> > qemu_start_incoming_migration).
> >
> > Would you prefer I continue down this path, or revert to the previous -cpr-uri
> > option? I made this change to make the incoming interface look more like the
> > V4 outgoing interface, in which the user adds a cpr channel to the migrate command
> > channels.
>
> I'm not sure. Peter, what do you think?
For this specific question, I prefer reusing -incoming rather than going
back to -cpr-uri.
We already discussed this in a follow-up discussion on the previous spin;
using -incoming for cpr channels seems to have always been preferred.
https://lore.kernel.org/qemu-devel/Zz4NqcTDK73MKOaa@redhat.com/
At that time, the concern from Dan was not reusing the JSON format or
design the CLI's own format. That was always based on reusing -incoming.
In this patch it's already reusing the JSON for the CPR port, which looks
all good from that POV. OTOH, I don't yet have any preference on the impl
of how QEMU should do the internal parsing of such JSON string / legacy
URIs.
Thanks,
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-09 12:12 ` Markus Armbruster
2024-12-09 16:36 ` Peter Xu
@ 2024-12-11 9:18 ` Markus Armbruster
2024-12-11 18:58 ` Steven Sistare
2 siblings, 0 replies; 78+ messages in thread
From: Markus Armbruster @ 2024-12-11 9:18 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Markus Armbruster <armbru@redhat.com> writes:
> Steven Sistare <steven.sistare@oracle.com> writes:
>
>> On 12/5/2024 10:23 AM, Markus Armbruster wrote:
>>> Steve Sistare <steven.sistare@oracle.com> writes:
>>>
>>>> Extend the -incoming option to allow an @MigrationChannel to be specified.
>>>> This allows channels other than 'main' to be described on the command
>>>> line, which will be needed for CPR.
>>>>
>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
>>> Option -incoming resembles QMP command migrate-incoming. Differences:
>>>
>>> * migrate-incoming keeps legacy URI and modern argument separate: there
>>> are two named arguments, and exactly one of them must be passed.
>>> -incoming overloads them: if @optarg starts with '{', it's modern,
>>> else legacy URI.
>>>
>>> Because of that, -incoming *only* supports JSON syntax for modern, not
>>> dotted keys. Other JSON-capable arguments support both.
>>
>> Not sure I follow.
>> Could you give me a dotted key example for a JSON-capable argument?
>> Do we care about dotted key for incoming, given the user can specify
>> a simple legacy URI?
>
> A quick grep for the usual parser qobject_input_visitor_new() finds
Correction: qobject_input_visitor_new_str().
> -audiodev, -blockdev, -compat, -display, and -netdev. Beware, the
> latter two come with backward compatibility gunk. There's also -device
> and -object, also with backward compatibility gunk.
[...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-09 12:12 ` Markus Armbruster
2024-12-09 16:36 ` Peter Xu
2024-12-11 9:18 ` Markus Armbruster
@ 2024-12-11 18:58 ` Steven Sistare
2 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-11 18:58 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/9/2024 7:12 AM, Markus Armbruster wrote:
> Steven Sistare <steven.sistare@oracle.com> writes:
>
>> On 12/5/2024 10:23 AM, Markus Armbruster wrote:
>>> Steve Sistare <steven.sistare@oracle.com> writes:
>>>
>>>> Extend the -incoming option to allow an @MigrationChannel to be specified.
>>>> This allows channels other than 'main' to be described on the command
>>>> line, which will be needed for CPR.
>>>>
>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> [...]
>>>
>>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>>> index 02b9118..fab50ce 100644
>>>> --- a/qemu-options.hx
>>>> +++ b/qemu-options.hx
>>>> @@ -4937,10 +4937,17 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \
>>>> "-incoming exec:cmdline\n" \
>>>> " accept incoming migration on given file descriptor\n" \
>>>> " or from given external command\n" \
>>>> + "-incoming @MigrationChannel\n" \
>>>> + " accept incoming migration on the channel\n" \
>>>> "-incoming defer\n" \
>>>> " wait for the URI to be specified via migrate_incoming\n",
>>>> QEMU_ARCH_ALL)
>>>> SRST
>>>> +The -incoming option specifies the migration channel for an incoming
>>>> +migration. It may be used multiple times to specify multiple
>>>> +migration channel types.
>>>
>>> Really? If I understand the code below correctly, the last -incoming
>>> wins, and any previous ones are silently ignored.
>>
>> See patch "cpr-channel", where the cpr channel is saved separately.
>> Last wins, per channel type.
>> I did this to preserve the current behavior of -incoming in which last wins.
>
> Documentation needs to be clarified then.
Maybe. Depends whether we want/need to take a stand on whether the current
behavior is an accident of the implementation, or part of the specification.
The current behavior is not documented.
>> qemu_start_incoming_migration will need modification if more types are added.
>>
>>>> The channel type is specified in @MigrationChannel,
>>>> +and is 'main' for all other forms of -incoming.
>>>> +
>>>> ``-incoming tcp:[host]:port[,to=maxport][,ipv4=on|off][,ipv6=on|off]``
>>>> \
>>>> ``-incoming rdma:host:port[,ipv4=on|off][,ipv6=on|off]``
>>>> @@ -4960,6 +4967,16 @@ SRST
>>>> Accept incoming migration as an output from specified external
>>>> command.
>>>> +``-incoming @MigrationChannel``
>>>> + Accept incoming migration on the channel. See the QAPI documentation
>>>> + for the syntax of the @MigrationChannel data element. For example:
>>>> + ::
>>>
>>> I get what you're trying to express, but there's no precedent for
>>> referring to QAPI types like @TypeName in option documentation. But
>>> let's ignore this until after we nailed down the actual interface, on
>>> which I have questions below.
>>
>> Ack.
>>
>>>> +
>>>> + -incoming '{"channel-type": "main",
>>>> + "addr": { "transport": "socket",
>>>> + "type": "unix",
>>>> + "path": "my.sock" }}'
>>>> +
>>>> ``-incoming defer``
>>>> Wait for the URI to be specified via migrate\_incoming. The monitor
>>>> can be used to change settings (such as migration parameters) prior
>>>> diff --git a/system/vl.c b/system/vl.c
>>>> index 4151a79..2c24c60 100644
>>>> --- a/system/vl.c
>>>> +++ b/system/vl.c
>>>> @@ -123,6 +123,7 @@
>>>> #include "qapi/qapi-visit-block-core.h"
>>>> #include "qapi/qapi-visit-compat.h"
>>>> #include "qapi/qapi-visit-machine.h"
>>>> +#include "qapi/qapi-visit-migration.h"
>>>> #include "qapi/qapi-visit-ui.h"
>>>> #include "qapi/qapi-commands-block-core.h"
>>>> #include "qapi/qapi-commands-migration.h"
>>>> @@ -159,6 +160,7 @@ typedef struct DeviceOption {
>>>> static const char *cpu_option;
>>>> static const char *mem_path;
>>>> static const char *incoming;
>>>> +static MigrationChannelList *incoming_channels;
>>>> static const char *loadvm;
>>>> static const char *accelerators;
>>>> static bool have_custom_ram_size;
>>>> @@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
>>>> QTAILQ_INSERT_TAIL(&object_opts, opt, next);
>>>> }
>>>> +static void incoming_option_parse(const char *str)
>>>> +{
>>>> + MigrationChannel *channel;
>>>> +
>>>> + if (str[0] == '{') {
>>>> + QObject *obj = qobject_from_json(str, &error_fatal);
>>>> + Visitor *v = qobject_input_visitor_new(obj);
>>>> +
>>>> + qobject_unref(obj);
>>>> + visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
>>>> + visit_free(v);
>>>> + } else if (!strcmp(str, "defer")) {
>>>> + channel = NULL;
>>>> + } else {
>>>> + migrate_uri_parse(str, &channel, &error_fatal);
>>>> + }
>>>> +
>>>> + /* New incoming spec replaces the previous */
>>>> +
>>>> + if (incoming_channels) {
>>>> + qapi_free_MigrationChannelList(incoming_channels);
>>>> + }
>>>> + if (channel) {
>>>> + incoming_channels = g_new0(MigrationChannelList, 1);
>>>> + incoming_channels->value = channel;
>>>> + }
>>>> + incoming = str;
>>>> +}
>>>
>>> @incoming is set to @optarg.
>>>
>>> @incoming_channels is set to a MigrationChannelList of exactly one
>>> element, parsed from @incoming. Except when @incoming is "defer", then
>>> @incoming_channels is set to null.
>>>
>>> @incoming is only ever used as a flag. Turn it into a bool?
>>
>> The remembered incoming specifier is also used in an error message in
>> qmp_x_exit_preconfig:
>> error_reportf_err(local_err, "-incoming %s: ", incoming);
>>
>>> Oh, wait... see my comment on the next hunk.
>>>
>>> Option -incoming resembles QMP command migrate-incoming. Differences:
>>>
>>> * migrate-incoming keeps legacy URI and modern argument separate: there
>>> are two named arguments, and exactly one of them must be passed.
>>> -incoming overloads them: if @optarg starts with '{', it's modern,
>>> else legacy URI.
>>>
>>> Because of that, -incoming *only* supports JSON syntax for modern, not
>>> dotted keys. Other JSON-capable arguments support both.
>>
>> Not sure I follow.
>> Could you give me a dotted key example for a JSON-capable argument?
>> Do we care about dotted key for incoming, given the user can specify
>> a simple legacy URI?
>
> A quick grep for the usual parser qobject_input_visitor_new() finds
> -audiodev, -blockdev, -compat, -display, and -netdev. Beware, the
> latter two come with backward compatibility gunk. There's also -device
> and -object, also with backward compatibility gunk.
>
> Simple example:
>
> JSON -compat '{"deprecated-input": "reject", "deprecated-output": "hide"}'
> dotted keys -compat deprecated-input=reject,deprecated-output=hide
>
> Slightly more interesting:
>
> JSON -audiodev '{"id": "audiodev0", "driver": "wav", "in": {"voices": 4}}'
> dotted keys -audiodev id=audiodev0,driver=wav,in.voices=4
Thank you (and for the correction to qobject_input_visitor_new_str). I did not
grok that this visitor handles both json and dotted keys, as so far I had only
seen examples with keys but not dotted keys. I can easily support those as well
as the legacy uri, by returning a new parameter from migrate_uri_parse indicating
that no uri is recognized (as opposed to recognized, but with some other error):
if (!strcmp(str, "defer")) {
channel = NULL;
} else if (!migrate_uri_parse(str, &channel, &no_uri, &err)) {
if (no_uri) {
qobject_input_visitor_new_str()
visit_type_MigrationChannel()
} else {
report error
}
}
I implemented this and tested for all formats.
>>> How can a management application detect that -incoming supports
>>> modern?
>>
>> How does mgmt detect when other arguments support JSON?
>
> Easy when an option supports it from the start: -audiodev, -blockdev,
> -compat. Awkward when we extend an existing option to support it:
> -display, -netdev, -device, -object.
>
> As far as I can tell at a glance, libvirt
>
> * Remains unaware of -display JSON arguments
>
> * Assumes -netdev accepts JSON when QMP netdev-add supports backend type
> "dgram", see commit 697e26fac66 (qemu: capabilities: Detect support
> for JSON args for -netdev) v8.10.0
>
> * Assumes -device accepts JSON when QMP device_add has feature
> json-cli-hotplug, see commit 1a691fe1c84 (qemu: capabilities:
> Re-enable JSON syntax for -device) v8.1.0
>
> * Assumes -object accepts JSON when object-add supports object type
> "secret", see commit f763b6e4390 (qemu: capabilities: Enable detection
> of QEMU_CAPS_OBJECT_QAPIFIED) v7.2.0
>
> In theory, such indirect probing can fall apart when somebody backports
> JSON syntax *without* the thing libvirt probes for. They then get to
> adjust libvirt's detection logic, too. Hasn't been an issue in practice
> as far as I know.
>
>> The presence of cpr-transfer mode implies -incoming JSON support, though
>> that is indirect.
>
> Might be good enough.
I'll keep it simple and go with that unless someone objects.
>> We could add a feature to the migrate-incoming command, like json-cli
>> for device_add. Seems like overkill though. 'feature' is little used,
>> except for unstable and deprecated.
>
> 'feature' is best used sparingly. But when it's needed, using it is
> *fine*.
>
>>> Sure overloading -incoming this way is a good idea?
>>>
>>> * migrate-incoming takes a list of channels, currently restricted to a
>>> single channel. -incoming takes a channel. If we lift the
>>> restriction, -incoming syntax will become even messier: we'll have to
>>> additionally overload list of channel.
>>>
>>> Should -incoming take a list from the start, like migrate-incoming
>>> does?
>>
>> That was my first try. However, to support the equivalent of '-incoming defer',
>> we need to add a 'defer' key to the channel, and when defer is true, the other
>> keys that are currently mandatory must be omitted. The tweaks to the implementation
>> and specification seemed not worth it.
>>
>> If we want -incoming to also support a channel list in the future, we can simply
>> check for an initial '[' token.
>
> Yes, but it'll then have to support single channels both as list of one
> channel object, and channel object, unlike migrate-incoming.
>
> Syntactical differences between CLI and QMP for things that are
> semantically identical add unnecessary complexity, don't you think?
Agreed on both. I am just pointing out that if we implement '-incoming uri|channel'
now, but in the future want to add '-incoming channel-list', then we can parse it
unambiguously and be backwards compatible. I don't foresee ever needing the latter,
as multiple '-incoming uri|channel' arguments are logically equivalent to a list.
To reiterate, I prefer '-incoming uri|channel' because it avoids the need
to add a 'defer' property to the channel specification and implementation.
>>>> +
>>>> static void object_option_parse(const char *str)
>>>> {
>>>> QemuOpts *opts;
>>>> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
>>>> if (incoming) {
>>>> Error *local_err = NULL;
>>>> if (strcmp(incoming, "defer") != 0) {
>>>> - qmp_migrate_incoming(incoming, false, NULL, true, true,
>>>> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
>>>> &local_err);
>>>
>>> You move the parsing of legacy URI from within qmp_migrate_incoming()
>>> into incoming_option_parse().
>>>
>>> The alternative is not to parse it in incoming_option_parse(), but pass
>>> it to qmp_migrate_incoming() like this:
>>>
>>> qmp_migrate_incoming(incoming, !incoming, incoming_channels,
>>> true, true, &local_err);
>>
>> Sure, I can tweak that, but I need to define an additional incoming_uri variable:
>> qmp_migrate_incoming(incoming_uri, !!incoming_channels, incoming_channels, ...
>>
>> Only one of incoming_uri and incoming_channels can be non-NULL (checked in
>> qemu_start_incoming_migration).
>>
>> Would you prefer I continue down this path, or revert to the previous -cpr-uri
>> option? I made this change to make the incoming interface look more like the
>> V4 outgoing interface, in which the user adds a cpr channel to the migrate command
>> channels.
>
> I'm not sure. Peter, what do you think?
Peter likes -incoming, so I will continue with it.
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 09/19] migration: incoming channel
2024-12-05 15:23 ` Markus Armbruster
2024-12-05 20:45 ` Steven Sistare
@ 2024-12-10 12:46 ` Markus Armbruster
1 sibling, 0 replies; 78+ messages in thread
From: Markus Armbruster @ 2024-12-10 12:46 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Markus Armbruster <armbru@redhat.com> writes:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Extend the -incoming option to allow an @MigrationChannel to be specified.
>> This allows channels other than 'main' to be described on the command
>> line, which will be needed for CPR.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
>> diff --git a/system/vl.c b/system/vl.c
>> index 4151a79..2c24c60 100644
>> --- a/system/vl.c
>> +++ b/system/vl.c
>> @@ -123,6 +123,7 @@
>> #include "qapi/qapi-visit-block-core.h"
>> #include "qapi/qapi-visit-compat.h"
>> #include "qapi/qapi-visit-machine.h"
>> +#include "qapi/qapi-visit-migration.h"
>> #include "qapi/qapi-visit-ui.h"
>> #include "qapi/qapi-commands-block-core.h"
>> #include "qapi/qapi-commands-migration.h"
>> @@ -159,6 +160,7 @@ typedef struct DeviceOption {
>> static const char *cpu_option;
>> static const char *mem_path;
>> static const char *incoming;
>> +static MigrationChannelList *incoming_channels;
>> static const char *loadvm;
>> static const char *accelerators;
>> static bool have_custom_ram_size;
>> @@ -1821,6 +1823,35 @@ static void object_option_add_visitor(Visitor *v)
>> QTAILQ_INSERT_TAIL(&object_opts, opt, next);
>> }
>>
>> +static void incoming_option_parse(const char *str)
>> +{
>> + MigrationChannel *channel;
>> +
>> + if (str[0] == '{') {
>> + QObject *obj = qobject_from_json(str, &error_fatal);
>> + Visitor *v = qobject_input_visitor_new(obj);
>> +
>> + qobject_unref(obj);
>> + visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
>> + visit_free(v);
>> + } else if (!strcmp(str, "defer")) {
>> + channel = NULL;
>> + } else {
>> + migrate_uri_parse(str, &channel, &error_fatal);
>> + }
>> +
>> + /* New incoming spec replaces the previous */
>> +
>> + if (incoming_channels) {
>> + qapi_free_MigrationChannelList(incoming_channels);
>> + }
>> + if (channel) {
>> + incoming_channels = g_new0(MigrationChannelList, 1);
>> + incoming_channels->value = channel;
>> + }
>> + incoming = str;
>> +}
>
> @incoming is set to @optarg.
>
> @incoming_channels is set to a MigrationChannelList of exactly one
> element, parsed from @incoming. Except when @incoming is "defer", then
> @incoming_channels is set to null.
>
> @incoming is only ever used as a flag. Turn it into a bool?
>
> Oh, wait... see my comment on the next hunk.
>
> Option -incoming resembles QMP command migrate-incoming. Differences:
>
> * migrate-incoming keeps legacy URI and modern argument separate: there
> are two named arguments, and exactly one of them must be passed.
> -incoming overloads them: if @optarg starts with '{', it's modern,
> else legacy URI.
>
> Because of that, -incoming *only* supports JSON syntax for modern, not
> dotted keys. Other JSON-capable arguments support both.
Here's a way to avoid restricting modern to JSON.
Legacy URI is either "defer" or starts with "KEYWORD:", where KEYWORD is
one of a few well-known words.
As long as we don't support an implied key, a non-empty dotted keys
argument starts with "KEY=", where KEY cannot contain ':'.
This lets us distinguish legacy URI from dotted keys. Say, if the
argument is "defer" or starts with letters followed by ':', assume URI.
> How can a management application detect that -incoming supports
> modern?
>
> Sure overloading -incoming this way is a good idea?
It'll be a pain to document.
> * migrate-incoming takes a list of channels, currently restricted to a
> single channel. -incoming takes a channel. If we lift the
> restriction, -incoming syntax will become even messier: we'll have to
> additionally overload list of channel.
>
> Should -incoming take a list from the start, like migrate-incoming
> does?
>
>> +
>> static void object_option_parse(const char *str)
>> {
>> QemuOpts *opts;
>> @@ -2730,7 +2761,7 @@ void qmp_x_exit_preconfig(Error **errp)
>> if (incoming) {
>> Error *local_err = NULL;
>> if (strcmp(incoming, "defer") != 0) {
>> - qmp_migrate_incoming(incoming, false, NULL, true, true,
>> + qmp_migrate_incoming(NULL, true, incoming_channels, true, true,
>> &local_err);
>
> You move the parsing of legacy URI from within qmp_migrate_incoming()
> into incoming_option_parse().
>
> The alternative is not to parse it in incoming_option_parse(), but pass
> it to qmp_migrate_incoming() like this:
>
> qmp_migrate_incoming(incoming, !incoming, incoming_channels,
> true, true, &local_err);
>
>> if (local_err) {
>> error_reportf_err(local_err, "-incoming %s: ", incoming);
>> @@ -3477,7 +3508,7 @@ void qemu_init(int argc, char **argv)
>> if (!incoming) {
>> runstate_set(RUN_STATE_INMIGRATE);
>> }
>> - incoming = optarg;
>> + incoming_option_parse(optarg);
>> break;
>> case QEMU_OPTION_only_migratable:
>> only_migratable = 1;
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 10/19] migration: cpr channel
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (8 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 09/19] migration: incoming channel Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-05 15:37 ` Markus Armbruster
2024-12-02 13:20 ` [PATCH V4 11/19] migration: SCM_RIGHTS for QEMUFile Steve Sistare
` (8 subsequent siblings)
18 siblings, 1 reply; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add the 'cpr' channel type, and stash the incoming cpr channel for use
in a subsequent patch.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
include/migration/cpr.h | 3 +++
migration/cpr.c | 15 +++++++++++++++
qapi/migration.json | 3 ++-
system/vl.c | 6 ++++++
4 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index 201d66d..e833fae 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -17,6 +17,9 @@ void cpr_save_fd(const char *name, int id, int fd);
void cpr_delete_fd(const char *name, int id);
int cpr_find_fd(const char *name, int id);
+void cpr_set_cpr_channel(MigrationChannel *channel);
+MigrationChannel *cpr_get_cpr_channel(void);
+
int cpr_state_save(MigrationChannel *channel, Error **errp);
int cpr_state_load(Error **errp);
void cpr_state_close(void);
diff --git a/migration/cpr.c b/migration/cpr.c
index 1e2878c..f4a795f 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -116,6 +116,21 @@ QIOChannel *cpr_state_ioc(void)
return qemu_file_get_ioc(cpr_state_file);
}
+static MigrationChannel *cpr_channel;
+
+void cpr_set_cpr_channel(MigrationChannel *channel)
+{
+ if (cpr_channel) {
+ qapi_free_MigrationChannel(cpr_channel);
+ }
+ cpr_channel = channel;
+}
+
+MigrationChannel *cpr_get_cpr_channel(void)
+{
+ return cpr_channel;
+}
+
int cpr_state_save(MigrationChannel *channel, Error **errp)
{
int ret;
diff --git a/qapi/migration.json b/qapi/migration.json
index a605dc2..a26960b 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1578,11 +1578,12 @@
# The migration channel-type request options.
#
# @main: Main outbound migration channel.
+# @cpr: cpr state channel.
#
# Since: 8.1
##
{ 'enum': 'MigrationChannelType',
- 'data': [ 'main' ] }
+ 'data': [ 'main', 'cpr' ] }
##
# @MigrationChannel:
diff --git a/system/vl.c b/system/vl.c
index 2c24c60..40e049e 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -77,6 +77,7 @@
#include "hw/block/block.h"
#include "hw/i386/x86.h"
#include "hw/i386/pc.h"
+#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration/snapshot.h"
#include "sysemu/tpm.h"
@@ -1834,6 +1835,11 @@ static void incoming_option_parse(const char *str)
qobject_unref(obj);
visit_type_MigrationChannel(v, "channel", &channel, &error_fatal);
visit_free(v);
+
+ if (channel->channel_type == MIGRATION_CHANNEL_TYPE_CPR) {
+ cpr_set_cpr_channel(channel);
+ return;
+ }
} else if (!strcmp(str, "defer")) {
channel = NULL;
} else {
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-02 13:20 ` [PATCH V4 10/19] migration: cpr channel Steve Sistare
@ 2024-12-05 15:37 ` Markus Armbruster
2024-12-05 20:46 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-05 15:37 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steve Sistare <steven.sistare@oracle.com> writes:
> Add the 'cpr' channel type, and stash the incoming cpr channel for use
> in a subsequent patch.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
[...]
> diff --git a/qapi/migration.json b/qapi/migration.json
> index a605dc2..a26960b 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -1578,11 +1578,12 @@
> # The migration channel-type request options.
> #
> # @main: Main outbound migration channel.
> +# @cpr: cpr state channel.
What does "cpr" stand for?
> #
> # Since: 8.1
> ##
> { 'enum': 'MigrationChannelType',
> - 'data': [ 'main' ] }
> + 'data': [ 'main', 'cpr' ] }
>
> ##
> # @MigrationChannel:
[...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-05 15:37 ` Markus Armbruster
@ 2024-12-05 20:46 ` Steven Sistare
2024-12-06 9:31 ` Markus Armbruster
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-05 20:46 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/5/2024 10:37 AM, Markus Armbruster wrote:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Add the 'cpr' channel type, and stash the incoming cpr channel for use
>> in a subsequent patch.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> [...]
>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index a605dc2..a26960b 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -1578,11 +1578,12 @@
>> # The migration channel-type request options.
>> #
>> # @main: Main outbound migration channel.
>> +# @cpr: cpr state channel.
>
> What does "cpr" stand for?
docs/devel/migration/CPR.rst: CheckPoint and Restart (CPR)
- Steve
>
>> #
>> # Since: 8.1
>> ##
>> { 'enum': 'MigrationChannelType',
>> - 'data': [ 'main' ] }
>> + 'data': [ 'main', 'cpr' ] }
>>
>> ##
>> # @MigrationChannel:
>
> [...]
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-05 20:46 ` Steven Sistare
@ 2024-12-06 9:31 ` Markus Armbruster
2024-12-18 19:53 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-06 9:31 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steven Sistare <steven.sistare@oracle.com> writes:
> On 12/5/2024 10:37 AM, Markus Armbruster wrote:
>> Steve Sistare <steven.sistare@oracle.com> writes:
>>
>>> Add the 'cpr' channel type, and stash the incoming cpr channel for use
>>> in a subsequent patch.
>>>
>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> [...]
>>
>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>> index a605dc2..a26960b 100644
>>> --- a/qapi/migration.json
>>> +++ b/qapi/migration.json
>>> @@ -1578,11 +1578,12 @@
>>> # The migration channel-type request options.
>>> #
>>> # @main: Main outbound migration channel.
>>> +# @cpr: cpr state channel.
>>>
>> What does "cpr" stand for?
>
> docs/devel/migration/CPR.rst: CheckPoint and Restart (CPR)
Suggest something like
# The migration channel-type request options.
#
# @main: Main outbound migration channel.
#
# @cpr: Checkpoint and restart state channel
A quick glance at docs/devel/migration/CPR.rst makes me wonder: is that
really *developer* documentation?
Should we have something meant for *users*, too? QAPI docs could then
link to it.
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-06 9:31 ` Markus Armbruster
@ 2024-12-18 19:53 ` Steven Sistare
2024-12-18 20:27 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 19:53 UTC (permalink / raw)
To: Peter Xu, Markus Armbruster
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange
On 12/6/2024 4:31 AM, Markus Armbruster wrote:
> Steven Sistare <steven.sistare@oracle.com> writes:
>
>> On 12/5/2024 10:37 AM, Markus Armbruster wrote:
>>> Steve Sistare <steven.sistare@oracle.com> writes:
>>>
>>>> Add the 'cpr' channel type, and stash the incoming cpr channel for use
>>>> in a subsequent patch.
>>>>
>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>> [...]
>>>
>>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>>> index a605dc2..a26960b 100644
>>>> --- a/qapi/migration.json
>>>> +++ b/qapi/migration.json
>>>> @@ -1578,11 +1578,12 @@
>>>> # The migration channel-type request options.
>>>> #
>>>> # @main: Main outbound migration channel.
>>>> +# @cpr: cpr state channel.
>>>>
>>> What does "cpr" stand for?
>>
>> docs/devel/migration/CPR.rst: CheckPoint and Restart (CPR)
>
> Suggest something like
>
> # The migration channel-type request options.
> #
> # @main: Main outbound migration channel.
> #
> # @cpr: Checkpoint and restart state channel
>
> A quick glance at docs/devel/migration/CPR.rst makes me wonder: is that
> really *developer* documentation?
>
> Should we have something meant for *users*, too? QAPI docs could then
> link to it.
I agree, CPR.rst is user documentation.
Peter, are you OK with me moving it to the "System Emulation" section of
the documentation?
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-18 19:53 ` Steven Sistare
@ 2024-12-18 20:27 ` Peter Xu
2024-12-18 20:31 ` Steven Sistare
0 siblings, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-18 20:27 UTC (permalink / raw)
To: Steven Sistare
Cc: Markus Armbruster, qemu-devel, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On Wed, Dec 18, 2024 at 02:53:16PM -0500, Steven Sistare wrote:
> On 12/6/2024 4:31 AM, Markus Armbruster wrote:
> > Steven Sistare <steven.sistare@oracle.com> writes:
> >
> > > On 12/5/2024 10:37 AM, Markus Armbruster wrote:
> > > > Steve Sistare <steven.sistare@oracle.com> writes:
> > > >
> > > > > Add the 'cpr' channel type, and stash the incoming cpr channel for use
> > > > > in a subsequent patch.
> > > > >
> > > > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > > > [...]
> > > >
> > > > > diff --git a/qapi/migration.json b/qapi/migration.json
> > > > > index a605dc2..a26960b 100644
> > > > > --- a/qapi/migration.json
> > > > > +++ b/qapi/migration.json
> > > > > @@ -1578,11 +1578,12 @@
> > > > > # The migration channel-type request options.
> > > > > #
> > > > > # @main: Main outbound migration channel.
> > > > > +# @cpr: cpr state channel.
> > > > >
> > > > What does "cpr" stand for?
> > >
> > > docs/devel/migration/CPR.rst: CheckPoint and Restart (CPR)
> >
> > Suggest something like
> >
> > # The migration channel-type request options.
> > #
> > # @main: Main outbound migration channel.
> > #
> > # @cpr: Checkpoint and restart state channel
> >
> > A quick glance at docs/devel/migration/CPR.rst makes me wonder: is that
> > really *developer* documentation?
> >
> > Should we have something meant for *users*, too? QAPI docs could then
> > link to it.
>
> I agree, CPR.rst is user documentation.
>
> Peter, are you OK with me moving it to the "System Emulation" section of
> the documentation?
Considering CPR is very closely attached to migration, while we do have the
migration doc in one place right now in devel/... it may make it harder for
people to find relevant info.
It might indeed be an issue, and it may be a more generic one: the migration
doc (no matter whether it's user or devel oriented) has always stayed in
devel/ so far.
As of now.. How about we still keep it in devel/migration/ so migration
stuff is together, but then we move user-relevant migration docs out
instead? That may contain more than CPR.
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 10/19] migration: cpr channel
2024-12-18 20:27 ` Peter Xu
@ 2024-12-18 20:31 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 20:31 UTC (permalink / raw)
To: Peter Xu
Cc: Markus Armbruster, qemu-devel, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/18/2024 3:27 PM, Peter Xu wrote:
> On Wed, Dec 18, 2024 at 02:53:16PM -0500, Steven Sistare wrote:
>> On 12/6/2024 4:31 AM, Markus Armbruster wrote:
>>> Steven Sistare <steven.sistare@oracle.com> writes:
>>>
>>>> On 12/5/2024 10:37 AM, Markus Armbruster wrote:
>>>>> Steve Sistare <steven.sistare@oracle.com> writes:
>>>>>
>>>>>> Add the 'cpr' channel type, and stash the incoming cpr channel for use
>>>>>> in a subsequent patch.
>>>>>>
>>>>>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>>>>> [...]
>>>>>
>>>>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>>>>> index a605dc2..a26960b 100644
>>>>>> --- a/qapi/migration.json
>>>>>> +++ b/qapi/migration.json
>>>>>> @@ -1578,11 +1578,12 @@
>>>>>> # The migration channel-type request options.
>>>>>> #
>>>>>> # @main: Main outbound migration channel.
>>>>>> +# @cpr: cpr state channel.
>>>>>>
>>>>> What does "cpr" stand for?
>>>>
>>>> docs/devel/migration/CPR.rst: CheckPoint and Restart (CPR)
>>>
>>> Suggest something like
>>>
>>> # The migration channel-type request options.
>>> #
>>> # @main: Main outbound migration channel.
>>> #
>>> # @cpr: Checkpoint and restart state channel
>>>
>>> A quick glance at docs/devel/migration/CPR.rst makes me wonder: is that
>>> really *developer* documentation?
>>>
>>> Should we have something meant for *users*, too? QAPI docs could then
>>> link to it.
>>
>> I agree, CPR.rst is user documentation.
>>
>> Peter, are you OK with me moving it to the "System Emulation" section of
>> the documentation?
>
> Considering CPR is very closely attached to migration, while we do have the
> migration doc in one place right now in devel/... it may make it harder for
> people to find relevant info.
>
> It might indeed be an issue, and it may be a more generic one: the migration
> doc (no matter whether it's user or devel oriented) has always stayed in
> devel/ so far.
>
> As of now.. How about we still keep it in devel/migration/ so migration
> stuff is together, but then we move user-relevant migration docs out
> instead? That may contain more than CPR.
Fine with me - steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 11/19] migration: SCM_RIGHTS for QEMUFile
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (9 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 10/19] migration: cpr channel Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-02 13:20 ` [PATCH V4 12/19] migration: VMSTATE_FD Steve Sistare
` (7 subsequent siblings)
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Define functions to put/get file descriptors to/from a QEMUFile, for qio
channels that support SCM_RIGHTS. Maintain ordering such that
put(A), put(fd), put(B)
followed by
get(A), get(fd), get(B)
always succeeds. Other get orderings may succeed but are not guaranteed.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
migration/qemu-file.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++---
migration/qemu-file.h | 2 ++
migration/trace-events | 2 ++
3 files changed, 83 insertions(+), 4 deletions(-)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index b6d2f58..728c7dd 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -37,6 +37,11 @@
#define IO_BUF_SIZE 32768
#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
+typedef struct FdEntry {
+ QTAILQ_ENTRY(FdEntry) entry;
+ int fd;
+} FdEntry;
+
struct QEMUFile {
QIOChannel *ioc;
bool is_writable;
@@ -51,6 +56,9 @@ struct QEMUFile {
int last_error;
Error *last_error_obj;
+
+ bool can_pass_fd;
+ QTAILQ_HEAD(, FdEntry) fds;
};
/*
@@ -109,6 +117,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
object_ref(ioc);
f->ioc = ioc;
f->is_writable = is_writable;
+ f->can_pass_fd = qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS);
+ QTAILQ_INIT(&f->fds);
return f;
}
@@ -310,6 +320,10 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
int len;
int pending;
Error *local_error = NULL;
+ g_autofree int *fds = NULL;
+ size_t nfd = 0;
+ int **pfds = f->can_pass_fd ? &fds : NULL;
+ size_t *pnfd = f->can_pass_fd ? &nfd : NULL;
assert(!qemu_file_is_writable(f));
@@ -325,10 +339,9 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
}
do {
- len = qio_channel_read(f->ioc,
- (char *)f->buf + pending,
- IO_BUF_SIZE - pending,
- &local_error);
+ struct iovec iov = { f->buf + pending, IO_BUF_SIZE - pending };
+ len = qio_channel_readv_full(f->ioc, &iov, 1, pfds, pnfd, 0,
+ &local_error);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(f->ioc, G_IO_IN);
@@ -348,9 +361,65 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
qemu_file_set_error_obj(f, len, local_error);
}
+ for (int i = 0; i < nfd; i++) {
+ FdEntry *fde = g_new0(FdEntry, 1);
+ fde->fd = fds[i];
+ QTAILQ_INSERT_TAIL(&f->fds, fde, entry);
+ }
+
return len;
}
+/*
+ * Send @fd over @f's channel, attached to a single dummy data byte.
+ *
+ * Returns 0 on success. On failure the error is reported, -EIO is recorded
+ * on @f via qemu_file_set_error_obj, and -1 is returned.
+ */
+int qemu_file_put_fd(QEMUFile *f, int fd)
+{
+ int ret = 0;
+ QIOChannel *ioc = qemu_file_get_ioc(f);
+ Error *err = NULL;
+ struct iovec iov = { (void *)" ", 1 };
+
+ /*
+ * Send a dummy byte so qemu_fill_buffer on the receiving side does not
+ * fail with a len=0 error. Flush first to maintain ordering wrt other
+ * data.
+ */
+
+ qemu_fflush(f);
+ /* Anything short of the one dummy byte being written counts as failure. */
+ if (qio_channel_writev_full(ioc, &iov, 1, &fd, 1, 0, &err) < 1) {
+ /* Report a copy; the original error object is handed to @f below. */
+ error_report_err(error_copy(err));
+ qemu_file_set_error_obj(f, -EIO, err);
+ ret = -1;
+ }
+ trace_qemu_file_put_fd(f->ioc->name, fd, ret);
+ return ret;
+}
+
+/*
+ * Return the next file descriptor received on @f, or -1 if the channel
+ * cannot pass fds or no fd is pending.
+ *
+ * A returned fd is removed from @f's pending list, so qemu_fclose will no
+ * longer close it. If the channel does not support fd passing, the error
+ * is reported and -EIO is recorded on @f.
+ */
+int qemu_file_get_fd(QEMUFile *f)
+{
+ int fd = -1;
+ FdEntry *fde;
+
+ if (!f->can_pass_fd) {
+ Error *err = NULL;
+ error_setg(&err, "%s does not support fd passing", f->ioc->name);
+ /* Report a copy; the original error object is handed to @f below. */
+ error_report_err(error_copy(err));
+ qemu_file_set_error_obj(f, -EIO, err);
+ goto out;
+ }
+
+ /* Force the dummy byte and its fd passenger to appear. */
+ qemu_peek_byte(f, 0);
+
+ fde = QTAILQ_FIRST(&f->fds);
+ if (fde) {
+ qemu_get_byte(f); /* Drop the dummy byte */
+ fd = fde->fd;
+ QTAILQ_REMOVE(&f->fds, fde, entry);
+ /* The entry was heap-allocated when the fd arrived; free it now. */
+ g_free(fde);
+ }
+out:
+ trace_qemu_file_get_fd(f->ioc->name, fd);
+ return fd;
+}
+
/** Closes the file
*
* Returns negative error value if any error happened on previous operations or
@@ -361,11 +430,17 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
*/
int qemu_fclose(QEMUFile *f)
{
+ FdEntry *fde, *next;
int ret = qemu_fflush(f);
int ret2 = qio_channel_close(f->ioc, NULL);
if (ret >= 0) {
ret = ret2;
}
+ QTAILQ_FOREACH_SAFE(fde, &f->fds, entry, next) {
+ warn_report("qemu_fclose: received fd %d was never claimed", fde->fd);
+ close(fde->fd);
+ g_free(fde);
+ }
g_clear_pointer(&f->ioc, object_unref);
error_free(f->last_error_obj);
g_free(f);
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 11c2120..3e47a20 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -79,5 +79,7 @@ size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
off_t pos);
QIOChannel *qemu_file_get_ioc(QEMUFile *file);
+int qemu_file_put_fd(QEMUFile *f, int fd);
+int qemu_file_get_fd(QEMUFile *f);
#endif
diff --git a/migration/trace-events b/migration/trace-events
index 89c0244..9388f81 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -88,6 +88,8 @@ put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
# qemu-file.c
qemu_file_fclose(void) ""
+qemu_file_put_fd(const char *name, int fd, int ret) "ioc %s, fd %d -> status %d"
+qemu_file_get_fd(const char *name, int fd) "ioc %s -> fd %d"
# ram.c
get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 12/19] migration: VMSTATE_FD
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (10 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 11/19] migration: SCM_RIGHTS for QEMUFile Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-02 13:20 ` [PATCH V4 13/19] migration: cpr-transfer save and load Steve Sistare
` (6 subsequent siblings)
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Define VMSTATE_FD for declaring a file descriptor field in a
VMStateDescription.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
include/migration/vmstate.h | 9 +++++++++
migration/vmstate-types.c | 23 +++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index f313f2f..a1dfab4 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -230,6 +230,7 @@ extern const VMStateInfo vmstate_info_uint8;
extern const VMStateInfo vmstate_info_uint16;
extern const VMStateInfo vmstate_info_uint32;
extern const VMStateInfo vmstate_info_uint64;
+extern const VMStateInfo vmstate_info_fd;
/** Put this in the stream when migrating a null pointer.*/
#define VMS_NULLPTR_MARKER (0x30U) /* '0' */
@@ -902,6 +903,9 @@ extern const VMStateInfo vmstate_info_qlist;
#define VMSTATE_UINT64_V(_f, _s, _v) \
VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64, uint64_t)
+#define VMSTATE_FD_V(_f, _s, _v) \
+ VMSTATE_SINGLE(_f, _s, _v, vmstate_info_fd, int32_t)
+
#ifdef CONFIG_LINUX
#define VMSTATE_U8_V(_f, _s, _v) \
@@ -936,6 +940,9 @@ extern const VMStateInfo vmstate_info_qlist;
#define VMSTATE_UINT64(_f, _s) \
VMSTATE_UINT64_V(_f, _s, 0)
+#define VMSTATE_FD(_f, _s) \
+ VMSTATE_FD_V(_f, _s, 0)
+
#ifdef CONFIG_LINUX
#define VMSTATE_U8(_f, _s) \
@@ -1009,6 +1016,8 @@ extern const VMStateInfo vmstate_info_qlist;
#define VMSTATE_UINT64_TEST(_f, _s, _t) \
VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_uint64, uint64_t)
+#define VMSTATE_FD_TEST(_f, _s, _t) \
+ VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_fd, int32_t)
#define VMSTATE_TIMER_PTR_TEST(_f, _s, _test) \
VMSTATE_POINTER_TEST(_f, _s, _test, vmstate_info_timer, QEMUTimer *)
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index e83bfcc..f31deb3 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -314,6 +314,29 @@ const VMStateInfo vmstate_info_uint64 = {
.put = put_uint64,
};
+/* File descriptor communicated via SCM_RIGHTS */
+
+/*
+ * VMState load handler: store the next fd received on @f into the int32_t
+ * field at @pv. Always returns 0; if no fd could be obtained the field is
+ * set to -1, the value qemu_file_get_fd returns in that case.
+ */
+static int get_fd(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field)
+{
+ int32_t *v = pv;
+ *v = qemu_file_get_fd(f);
+ return 0;
+}
+
+/*
+ * VMState save handler: send the fd held in the int32_t field at @pv over
+ * @f. Returns the result of qemu_file_put_fd (0 on success, -1 on failure).
+ */
+static int put_fd(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+ int32_t *v = pv;
+ return qemu_file_put_fd(f, *v);
+}
+
+const VMStateInfo vmstate_info_fd = {
+ .name = "fd",
+ .get = get_fd,
+ .put = put_fd,
+};
+
static int get_nullptr(QEMUFile *f, void *pv, size_t size,
const VMStateField *field)
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 13/19] migration: cpr-transfer save and load
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (11 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 12/19] migration: VMSTATE_FD Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-02 13:20 ` [PATCH V4 14/19] migration: cpr-transfer mode Steve Sistare
` (5 subsequent siblings)
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add functions to create a QEMUFile based on a unix URI, for saving or
loading, for use by cpr-transfer mode to preserve CPR state.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
---
include/migration/cpr.h | 3 ++
migration/cpr-transfer.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
migration/meson.build | 1 +
migration/trace-events | 2 ++
4 files changed, 82 insertions(+)
create mode 100644 migration/cpr-transfer.c
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index e833fae..51ac7f4 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -25,4 +25,7 @@ int cpr_state_load(Error **errp);
void cpr_state_close(void);
struct QIOChannel *cpr_state_ioc(void);
+QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
+QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
+
#endif
diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
new file mode 100644
index 0000000..0fbdf66
--- /dev/null
+++ b/migration/cpr-transfer.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022, 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "io/channel-file.h"
+#include "io/channel-socket.h"
+#include "io/net-listener.h"
+#include "migration/cpr.h"
+#include "migration/migration.h"
+#include "migration/savevm.h"
+#include "migration/qemu-file.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+/*
+ * Connect to the unix socket described by @channel and return a writable
+ * QEMUFile on it, for saving CPR state.
+ *
+ * Returns NULL and sets @errp if the address is not a unix socket or the
+ * connection fails.
+ */
+QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp)
+{
+ QIOChannel *ioc;
+ QEMUFile *f;
+ MigrationAddress *addr = channel->addr;
+
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
+
+ QIOChannelSocket *sioc = qio_channel_socket_new();
+ SocketAddress *saddr = &addr->u.socket;
+
+ if (qio_channel_socket_connect_sync(sioc, saddr, errp)) {
+ object_unref(OBJECT(sioc));
+ return NULL;
+ }
+ ioc = QIO_CHANNEL(sioc);
+
+ } else {
+ error_setg(errp, "bad cpr channel address; must be unix");
+ return NULL;
+ }
+
+ trace_cpr_transfer_output(addr->u.socket.u.q_unix.path);
+ qio_channel_set_name(ioc, "cpr-out");
+ f = qemu_file_new_output(ioc);
+ /*
+ * The QEMUFile holds its own reference on the channel (taken in
+ * qemu_file_new_impl, dropped in qemu_fclose). Drop the creation
+ * reference here so the channel is freed when the file is closed.
+ */
+ object_unref(OBJECT(ioc));
+ return f;
+}
+
+/*
+ * Listen on the unix socket described by @channel, wait for one client to
+ * connect, and return a readable QEMUFile on that connection, for loading
+ * CPR state.
+ *
+ * Returns NULL and sets @errp if the address is not a unix socket or the
+ * listener cannot be opened.
+ */
+QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp)
+{
+ QIOChannel *ioc;
+ QEMUFile *f;
+ MigrationAddress *addr = channel->addr;
+
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET &&
+ addr->u.socket.type == SOCKET_ADDRESS_TYPE_UNIX) {
+
+ QIOChannelSocket *sioc;
+ SocketAddress *saddr = &addr->u.socket;
+ QIONetListener *listener = qio_net_listener_new();
+
+ qio_net_listener_set_name(listener, "cpr-socket-listener");
+ if (qio_net_listener_open_sync(listener, saddr, 1, errp) < 0) {
+ object_unref(OBJECT(listener));
+ return NULL;
+ }
+
+ sioc = qio_net_listener_wait_client(listener);
+ ioc = QIO_CHANNEL(sioc);
+ /* Only one client is expected; the listener is no longer needed. */
+ object_unref(OBJECT(listener));
+
+ } else {
+ error_setg(errp, "bad cpr channel socket type; must be unix");
+ return NULL;
+ }
+
+ trace_cpr_transfer_input(addr->u.socket.u.q_unix.path);
+ qio_channel_set_name(ioc, "cpr-in");
+ f = qemu_file_new_input(ioc);
+ /*
+ * The QEMUFile holds its own reference on the channel (taken in
+ * qemu_file_new_impl, dropped in qemu_fclose). Drop the reference
+ * returned by the listener so the channel is freed on close.
+ */
+ object_unref(OBJECT(ioc));
+ return f;
+}
diff --git a/migration/meson.build b/migration/meson.build
index 039f0f9..d89435b 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -14,6 +14,7 @@ system_ss.add(files(
'channel.c',
'channel-block.c',
'cpr.c',
+ 'cpr-transfer.c',
'cpu-throttle.c',
'dirtyrate.c',
'exec.c',
diff --git a/migration/trace-events b/migration/trace-events
index 9388f81..1dd394d 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -350,6 +350,8 @@ cpr_delete_fd(const char *name, int id) "%s, id %d"
cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d"
cpr_state_save(const char *mode) "%s mode"
cpr_state_load(const char *mode) "%s mode"
+cpr_transfer_input(const char *path) "%s"
+cpr_transfer_output(const char *path) "%s"
# block-dirty-bitmap.c
send_bitmap_header_enter(void) ""
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 14/19] migration: cpr-transfer mode
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (12 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 13/19] migration: cpr-transfer save and load Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-04 16:10 ` Steven Sistare
2024-12-10 12:26 ` Markus Armbruster
2024-12-02 13:20 ` [PATCH V4 15/19] tests/migration-test: memory_backend Steve Sistare
` (4 subsequent siblings)
18 siblings, 2 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add the cpr-transfer migration mode. Usage:
qemu-system-$arch -machine aux-ram-share=on ...
start new QEMU with "-incoming <main-uri> -incoming <cpr-channel>"
Issue commands to old QEMU:
migrate_set_parameter mode cpr-transfer
{"execute": "migrate", ...
{"channel-type": "main"...}, {"channel-type": "cpr"...} ... }
The migrate command stops the VM, saves CPR state to cpr-channel, saves
normal migration state to main-uri, and old QEMU enters the postmigrate
state. The user starts new QEMU on the same host as old QEMU, with the
same arguments as old QEMU, plus two -incoming options. Guest RAM is
preserved in place, albeit with new virtual addresses in new QEMU.
This mode requires a second migration channel of type "cpr", in the
channel arguments on the outgoing side, and in a second -incoming
command-line parameter on the incoming side.
Memory-backend objects must have the share=on attribute, but
memory-backend-epc is not supported. The VM must be started with
the '-machine aux-ram-share=on' option, which allows anonymous
memory to be transferred in place to the new process. The memfds
are kept open by sending the descriptors to new QEMU via the CPR
channel, which must support SCM_RIGHTS, and they are mmap'd in new QEMU.
The implementation splits qmp_migrate into start and finish functions.
Start sends CPR state to new QEMU, which responds by closing the CPR
channel. Old QEMU detects the HUP then calls finish, which connects
the main migration channel.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
migration/cpr.c | 23 +++++++---
migration/migration.c | 107 +++++++++++++++++++++++++++++++++++++++++++++-
migration/migration.h | 2 +
migration/options.c | 12 +++++-
migration/ram.c | 2 +
migration/vmstate-types.c | 1 +
qapi/migration.json | 37 +++++++++++++++-
stubs/vmstate.c | 7 +++
system/vl.c | 6 +++
9 files changed, 188 insertions(+), 9 deletions(-)
diff --git a/migration/cpr.c b/migration/cpr.c
index f4a795f..560546d 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -45,7 +45,7 @@ static const VMStateDescription vmstate_cpr_fd = {
VMSTATE_UINT32(namelen, CprFd),
VMSTATE_VBUFFER_ALLOC_UINT32(name, CprFd, 0, NULL, namelen),
VMSTATE_INT32(id, CprFd),
- VMSTATE_INT32(fd, CprFd),
+ VMSTATE_FD(fd, CprFd),
VMSTATE_END_OF_LIST()
}
};
@@ -139,8 +139,14 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
trace_cpr_state_save(MigMode_str(mode));
- /* set f based on mode in a later patch in this series */
- return 0;
+ if (mode == MIG_MODE_CPR_TRANSFER) {
+ f = cpr_transfer_output(channel, errp);
+ } else {
+ return 0;
+ }
+ if (!f) {
+ return -1;
+ }
qemu_put_be32(f, QEMU_CPR_FILE_MAGIC);
qemu_put_be32(f, QEMU_CPR_FILE_VERSION);
@@ -170,8 +176,15 @@ int cpr_state_load(Error **errp)
QEMUFile *f;
MigMode mode = 0;
- /* set f and mode based on other parameters later in this patch series */
- return 0;
+ if (cpr_channel) {
+ mode = MIG_MODE_CPR_TRANSFER;
+ f = cpr_transfer_input(cpr_channel, errp);
+ } else {
+ return 0;
+ }
+ if (!f) {
+ return -1;
+ }
trace_cpr_state_load(MigMode_str(mode));
diff --git a/migration/migration.c b/migration/migration.c
index a5cf148..81a1ced 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -76,6 +76,7 @@
static NotifierWithReturnList migration_state_notifiers[] = {
NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
+ NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_TRANSFER),
};
/* Messages sent on the return path from destination to source */
@@ -109,6 +110,7 @@ static int migration_maybe_pause(MigrationState *s,
static void migrate_fd_cancel(MigrationState *s);
static bool close_return_path_on_source(MigrationState *s);
static void migration_completion_end(MigrationState *s);
+static void migrate_hup_delete(MigrationState *s);
static void migration_downtime_start(MigrationState *s)
{
@@ -204,6 +206,12 @@ migration_channels_and_transport_compatible(MigrationAddress *addr,
return false;
}
+ if (migrate_mode() == MIG_MODE_CPR_TRANSFER &&
+ addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
+ error_setg(errp, "Migration requires streamable transport (eg unix)");
+ return false;
+ }
+
return true;
}
@@ -319,6 +327,7 @@ void migration_cancel(const Error *error)
qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
}
migrate_fd_cancel(current_migration);
+ migrate_hup_delete(current_migration);
}
void migration_shutdown(void)
@@ -419,6 +428,7 @@ void migration_incoming_state_destroy(void)
mis->postcopy_qemufile_dst = NULL;
}
+ cpr_set_cpr_channel(NULL);
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
@@ -720,6 +730,9 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,
} else {
error_setg(errp, "unknown migration protocol: %s", uri);
}
+
+ /* Close cpr socket to tell source that we are listening */
+ cpr_state_close();
}
static void process_incoming_migration_bh(void *opaque)
@@ -1395,6 +1408,8 @@ static void migrate_fd_cleanup(MigrationState *s)
s->vmdesc = NULL;
qemu_savevm_state_cleanup();
+ cpr_state_close();
+ migrate_hup_delete(s);
close_return_path_on_source(s);
@@ -1506,6 +1521,7 @@ static void migrate_fd_error(MigrationState *s, const Error *error)
static void migrate_fd_cancel(MigrationState *s)
{
int old_state ;
+ bool setup = (s->state == MIGRATION_STATUS_SETUP);
trace_migrate_fd_cancel();
@@ -1550,6 +1566,17 @@ static void migrate_fd_cancel(MigrationState *s)
s->block_inactive = false;
}
}
+
+ /*
+ * If qmp_migrate_finish has not been called, then there is no path that
+ * will complete the cancellation. Do it now.
+ */
+ if (setup && !s->to_dst_file) {
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_CANCELLED);
+ cpr_state_close();
+ migrate_hup_delete(s);
+ vm_resume(s->vm_old_state);
+ }
}
void migration_add_notifier_mode(NotifierWithReturn *notify,
@@ -1662,7 +1689,9 @@ bool migration_thread_is_self(void)
bool migrate_mode_is_cpr(MigrationState *s)
{
- return s->parameters.mode == MIG_MODE_CPR_REBOOT;
+ MigMode mode = s->parameters.mode;
+ return mode == MIG_MODE_CPR_REBOOT ||
+ mode == MIG_MODE_CPR_TRANSFER;
}
int migrate_init(MigrationState *s, Error **errp)
@@ -2037,6 +2066,40 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
return true;
}
+static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested,
+ Error **errp);
+
+/*
+ * Install a watch that calls @cb with @opaque when @ioc reports G_IO_HUP.
+ * The source is remembered in s->hup_source so that migrate_hup_delete can
+ * remove it later.
+ */
+static void migrate_hup_add(MigrationState *s, QIOChannel *ioc, GSourceFunc cb,
+ void *opaque)
+{
+ s->hup_source = qio_channel_create_watch(ioc, G_IO_HUP);
+ g_source_set_callback(s->hup_source, cb, opaque, NULL);
+ /* A NULL context attaches the source to the default main context. */
+ g_source_attach(s->hup_source, NULL);
+}
+
+/*
+ * Remove the HUP watch recorded in s->hup_source, if any, and clear the
+ * pointer. Does nothing when no watch is installed, so it may be called
+ * repeatedly.
+ */
+static void migrate_hup_delete(MigrationState *s)
+{
+ if (s->hup_source) {
+ g_source_destroy(s->hup_source);
+ g_source_unref(s->hup_source);
+ s->hup_source = NULL;
+ }
+}
+
+/*
+ * Watch callback that completes a migration once the watched channel
+ * reports its condition. @opaque is a MigrationAddress owned by this
+ * callback; it is freed before returning.
+ *
+ * Returns G_SOURCE_REMOVE so the watch does not fire again.
+ */
+static gboolean qmp_migrate_finish_cb(QIOChannel *channel,
+ GIOCondition cond,
+ void *opaque)
+{
+ MigrationAddress *addr = opaque;
+
+ /* errp is NULL: failures in qmp_migrate_finish are not propagated here. */
+ qmp_migrate_finish(addr, false, NULL);
+
+ cpr_state_close();
+ migrate_hup_delete(migrate_get_current());
+ qapi_free_MigrationAddress(addr);
+ return G_SOURCE_REMOVE;
+}
+
void qmp_migrate(const char *uri, bool has_channels,
MigrationChannelList *channels, bool has_detach, bool detach,
bool has_resume, bool resume, Error **errp)
@@ -2047,6 +2110,7 @@ void qmp_migrate(const char *uri, bool has_channels,
g_autoptr(MigrationChannel) channel = NULL;
MigrationAddress *addr = NULL;
MigrationChannel *channelv[MIGRATION_CHANNEL_TYPE__MAX] = { NULL };
+ MigrationChannel *cpr_channel = NULL;
/*
* Having preliminary checks for uri and channel
@@ -2067,6 +2131,7 @@ void qmp_migrate(const char *uri, bool has_channels,
}
channelv[type] = channels->value;
}
+ cpr_channel = channelv[MIGRATION_CHANNEL_TYPE_CPR];
addr = channelv[MIGRATION_CHANNEL_TYPE_MAIN]->addr;
if (!addr) {
error_setg(errp, "Channel list has no main entry");
@@ -2087,12 +2152,52 @@ void qmp_migrate(const char *uri, bool has_channels,
return;
}
+ if (s->parameters.mode == MIG_MODE_CPR_TRANSFER && !cpr_channel) {
+ error_setg(errp, "missing 'cpr' migration channel");
+ return;
+ }
+
resume_requested = has_resume && resume;
if (!migrate_prepare(s, resume_requested, errp)) {
/* Error detected, put into errp */
return;
}
+ if (cpr_state_save(cpr_channel, &local_err)) {
+ goto out;
+ }
+
+ /*
+ * For cpr-transfer, the target may not be listening yet on the migration
+ * channel, because first it must finish cpr_state_load. The target tells
+ * us it is listening by closing the cpr-state socket. Wait for that HUP
+ * event before connecting in qmp_migrate_finish.
+ *
+ * The HUP could occur because the target fails while reading CPR state,
+ * in which case the target will not listen for the incoming migration
+ * connection, so qmp_migrate_finish will fail to connect, and then recover.
+ */
+ if (s->parameters.mode == MIG_MODE_CPR_TRANSFER) {
+ migrate_hup_add(s, cpr_state_ioc(), (GSourceFunc)qmp_migrate_finish_cb,
+ QAPI_CLONE(MigrationAddress, addr));
+
+ } else {
+ qmp_migrate_finish(addr, resume_requested, errp);
+ }
+
+out:
+ if (local_err) {
+ migrate_fd_error(s, local_err);
+ error_propagate(errp, local_err);
+ }
+}
+
+static void qmp_migrate_finish(MigrationAddress *addr, bool resume_requested,
+ Error **errp)
+{
+ MigrationState *s = migrate_get_current();
+ Error *local_err = NULL;
+
if (!resume_requested) {
if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
return;
diff --git a/migration/migration.h b/migration/migration.h
index 5cd0f29..bdae228 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -471,6 +471,8 @@ struct MigrationState {
bool switchover_acked;
/* Is this a rdma migration */
bool rdma_migration;
+
+ GSource *hup_source;
};
void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
diff --git a/migration/options.c b/migration/options.c
index ad8d698..bb24bc9 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -22,6 +22,7 @@
#include "qapi/qmp/qnull.h"
#include "sysemu/runstate.h"
#include "migration/colo.h"
+#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
@@ -745,9 +746,16 @@ uint64_t migrate_max_postcopy_bandwidth(void)
MigMode migrate_mode(void)
{
- MigrationState *s = migrate_get_current();
- MigMode mode = s->parameters.mode;
+ MigMode mode;
+ /*
+ * cpr_channel is only set during the early cpr-transfer loading stage,
+ * after which it is cleared.
+ */
+ if (cpr_get_cpr_channel()) {
+ return MIG_MODE_CPR_TRANSFER;
+ }
+ mode = migrate_get_current()->parameters.mode;
assert(mode >= 0 && mode < MIG_MODE__MAX);
return mode;
}
diff --git a/migration/ram.c b/migration/ram.c
index 05ff9eb..fa6a909 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -216,7 +216,9 @@ static bool postcopy_preempt_active(void)
bool migrate_ram_is_ignored(RAMBlock *block)
{
+ MigMode mode = migrate_mode();
return !qemu_ram_is_migratable(block) ||
+ mode == MIG_MODE_CPR_TRANSFER ||
(migrate_ignore_shared() && qemu_ram_is_shared(block)
&& qemu_ram_is_named_file(block));
}
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index f31deb3..2210f0c 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -15,6 +15,7 @@
#include "qemu-file.h"
#include "migration.h"
#include "migration/vmstate.h"
+#include "migration/client-options.h"
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "trace.h"
diff --git a/qapi/migration.json b/qapi/migration.json
index a26960b..1bc963f 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -614,9 +614,44 @@
# or COLO.
#
# (since 8.2)
+#
+# @cpr-transfer: This mode allows the user to transfer a guest to a
+# new QEMU instance on the same host with minimal guest pause
+# time, by preserving guest RAM in place, albeit with new virtual
+# addresses in new QEMU.
+#
+# The user starts new QEMU on the same host as old QEMU, with the
+# same arguments as old QEMU, plus the -incoming option. The
+# user issues the migrate command to old QEMU, which stops the VM,
+# saves state to the migration channels, and enters the
+# postmigrate state. Execution resumes in new QEMU.
+#
+# This mode requires a second migration channel type "cpr" in the
+# channel arguments on the outgoing side. The channel must be a
+# type, such as unix socket, that supports SCM_RIGHTS. However,
+# the cpr channel cannot be added to the list of channels for a
+# migrate-incoming command, because it must be read before new
+# QEMU opens a monitor. Instead, the user passes the channel as a
+# second -incoming command-line argument to new QEMU using JSON
+# syntax.
+#
+# Memory-backend objects must have the share=on attribute, but
+# memory-backend-epc is not supported. The VM must be started
+# with the '-machine aux-ram-share=on' option.
+#
+# The incoming migration channel cannot be a file type, and for
+# the tcp type, the port cannot be 0 (meaning dynamically choose
+# a port).
+#
+# When using -incoming defer, you must issue the migrate command
+# to old QEMU before issuing any monitor commands to new QEMU.
+# However, new QEMU does not open and read the migration stream
+# until you issue the migrate-incoming command.
+#
+# (since 10.0)
##
{ 'enum': 'MigMode',
- 'data': [ 'normal', 'cpr-reboot' ] }
+ 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
##
# @ZeroPageDetection:
diff --git a/stubs/vmstate.c b/stubs/vmstate.c
index 8513d92..c190762 100644
--- a/stubs/vmstate.c
+++ b/stubs/vmstate.c
@@ -1,5 +1,7 @@
#include "qemu/osdep.h"
#include "migration/vmstate.h"
+#include "qapi/qapi-types-migration.h"
+#include "migration/client-options.h"
int vmstate_register_with_alias_id(VMStateIf *obj,
uint32_t instance_id,
@@ -21,3 +23,8 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
{
return true;
}
+
+MigMode migrate_mode(void)
+{
+ return MIG_MODE_NORMAL;
+}
diff --git a/system/vl.c b/system/vl.c
index 40e049e..d965be8 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -3741,6 +3741,12 @@ void qemu_init(int argc, char **argv)
qemu_create_machine(machine_opts_dict);
+ /*
+ * Load incoming CPR state before any devices are created, because it
+ * contains file descriptors that are needed in device initialization code.
+ */
+ cpr_state_load(&error_fatal);
+
suspend_mux_open();
qemu_disable_default_devices();
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 14/19] migration: cpr-transfer mode
2024-12-02 13:20 ` [PATCH V4 14/19] migration: cpr-transfer mode Steve Sistare
@ 2024-12-04 16:10 ` Steven Sistare
2024-12-10 12:26 ` Markus Armbruster
1 sibling, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-04 16:10 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/2/2024 8:20 AM, Steve Sistare wrote:
[...]
> +
> + /*
> + * If qmp_migrate_finish has not been called, then there is no path that
> + * will complete the cancellation. Do it now.
> + */
> + if (setup && !s->to_dst_file) {
> + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_CANCELLED);
> + cpr_state_close();
> + migrate_hup_delete(s);
> + vm_resume(s->vm_old_state);
> + }
I forgot to make changes here as we discussed in
https://lore.kernel.org/qemu-devel/2dc614cb-8754-423f-8c31-e5425075af92@oracle.com/
I will move vm_resume to the patch "stop vm earlier for cpr" in a later series, and
verify the current state:
+ if (setup && !s->to_dst_file) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, MIGRATION_STATUS_CANCELLED);
+ cpr_state_close();
+ migrate_hup_delete(s);
+ }
- Steve
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 14/19] migration: cpr-transfer mode
2024-12-02 13:20 ` [PATCH V4 14/19] migration: cpr-transfer mode Steve Sistare
2024-12-04 16:10 ` Steven Sistare
@ 2024-12-10 12:26 ` Markus Armbruster
2024-12-11 22:05 ` Steven Sistare
1 sibling, 1 reply; 78+ messages in thread
From: Markus Armbruster @ 2024-12-10 12:26 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
Steve Sistare <steven.sistare@oracle.com> writes:
> Add the cpr-transfer migration mode. Usage:
>
> qemu-system-$arch -machine aux-ram-share=on ...
>
> start new QEMU with "-incoming <main-uri> -incoming <cpr-channel>"
>
> Issue commands to old QEMU:
> migrate_set_parameter mode cpr-transfer
>
> {"execute": "migrate", ...
> {"channel-type": "main"...}, {"channel-type": "cpr"...} ... }
Much technical detail here that won't make sense to the reader until
further down, but next to nothing on what the thing actually
accomplishes. Makes the commit message unnecessarily hard to
understand. But please read on.
> The migrate command stops the VM, saves CPR state to cpr-channel, saves
> normal migration state to main-uri, and old QEMU enters the postmigrate
> state. The user starts new QEMU on the same host as old QEMU, with the
> same arguments as old QEMU,
Any additional requirements over traditional migration?
There, "same arguments" is sufficient, but not necessary. For instance,
changing certain backends is quite possible.
> plus two -incoming options.
Two -incoming options to define two migration channels, the traditional
one of MigrationChannelType "main", and another one of
MigrationChannelType "cpr"?
> Guest RAM is
> preserved in place, albeit with new virtual addresses in new QEMU.
>
> This mode requires a second migration channel of type "cpr", in the
> channel arguments on the outgoing side, and in a second -incoming
> command-line parameter on the incoming side.
>
> Memory-backend objects must have the share=on attribute, but
> memory-backend-epc is not supported. The VM must be started with
> the '-machine aux-ram-share=on' option, which allows anonymous
> memory to be transferred in place to the new process. The memfds
> are kept open by sending the descriptors to new QEMU via the CPR
> channel, which must support SCM_RIGHTS, and they are mmap'd in new QEMU.
>
> The implementation splits qmp_migrate into start and finish functions.
> Start sends CPR state to new QEMU, which responds by closing the CPR
> channel. Old QEMU detects the HUP then calls finish, which connects
> the main migration channel.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
I'd lead with a brief explanation of the feature and its benefits.
Could steal from the cover letter like this:
New migration mode cpr-transfer enables transferring a guest to a
new QEMU instance on the same host with minimal guest pause time, by
preserving guest RAM in place, albeit with new virtual addresses in
new QEMU, and by preserving device file descriptors.
Then talk about required special setup. I see aux-ram-share=on.
Anything else? Any differences between source and destination QEMU
there?
Then talk about the two channels. First what they do, second how to
create their destination end with -incoming, third how to create their
source end with "migrate".
Finally mention whatever technical detail you believe needs mentioning
here.
[...]
> diff --git a/qapi/migration.json b/qapi/migration.json
> index a26960b..1bc963f 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -614,9 +614,44 @@
> # or COLO.
> #
> # (since 8.2)
> +#
> +# @cpr-transfer: This mode allows the user to transfer a guest to a
> +# new QEMU instance on the same host with minimal guest pause
> +# time, by preserving guest RAM in place, albeit with new virtual
> +# addresses in new QEMU.
> +#
> +# The user starts new QEMU on the same host as old QEMU, with the
> +# the same arguments as old QEMU, plus the -incoming option.
Two of them?
> +# The
> +# user issues the migrate command to old QEMU, which stops the VM,
> +# saves state to the migration channels, and enters the
> +# postmigrate state. Execution resumes in new QEMU.
The commit message also mentions file descriptors are migrated over.
Worth mentioning here, too?
> +#
> +# This mode requires a second migration channel type "cpr" in the
> +# channel arguments on the outgoing side. The channel must be a
> +# type, such as unix socket, that supports SCM_RIGHTS. However,
This is vague. Would anything but a UNIX domain socket work?
Applies to both source and destination end?
> +# the cpr channel cannot be added to the list of channels for a
> +# migrate-incoming command, because it must be read before new
> +# QEMU opens a monitor.
Ugh! Remind me, why is that the case?
> +# Instead, the user passes the channel as a
> +# second -incoming command-line argument to new QEMU using JSON
> +# syntax.
> +#
> +# Memory-backend objects must have the share=on attribute, but
> +# memory-backend-epc is not supported. The VM must be started
> +# with the '-machine aux-ram-share=on' option.
What happens when the conditions aren't met? migrate command fails
with a useful error message?
> +#
> +# The incoming migration channel cannot be a file type, and for
> +# the tcp type, the port cannot be 0 (meaning dynamically choose
> +# a port).
Which of the two channels are you discussing?
> +#
> +# When using -incoming defer, you must issue the migrate command
> +# to old QEMU before issuing any monitor commands to new QEMU.
I'm confused. Not even qmp_capabilities? Why?
> +# However, new QEMU does not open and read the migration stream
> +# until you issue the migrate incoming command.
> +#
> +# (since 10.0)
> ##
> { 'enum': 'MigMode',
> - 'data': [ 'normal', 'cpr-reboot' ] }
> + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
>
> ##
> # @ZeroPageDetection:
[...]
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 14/19] migration: cpr-transfer mode
2024-12-10 12:26 ` Markus Armbruster
@ 2024-12-11 22:05 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-11 22:05 UTC (permalink / raw)
To: Markus Armbruster
Cc: qemu-devel, Peter Xu, Fabiano Rosas, David Hildenbrand,
Marcel Apfelbaum, Eduardo Habkost, Philippe Mathieu-Daude,
Paolo Bonzini, Daniel P. Berrange
On 12/10/2024 7:26 AM, Markus Armbruster wrote:
> Steve Sistare <steven.sistare@oracle.com> writes:
>
>> Add the cpr-transfer migration mode. Usage:
>>
>> qemu-system-$arch -machine aux-ram-share=on ...
>>
>> start new QEMU with "-incoming <main-uri> -incoming <cpr-channel>"
>>
>> Issue commands to old QEMU:
>> migrate_set_parameter mode cpr-transfer
>>
>> {"execute": "migrate", ...
>> {"channel-type": "main"...}, {"channel-type": "cpr"...} ... }
>
> Much technical detail here that won't make sense to the reader until
> further down, but next to nothing on what the thing actually
> accomplishes. Makes the commit message unnecessarily hard to
> understand. But please read on.
>
>> The migrate command stops the VM, saves CPR state to cpr-channel, saves
>> normal migration state to main-uri, and old QEMU enters the postmigrate
>> state. The user starts new QEMU on the same host as old QEMU, with the
>> same arguments as old QEMU,
>
> Any additional requirements over traditional migration?
>
> There, "same arguments" is sufficient, but not necessary. For instance,
> changing certain backends is quite possible.
No additional requirements over traditional migration.
AFAIK there is no user documentation on what arguments must be specified
to new QEMU during a migration. No words about "same arguments", or even
"same VM". I am trying to give some guidance where none currently exists,
in this commit message and in QAPI for CPR.
Perhaps this is better:
The user starts new QEMU on the same host as old QEMU, with command-line
arguments to create the same machine, plus the -incoming option for the
main migration channel, like normal live migration. In addition, the
user adds a second -incoming option with channel type "cpr", which matches
the cpr channel of the migrate command issued to old QEMU.
>> plus two -incoming options.
>
> Two -incoming options to define two migration channels, the traditional
> one of MigrationChannelType "main", and another one of
> MigrationChannelType "cpr"?
Yes. I will elaborate.
>> Guest RAM is
>> preserved in place, albeit with new virtual addresses in new QEMU.
>>
>> This mode requires a second migration channel of type "cpr", in the
>> channel arguments on the outgoing side, and in a second -incoming
>> command-line parameter on the incoming side.
>>
>> Memory-backend objects must have the share=on attribute, but
>> memory-backend-epc is not supported. The VM must be started with
>> the '-machine aux-ram-share=on' option, which allows anonymous
>> memory to be transferred in place to the new process. The memfds
>> are kept open by sending the descriptors to new QEMU via the CPR
>> channel, which must support SCM_RIGHTS, and they are mmap'd in new QEMU.
>>
>> The implementation splits qmp_migrate into start and finish functions.
>> Start sends CPR state to new QEMU, which responds by closing the CPR
>> channel. Old QEMU detects the HUP then calls finish, which connects
>> the main migration channel.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> I'd lead with a brief explanation of the feature and its benefits.
> Could steal from the cover letter like this:
>
> New migration mode cpr-transfer enables transferring a guest to a
> new QEMU instance on the same host with minimal guest pause time, by
> preserving guest RAM in place, albeit with new virtual addresses in
> new QEMU, and by preserving device file descriptors.
>
> Then talk about required special setup. I see aux-ram-share=on.
> Anything else? Any differences between source and destination QEMU
> there?
>
> Then talk about the two channels. First what they do, second how to
> create their destination end with -incoming, third how to create their
> source end with "migrate".
>
> Finally mention whatever technical detail you believe needs mentioning
> here.
I'll work on it.
> [...]
>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index a26960b..1bc963f 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -614,9 +614,44 @@
>> # or COLO.
>> #
>> # (since 8.2)
>> +#
>> +# @cpr-transfer: This mode allows the user to transfer a guest to a
>> +# new QEMU instance on the same host with minimal guest pause
>> +# time, by preserving guest RAM in place, albeit with new virtual
>> +# addresses in new QEMU.
>> +#
>> +# The user starts new QEMU on the same host as old QEMU, with the
>> +# the same arguments as old QEMU, plus the -incoming option.
>
> Two of them?
Yes, I will say more.
>> +# The
>> +# user issues the migrate command to old QEMU, which stops the VM,
>> +# saves state to the migration channels, and enters the
>> +# postmigrate state. Execution resumes in new QEMU.
>
> The commit message also mentions file descriptors are migrated over.
> Worth mentioning here, too?
IMO no. The user cannot observe that aspect, so they don't need to know.
It's an implementation detail.
>> +#
>> +# This mode requires a second migration channel type "cpr" in the
>> +# channel arguments on the outgoing side. The channel must be a
>> +# type, such as unix socket, that supports SCM_RIGHTS. However,
>
> This is vague. Would anything but a UNIX domain socket work?
I debated what to say here. One could specify an "exec" type, in which the
executed command creates a unix domain socket. However, that is only likely to
occur to a small fraction of clever users. I could simplify the description,
and let the clever ones realize they can fudge it using exec.
> Applies to both source and destination end?
Yes. It is generally understood that the same specification for a migration
channel applies to both ends. But not documented anywhere AFAIK. And again a
clever user could specify a socket URI on one side and an exec URI on the
other whose command connects to the socket. All true for normal migration.
>> +# the cpr channel cannot be added to the list of channels for a
>> +# migrate-incoming command, because it must be read before new
>> +# QEMU opens a monitor.
>
> Ugh! Remind me, why is that the case?
The cpr channel (containing preserved file descriptors) must be read before
objects are initialized, which occurs before the monitor is opened.
>> +# Instead, the user passes the channel as a
>> +# second -incoming command-line argument to new QEMU using JSON
>> +# syntax.
>> +#
>> +# Memory-backend objects must have the share=on attribute, but
>> +# memory-backend-epc is not supported. The VM must be started
>> +# with the '-machine aux-ram-share=on' option.
>
> What happens when the conditions aren't met? migrate command fails
> with a useful error message?
Yes, via a migration blocker.
>> +#
>> +# The incoming migration channel cannot be a file type, and for
>> +# the tcp type, the port cannot be 0 (meaning dynamically choose
>> +# a port).
>
> Which of the two channels are you discussing?
main. I will clarify.
>> +#
>> +# When using -incoming defer, you must issue the migrate command
>> +# to old QEMU before issuing any monitor commands to new QEMU.
>
> I'm confused. Not even qmp_capabilities? Why?
Because of the ordering dependency. New QEMU must load the CPR state fds before device initialization,
which occurs before monitor initialization. The migrate command sends the CPR fds, which unblocks
all of the above.
- Steve
>> +# However, new QEMU does not open and read the migration stream
>> +# until you issue the migrate incoming command.
>> +#
>> +# (since 10.0)
>> ##
>> { 'enum': 'MigMode',
>> - 'data': [ 'normal', 'cpr-reboot' ] }
>> + 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
>>
>> ##
>> # @ZeroPageDetection:
>
> [...]
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 15/19] tests/migration-test: memory_backend
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (13 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 14/19] migration: cpr-transfer mode Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-02 13:20 ` [PATCH V4 16/19] tests/qtest: defer connection Steve Sistare
` (3 subsequent siblings)
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Allow each migration test to define its own memory backend, replacing
the standard "-m <size>" specification.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
---
tests/qtest/migration-test.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 74d3000..64e1c50 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -609,6 +609,11 @@ typedef struct {
const char *opts_target;
/* suspend the src before migrating to dest. */
bool suspend_me;
+ /*
+ * Format string for the main memory backend, containing one %s where the
+ * size is plugged in. If omitted, "-m %s" is used.
+ */
+ const char *memory_backend;
} MigrateStart;
/*
@@ -727,6 +732,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
const char *memory_size;
const char *machine_alias, *machine_opts = "";
g_autofree char *machine = NULL;
+ g_autofree char *memory_backend = NULL;
if (args->use_shmem) {
if (!g_file_test("/dev/shm", G_FILE_TEST_IS_DIR)) {
@@ -802,6 +808,12 @@ static int test_migrate_start(QTestState **from, QTestState **to,
memory_size, shmem_path);
}
+ if (args->memory_backend) {
+ memory_backend = g_strdup_printf(args->memory_backend, memory_size);
+ } else {
+ memory_backend = g_strdup_printf("-m %s ", memory_size);
+ }
+
if (args->use_dirty_ring) {
kvm_opts = ",dirty-ring-size=4096";
}
@@ -820,12 +832,12 @@ static int test_migrate_start(QTestState **from, QTestState **to,
cmd_source = g_strdup_printf("-accel kvm%s -accel tcg "
"-machine %s,%s "
"-name source,debug-threads=on "
- "-m %s "
+ "%s "
"-serial file:%s/src_serial "
"%s %s %s %s %s",
kvm_opts ? kvm_opts : "",
machine, machine_opts,
- memory_size, tmpfs,
+ memory_backend, tmpfs,
arch_opts ? arch_opts : "",
arch_source ? arch_source : "",
shmem_opts ? shmem_opts : "",
@@ -841,13 +853,13 @@ static int test_migrate_start(QTestState **from, QTestState **to,
cmd_target = g_strdup_printf("-accel kvm%s -accel tcg "
"-machine %s,%s "
"-name target,debug-threads=on "
- "-m %s "
+ "%s "
"-serial file:%s/dest_serial "
"-incoming %s "
"%s %s %s %s %s",
kvm_opts ? kvm_opts : "",
machine, machine_opts,
- memory_size, tmpfs, uri,
+ memory_backend, tmpfs, uri,
arch_opts ? arch_opts : "",
arch_target ? arch_target : "",
shmem_opts ? shmem_opts : "",
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 16/19] tests/qtest: defer connection
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (14 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 15/19] tests/migration-test: memory_backend Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-18 21:02 ` Steven Sistare
2024-12-19 15:46 ` Peter Xu
2024-12-02 13:20 ` [PATCH V4 17/19] tests/migration-test: " Steve Sistare
` (2 subsequent siblings)
18 siblings, 2 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add an option to defer connecting to the monitor and qtest
sockets when calling qtest_init_with_env. The client makes the connection
later by calling qtest_connect_deferred and qtest_qmp_handshake.
A test cannot specify port=0 for a deferred connection, because qmp_migrate
cannot query for the assigned port, because the monitor is not connected
yet. However, even if the test does not specify port=0, qmp_migrate ->
migrate_set_ports unconditionally queries connection parameters.
Modify migrate_set_ports to only query when port=0.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
tests/qtest/libqtest.c | 80 +++++++++++++++++++++++++----------------
tests/qtest/libqtest.h | 19 +++++++++-
tests/qtest/migration-helpers.c | 19 +++++-----
tests/qtest/migration-test.c | 4 +--
4 files changed, 80 insertions(+), 42 deletions(-)
diff --git a/tests/qtest/libqtest.c b/tests/qtest/libqtest.c
index 817fd7a..31c4032 100644
--- a/tests/qtest/libqtest.c
+++ b/tests/qtest/libqtest.c
@@ -75,6 +75,8 @@ struct QTestState
{
int fd;
int qmp_fd;
+ int sock;
+ int qmpsock;
pid_t qemu_pid; /* our child QEMU process */
int wstatus;
#ifdef _WIN32
@@ -442,18 +444,19 @@ static QTestState *G_GNUC_PRINTF(2, 3) qtest_spawn_qemu(const char *qemu_bin,
return s;
}
+static char *qtest_socket_path(const char *suffix)
+{
+ return g_strdup_printf("%s/qtest-%d.%s", g_get_tmp_dir(), getpid(), suffix);
+}
+
static QTestState *qtest_init_internal(const char *qemu_bin,
- const char *extra_args)
+ const char *extra_args,
+ bool defer_connect)
{
QTestState *s;
int sock, qmpsock, i;
- gchar *socket_path;
- gchar *qmp_socket_path;
-
- socket_path = g_strdup_printf("%s/qtest-%d.sock",
- g_get_tmp_dir(), getpid());
- qmp_socket_path = g_strdup_printf("%s/qtest-%d.qmp",
- g_get_tmp_dir(), getpid());
+ g_autofree gchar *socket_path = qtest_socket_path("sock");
+ g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
/*
* It's possible that if an earlier test run crashed it might
@@ -485,22 +488,17 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
qtest_client_set_rx_handler(s, qtest_client_socket_recv_line);
qtest_client_set_tx_handler(s, qtest_client_socket_send);
- s->fd = socket_accept(sock);
- if (s->fd >= 0) {
- s->qmp_fd = socket_accept(qmpsock);
- }
- unlink(socket_path);
- unlink(qmp_socket_path);
- g_free(socket_path);
- g_free(qmp_socket_path);
-
- g_assert(s->fd >= 0 && s->qmp_fd >= 0);
-
s->rx = g_string_new("");
for (i = 0; i < MAX_IRQ; i++) {
s->irq_level[i] = false;
}
+ s->sock = sock;
+ s->qmpsock = qmpsock;
+ if (!defer_connect) {
+ qtest_connect_deferred(s);
+ }
+
/*
* Stopping QEMU for debugging is not supported on Windows.
*
@@ -515,34 +513,54 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
}
#endif
+ return s;
+}
+
+void qtest_connect_deferred(QTestState *s)
+{
+ g_autofree gchar *socket_path = qtest_socket_path("sock");
+ g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
+
+ g_assert(s->sock >= 0 && s->qmpsock >= 0);
+ s->fd = socket_accept(s->sock);
+ if (s->fd >= 0) {
+ s->qmp_fd = socket_accept(s->qmpsock);
+ }
+ unlink(socket_path);
+ unlink(qmp_socket_path);
+ g_assert(s->fd >= 0 && s->qmp_fd >= 0);
+ s->sock = s->qmpsock = -1;
/* ask endianness of the target */
-
s->big_endian = qtest_query_target_endianness(s);
-
- return s;
}
QTestState *qtest_init_without_qmp_handshake(const char *extra_args)
{
- return qtest_init_internal(qtest_qemu_binary(NULL), extra_args);
+ return qtest_init_internal(qtest_qemu_binary(NULL), extra_args, false);
}
-QTestState *qtest_init_with_env(const char *var, const char *extra_args)
+void qtest_qmp_handshake(QTestState *s)
{
- QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args);
- QDict *greeting;
-
/* Read the QMP greeting and then do the handshake */
- greeting = qtest_qmp_receive(s);
+ QDict *greeting = qtest_qmp_receive(s);
qobject_unref(greeting);
qobject_unref(qtest_qmp(s, "{ 'execute': 'qmp_capabilities' }"));
+}
+QTestState *qtest_init_with_env(const char *var, const char *extra_args,
+ bool defer_connect)
+{
+ QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args,
+ defer_connect);
+ if (!defer_connect) {
+ qtest_qmp_handshake(s);
+ }
return s;
}
QTestState *qtest_init(const char *extra_args)
{
- return qtest_init_with_env(NULL, extra_args);
+ return qtest_init_with_env(NULL, extra_args, false);
}
QTestState *qtest_vinitf(const char *fmt, va_list ap)
@@ -1523,7 +1541,7 @@ static struct MachInfo *qtest_get_machines(const char *var)
silence_spawn_log = !g_test_verbose();
- qts = qtest_init_with_env(qemu_var, "-machine none");
+ qts = qtest_init_with_env(qemu_var, "-machine none", false);
response = qtest_qmp(qts, "{ 'execute': 'query-machines' }");
g_assert(response);
list = qdict_get_qlist(response, "return");
@@ -1578,7 +1596,7 @@ static struct CpuModel *qtest_get_cpu_models(void)
silence_spawn_log = !g_test_verbose();
- qts = qtest_init_with_env(NULL, "-machine none");
+ qts = qtest_init_with_env(NULL, "-machine none", false);
response = qtest_qmp(qts, "{ 'execute': 'query-cpu-definitions' }");
g_assert(response);
list = qdict_get_qlist(response, "return");
diff --git a/tests/qtest/libqtest.h b/tests/qtest/libqtest.h
index beb96b1..db76f2c 100644
--- a/tests/qtest/libqtest.h
+++ b/tests/qtest/libqtest.h
@@ -60,13 +60,15 @@ QTestState *qtest_init(const char *extra_args);
* @var: Environment variable from where to take the QEMU binary
* @extra_args: Other arguments to pass to QEMU. CAUTION: these
* arguments are subject to word splitting and shell evaluation.
+ * @defer_connect: do not connect to qemu monitor and qtest socket.
*
* Like qtest_init(), but use a different environment variable for the
* QEMU binary.
*
* Returns: #QTestState instance.
*/
-QTestState *qtest_init_with_env(const char *var, const char *extra_args);
+QTestState *qtest_init_with_env(const char *var, const char *extra_args,
+ bool defer_connect);
/**
* qtest_init_without_qmp_handshake:
@@ -78,6 +80,21 @@ QTestState *qtest_init_with_env(const char *var, const char *extra_args);
QTestState *qtest_init_without_qmp_handshake(const char *extra_args);
/**
+ * qtest_connect_deferred:
+ * @s: #QTestState instance to connect
+ * Connect to qemu monitor and qtest socket, after deferring them in
+ * qtest_init_with_env. Does not handshake with the monitor.
+ */
+void qtest_connect_deferred(QTestState *s);
+
+/**
+ * qtest_qmp_handshake:
+ * @s: #QTestState instance to operate on.
+ * Perform handshake after connecting to qemu monitor.
+ */
+void qtest_qmp_handshake(QTestState *s);
+
+/**
* qtest_init_with_serial:
* @extra_args: other arguments to pass to QEMU. CAUTION: these
* arguments are subject to word splitting and shell evaluation.
diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index 3f8ba7f..9f39401 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -127,25 +127,28 @@ migrate_get_connect_qdict(QTestState *who)
static void migrate_set_ports(QTestState *to, QList *channel_list)
{
- QDict *addr;
+ g_autoptr(QDict) addr = NULL;
QListEntry *entry;
const char *addr_port = NULL;
- addr = migrate_get_connect_qdict(to);
-
QLIST_FOREACH_ENTRY(channel_list, entry) {
QDict *channel = qobject_to(QDict, qlist_entry_obj(entry));
QDict *addrdict = qdict_get_qdict(channel, "addr");
- if (qdict_haskey(addrdict, "port") &&
- qdict_haskey(addr, "port") &&
- (strcmp(qdict_get_str(addrdict, "port"), "0") == 0)) {
+ if (!qdict_haskey(addrdict, "port") ||
+ strcmp(qdict_get_str(addrdict, "port"), "0")) {
+ continue;
+ }
+
+ if (!addr) {
+ addr = migrate_get_connect_qdict(to);
+ }
+
+ if (qdict_haskey(addr, "port")) {
addr_port = qdict_get_str(addr, "port");
qdict_put_str(addrdict, "port", addr_port);
}
}
-
- qobject_unref(addr);
}
bool migrate_watch_for_events(QTestState *who, const char *name,
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 64e1c50..b7001b0 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -844,7 +844,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
args->opts_source ? args->opts_source : "",
ignore_stderr);
if (!args->only_target) {
- *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source);
+ *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source, false);
qtest_qmp_set_event_callback(*from,
migrate_watch_for_events,
&src_state);
@@ -865,7 +865,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
shmem_opts ? shmem_opts : "",
args->opts_target ? args->opts_target : "",
ignore_stderr);
- *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target);
+ *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target, false);
qtest_qmp_set_event_callback(*to,
migrate_watch_for_events,
&dst_state);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 16/19] tests/qtest: defer connection
2024-12-02 13:20 ` [PATCH V4 16/19] tests/qtest: defer connection Steve Sistare
@ 2024-12-18 21:02 ` Steven Sistare
2024-12-19 15:46 ` Peter Xu
1 sibling, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 21:02 UTC (permalink / raw)
To: Peter Xu, Fabiano Rosas
Cc: David Hildenbrand, Marcel Apfelbaum, Eduardo Habkost,
Philippe Mathieu-Daude, Paolo Bonzini, Daniel P. Berrange,
Markus Armbruster, qemu-devel
If someone is bored, this needs review, else let it ride until V5.
- Steve
On 12/2/2024 8:20 AM, Steve Sistare wrote:
> Add an option to defer connecting to the monitor and qtest
> sockets when calling qtest_init_with_env. The client makes the connection
> later by calling qtest_connect_deferred and qtest_qmp_handshake.
>
> A test cannot specify port=0 for a deferred connection, because qmp_migrate
> cannot query for the assigned port, because the monitor is not connected
> yet. However, even if the test does not specify port=0, qmp_migrate ->
> migrate_set_ports unconditionally queries connection parameters.
> Modify migrate_set_ports to only query when port=0.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> tests/qtest/libqtest.c | 80 +++++++++++++++++++++++++----------------
> tests/qtest/libqtest.h | 19 +++++++++-
> tests/qtest/migration-helpers.c | 19 +++++-----
> tests/qtest/migration-test.c | 4 +--
> 4 files changed, 80 insertions(+), 42 deletions(-)
>
> diff --git a/tests/qtest/libqtest.c b/tests/qtest/libqtest.c
> index 817fd7a..31c4032 100644
> --- a/tests/qtest/libqtest.c
> +++ b/tests/qtest/libqtest.c
> @@ -75,6 +75,8 @@ struct QTestState
> {
> int fd;
> int qmp_fd;
> + int sock;
> + int qmpsock;
> pid_t qemu_pid; /* our child QEMU process */
> int wstatus;
> #ifdef _WIN32
> @@ -442,18 +444,19 @@ static QTestState *G_GNUC_PRINTF(2, 3) qtest_spawn_qemu(const char *qemu_bin,
> return s;
> }
>
> +static char *qtest_socket_path(const char *suffix)
> +{
> + return g_strdup_printf("%s/qtest-%d.%s", g_get_tmp_dir(), getpid(), suffix);
> +}
> +
> static QTestState *qtest_init_internal(const char *qemu_bin,
> - const char *extra_args)
> + const char *extra_args,
> + bool defer_connect)
> {
> QTestState *s;
> int sock, qmpsock, i;
> - gchar *socket_path;
> - gchar *qmp_socket_path;
> -
> - socket_path = g_strdup_printf("%s/qtest-%d.sock",
> - g_get_tmp_dir(), getpid());
> - qmp_socket_path = g_strdup_printf("%s/qtest-%d.qmp",
> - g_get_tmp_dir(), getpid());
> + g_autofree gchar *socket_path = qtest_socket_path("sock");
> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
>
> /*
> * It's possible that if an earlier test run crashed it might
> @@ -485,22 +488,17 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
> qtest_client_set_rx_handler(s, qtest_client_socket_recv_line);
> qtest_client_set_tx_handler(s, qtest_client_socket_send);
>
> - s->fd = socket_accept(sock);
> - if (s->fd >= 0) {
> - s->qmp_fd = socket_accept(qmpsock);
> - }
> - unlink(socket_path);
> - unlink(qmp_socket_path);
> - g_free(socket_path);
> - g_free(qmp_socket_path);
> -
> - g_assert(s->fd >= 0 && s->qmp_fd >= 0);
> -
> s->rx = g_string_new("");
> for (i = 0; i < MAX_IRQ; i++) {
> s->irq_level[i] = false;
> }
>
> + s->sock = sock;
> + s->qmpsock = qmpsock;
> + if (!defer_connect) {
> + qtest_connect_deferred(s);
> + }
> +
> /*
> * Stopping QEMU for debugging is not supported on Windows.
> *
> @@ -515,34 +513,54 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
> }
> #endif
>
> + return s;
> +}
> +
> +void qtest_connect_deferred(QTestState *s)
> +{
> + g_autofree gchar *socket_path = qtest_socket_path("sock");
> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
> +
> + g_assert(s->sock >= 0 && s->qmpsock >= 0);
> + s->fd = socket_accept(s->sock);
> + if (s->fd >= 0) {
> + s->qmp_fd = socket_accept(s->qmpsock);
> + }
> + unlink(socket_path);
> + unlink(qmp_socket_path);
> + g_assert(s->fd >= 0 && s->qmp_fd >= 0);
> + s->sock = s->qmpsock = -1;
> /* ask endianness of the target */
> -
> s->big_endian = qtest_query_target_endianness(s);
> -
> - return s;
> }
>
> QTestState *qtest_init_without_qmp_handshake(const char *extra_args)
> {
> - return qtest_init_internal(qtest_qemu_binary(NULL), extra_args);
> + return qtest_init_internal(qtest_qemu_binary(NULL), extra_args, false);
> }
>
> -QTestState *qtest_init_with_env(const char *var, const char *extra_args)
> +void qtest_qmp_handshake(QTestState *s)
> {
> - QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args);
> - QDict *greeting;
> -
> /* Read the QMP greeting and then do the handshake */
> - greeting = qtest_qmp_receive(s);
> + QDict *greeting = qtest_qmp_receive(s);
> qobject_unref(greeting);
> qobject_unref(qtest_qmp(s, "{ 'execute': 'qmp_capabilities' }"));
> +}
>
> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
> + bool defer_connect)
> +{
> + QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args,
> + defer_connect);
> + if (!defer_connect) {
> + qtest_qmp_handshake(s);
> + }
> return s;
> }
>
> QTestState *qtest_init(const char *extra_args)
> {
> - return qtest_init_with_env(NULL, extra_args);
> + return qtest_init_with_env(NULL, extra_args, false);
> }
>
> QTestState *qtest_vinitf(const char *fmt, va_list ap)
> @@ -1523,7 +1541,7 @@ static struct MachInfo *qtest_get_machines(const char *var)
>
> silence_spawn_log = !g_test_verbose();
>
> - qts = qtest_init_with_env(qemu_var, "-machine none");
> + qts = qtest_init_with_env(qemu_var, "-machine none", false);
> response = qtest_qmp(qts, "{ 'execute': 'query-machines' }");
> g_assert(response);
> list = qdict_get_qlist(response, "return");
> @@ -1578,7 +1596,7 @@ static struct CpuModel *qtest_get_cpu_models(void)
>
> silence_spawn_log = !g_test_verbose();
>
> - qts = qtest_init_with_env(NULL, "-machine none");
> + qts = qtest_init_with_env(NULL, "-machine none", false);
> response = qtest_qmp(qts, "{ 'execute': 'query-cpu-definitions' }");
> g_assert(response);
> list = qdict_get_qlist(response, "return");
> diff --git a/tests/qtest/libqtest.h b/tests/qtest/libqtest.h
> index beb96b1..db76f2c 100644
> --- a/tests/qtest/libqtest.h
> +++ b/tests/qtest/libqtest.h
> @@ -60,13 +60,15 @@ QTestState *qtest_init(const char *extra_args);
> * @var: Environment variable from where to take the QEMU binary
> * @extra_args: Other arguments to pass to QEMU. CAUTION: these
> * arguments are subject to word splitting and shell evaluation.
> + * @defer_connect: do not connect to qemu monitor and qtest socket.
> *
> * Like qtest_init(), but use a different environment variable for the
> * QEMU binary.
> *
> * Returns: #QTestState instance.
> */
> -QTestState *qtest_init_with_env(const char *var, const char *extra_args);
> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
> + bool defer_connect);
>
> /**
> * qtest_init_without_qmp_handshake:
> @@ -78,6 +80,21 @@ QTestState *qtest_init_with_env(const char *var, const char *extra_args);
> QTestState *qtest_init_without_qmp_handshake(const char *extra_args);
>
> /**
> + * qtest_connect_deferred:
> + * @s: #QTestState instance to connect
> + * Connect to qemu monitor and qtest socket, after deferring them in
> + * qtest_init_with_env. Does not handshake with the monitor.
> + */
> +void qtest_connect_deferred(QTestState *s);
> +
> +/**
> + * qtest_qmp_handshake:
> + * @s: #QTestState instance to operate on.
> + * Perform handshake after connecting to qemu monitor.
> + */
> +void qtest_qmp_handshake(QTestState *s);
> +
> +/**
> * qtest_init_with_serial:
> * @extra_args: other arguments to pass to QEMU. CAUTION: these
> * arguments are subject to word splitting and shell evaluation.
> diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
> index 3f8ba7f..9f39401 100644
> --- a/tests/qtest/migration-helpers.c
> +++ b/tests/qtest/migration-helpers.c
> @@ -127,25 +127,28 @@ migrate_get_connect_qdict(QTestState *who)
>
> static void migrate_set_ports(QTestState *to, QList *channel_list)
> {
> - QDict *addr;
> + g_autoptr(QDict) addr = NULL;
> QListEntry *entry;
> const char *addr_port = NULL;
>
> - addr = migrate_get_connect_qdict(to);
> -
> QLIST_FOREACH_ENTRY(channel_list, entry) {
> QDict *channel = qobject_to(QDict, qlist_entry_obj(entry));
> QDict *addrdict = qdict_get_qdict(channel, "addr");
>
> - if (qdict_haskey(addrdict, "port") &&
> - qdict_haskey(addr, "port") &&
> - (strcmp(qdict_get_str(addrdict, "port"), "0") == 0)) {
> + if (!qdict_haskey(addrdict, "port") ||
> + strcmp(qdict_get_str(addrdict, "port"), "0")) {
> + continue;
> + }
> +
> + if (!addr) {
> + addr = migrate_get_connect_qdict(to);
> + }
> +
> + if (qdict_haskey(addr, "port")) {
> addr_port = qdict_get_str(addr, "port");
> qdict_put_str(addrdict, "port", addr_port);
> }
> }
> -
> - qobject_unref(addr);
> }
>
> bool migrate_watch_for_events(QTestState *who, const char *name,
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 64e1c50..b7001b0 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -844,7 +844,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
> args->opts_source ? args->opts_source : "",
> ignore_stderr);
> if (!args->only_target) {
> - *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source);
> + *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source, false);
> qtest_qmp_set_event_callback(*from,
> migrate_watch_for_events,
> &src_state);
> @@ -865,7 +865,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
> shmem_opts ? shmem_opts : "",
> args->opts_target ? args->opts_target : "",
> ignore_stderr);
> - *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target);
> + *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target, false);
> qtest_qmp_set_event_callback(*to,
> migrate_watch_for_events,
> &dst_state);
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 16/19] tests/qtest: defer connection
2024-12-02 13:20 ` [PATCH V4 16/19] tests/qtest: defer connection Steve Sistare
2024-12-18 21:02 ` Steven Sistare
@ 2024-12-19 15:46 ` Peter Xu
2024-12-19 22:33 ` Steven Sistare
1 sibling, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-19 15:46 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:20:08AM -0800, Steve Sistare wrote:
> Add an option to defer making the connection to the monitor and qtest
> sockets when calling qtest_init_with_env. The client makes the connection
> later by calling qtest_connect_deferred and qtest_qmp_handshake.
>
> A test cannot specify port=0 for a deferred connection, because qmp_migrate
> cannot query for the assigned port, because the monitor is not connected
> yet. However, even if the test does not specify port=0, qmp_migrate ->
> migrate_set_ports unconditionally queries connection parameters.
> Modify migrate_set_ports to only query when port=0.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Looks mostly good, nitpicks inline..
> ---
> tests/qtest/libqtest.c | 80 +++++++++++++++++++++++++----------------
> tests/qtest/libqtest.h | 19 +++++++++-
> tests/qtest/migration-helpers.c | 19 +++++-----
> tests/qtest/migration-test.c | 4 +--
> 4 files changed, 80 insertions(+), 42 deletions(-)
>
> diff --git a/tests/qtest/libqtest.c b/tests/qtest/libqtest.c
> index 817fd7a..31c4032 100644
> --- a/tests/qtest/libqtest.c
> +++ b/tests/qtest/libqtest.c
> @@ -75,6 +75,8 @@ struct QTestState
> {
> int fd;
> int qmp_fd;
> + int sock;
> + int qmpsock;
> pid_t qemu_pid; /* our child QEMU process */
> int wstatus;
> #ifdef _WIN32
> @@ -442,18 +444,19 @@ static QTestState *G_GNUC_PRINTF(2, 3) qtest_spawn_qemu(const char *qemu_bin,
> return s;
> }
>
> +static char *qtest_socket_path(const char *suffix)
> +{
> + return g_strdup_printf("%s/qtest-%d.%s", g_get_tmp_dir(), getpid(), suffix);
> +}
> +
> static QTestState *qtest_init_internal(const char *qemu_bin,
> - const char *extra_args)
> + const char *extra_args,
> + bool defer_connect)
Suggest to stick with positive logic naming.
That is, s/defer_connect/do_connect/ or similar, then invert the values in
callers.
> {
> QTestState *s;
> int sock, qmpsock, i;
> - gchar *socket_path;
> - gchar *qmp_socket_path;
> -
> - socket_path = g_strdup_printf("%s/qtest-%d.sock",
> - g_get_tmp_dir(), getpid());
> - qmp_socket_path = g_strdup_printf("%s/qtest-%d.qmp",
> - g_get_tmp_dir(), getpid());
> + g_autofree gchar *socket_path = qtest_socket_path("sock");
> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
>
> /*
> * It's possible that if an earlier test run crashed it might
> @@ -485,22 +488,17 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
> qtest_client_set_rx_handler(s, qtest_client_socket_recv_line);
> qtest_client_set_tx_handler(s, qtest_client_socket_send);
>
> - s->fd = socket_accept(sock);
> - if (s->fd >= 0) {
> - s->qmp_fd = socket_accept(qmpsock);
> - }
> - unlink(socket_path);
> - unlink(qmp_socket_path);
> - g_free(socket_path);
> - g_free(qmp_socket_path);
> -
> - g_assert(s->fd >= 0 && s->qmp_fd >= 0);
> -
> s->rx = g_string_new("");
> for (i = 0; i < MAX_IRQ; i++) {
> s->irq_level[i] = false;
> }
>
> + s->sock = sock;
> + s->qmpsock = qmpsock;
> + if (!defer_connect) {
> + qtest_connect_deferred(s);
Now qtest_connect_deferred() itself has nothing to do with the "defer"
concept.. it is the helper to connect the sockets, so maybe better call it
qtest_connect_socks(), or similar.
> + }
> +
> /*
> * Stopping QEMU for debugging is not supported on Windows.
> *
> @@ -515,34 +513,54 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
> }
> #endif
>
> + return s;
> +}
> +
> +void qtest_connect_deferred(QTestState *s)
> +{
> + g_autofree gchar *socket_path = qtest_socket_path("sock");
> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
> +
> + g_assert(s->sock >= 0 && s->qmpsock >= 0);
> + s->fd = socket_accept(s->sock);
> + if (s->fd >= 0) {
> + s->qmp_fd = socket_accept(s->qmpsock);
> + }
> + unlink(socket_path);
> + unlink(qmp_socket_path);
> + g_assert(s->fd >= 0 && s->qmp_fd >= 0);
> + s->sock = s->qmpsock = -1;
> /* ask endianness of the target */
> -
> s->big_endian = qtest_query_target_endianness(s);
> -
> - return s;
> }
>
> QTestState *qtest_init_without_qmp_handshake(const char *extra_args)
> {
> - return qtest_init_internal(qtest_qemu_binary(NULL), extra_args);
> + return qtest_init_internal(qtest_qemu_binary(NULL), extra_args, false);
> }
>
> -QTestState *qtest_init_with_env(const char *var, const char *extra_args)
> +void qtest_qmp_handshake(QTestState *s)
> {
> - QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args);
> - QDict *greeting;
> -
> /* Read the QMP greeting and then do the handshake */
> - greeting = qtest_qmp_receive(s);
> + QDict *greeting = qtest_qmp_receive(s);
> qobject_unref(greeting);
> qobject_unref(qtest_qmp(s, "{ 'execute': 'qmp_capabilities' }"));
> +}
>
> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
> + bool defer_connect)
> +{
> + QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args,
> + defer_connect);
> + if (!defer_connect) {
> + qtest_qmp_handshake(s);
> + }
> return s;
> }
>
> QTestState *qtest_init(const char *extra_args)
> {
> - return qtest_init_with_env(NULL, extra_args);
> + return qtest_init_with_env(NULL, extra_args, false);
> }
>
> QTestState *qtest_vinitf(const char *fmt, va_list ap)
> @@ -1523,7 +1541,7 @@ static struct MachInfo *qtest_get_machines(const char *var)
>
> silence_spawn_log = !g_test_verbose();
>
> - qts = qtest_init_with_env(qemu_var, "-machine none");
> + qts = qtest_init_with_env(qemu_var, "-machine none", false);
> response = qtest_qmp(qts, "{ 'execute': 'query-machines' }");
> g_assert(response);
> list = qdict_get_qlist(response, "return");
> @@ -1578,7 +1596,7 @@ static struct CpuModel *qtest_get_cpu_models(void)
>
> silence_spawn_log = !g_test_verbose();
>
> - qts = qtest_init_with_env(NULL, "-machine none");
> + qts = qtest_init_with_env(NULL, "-machine none", false);
> response = qtest_qmp(qts, "{ 'execute': 'query-cpu-definitions' }");
> g_assert(response);
> list = qdict_get_qlist(response, "return");
> diff --git a/tests/qtest/libqtest.h b/tests/qtest/libqtest.h
> index beb96b1..db76f2c 100644
> --- a/tests/qtest/libqtest.h
> +++ b/tests/qtest/libqtest.h
> @@ -60,13 +60,15 @@ QTestState *qtest_init(const char *extra_args);
> * @var: Environment variable from where to take the QEMU binary
> * @extra_args: Other arguments to pass to QEMU. CAUTION: these
> * arguments are subject to word splitting and shell evaluation.
> + * @defer_connect: do not connect to qemu monitor and qtest socket.
> *
> * Like qtest_init(), but use a different environment variable for the
> * QEMU binary.
> *
> * Returns: #QTestState instance.
> */
> -QTestState *qtest_init_with_env(const char *var, const char *extra_args);
> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
> + bool defer_connect);
>
> /**
> * qtest_init_without_qmp_handshake:
> @@ -78,6 +80,21 @@ QTestState *qtest_init_with_env(const char *var, const char *extra_args);
> QTestState *qtest_init_without_qmp_handshake(const char *extra_args);
>
> /**
> + * qtest_connect_deferred:
> + * @s: #QTestState instance to connect
> + * Connect to qemu monitor and qtest socket, after deferring them in
> + * qtest_init_with_env. Does not handshake with the monitor.
> + */
> +void qtest_connect_deferred(QTestState *s);
> +
> +/**
> + * qtest_qmp_handshake:
> + * @s: #QTestState instance to operate on.
> + * Perform handshake after connecting to qemu monitor.
> + */
> +void qtest_qmp_handshake(QTestState *s);
> +
> +/**
> * qtest_init_with_serial:
> * @extra_args: other arguments to pass to QEMU. CAUTION: these
> * arguments are subject to word splitting and shell evaluation.
> diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
> index 3f8ba7f..9f39401 100644
> --- a/tests/qtest/migration-helpers.c
> +++ b/tests/qtest/migration-helpers.c
> @@ -127,25 +127,28 @@ migrate_get_connect_qdict(QTestState *who)
>
> static void migrate_set_ports(QTestState *to, QList *channel_list)
> {
> - QDict *addr;
> + g_autoptr(QDict) addr = NULL;
> QListEntry *entry;
> const char *addr_port = NULL;
>
> - addr = migrate_get_connect_qdict(to);
> -
> QLIST_FOREACH_ENTRY(channel_list, entry) {
> QDict *channel = qobject_to(QDict, qlist_entry_obj(entry));
> QDict *addrdict = qdict_get_qdict(channel, "addr");
>
> - if (qdict_haskey(addrdict, "port") &&
> - qdict_haskey(addr, "port") &&
> - (strcmp(qdict_get_str(addrdict, "port"), "0") == 0)) {
> + if (!qdict_haskey(addrdict, "port") ||
> + strcmp(qdict_get_str(addrdict, "port"), "0")) {
> + continue;
> + }
> +
> + if (!addr) {
> + addr = migrate_get_connect_qdict(to);
May be good to add a comment above on why the query was done only lazily.
Meanwhile this chunk of change can be separate; it's relevant to the defer
idea but still pretty standalone change. Can be one small patch prior to
this one, IMHO.
Optional idea, can be for later: if QTestState can have the state showing
whether the QMP is ready, we could already assert making sure the query
happens only if the QMP is available.
> + }
> +
> + if (qdict_haskey(addr, "port")) {
> addr_port = qdict_get_str(addr, "port");
> qdict_put_str(addrdict, "port", addr_port);
> }
> }
> -
> - qobject_unref(addr);
> }
>
> bool migrate_watch_for_events(QTestState *who, const char *name,
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 64e1c50..b7001b0 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -844,7 +844,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
> args->opts_source ? args->opts_source : "",
> ignore_stderr);
> if (!args->only_target) {
> - *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source);
> + *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source, false);
> qtest_qmp_set_event_callback(*from,
> migrate_watch_for_events,
> &src_state);
> @@ -865,7 +865,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
> shmem_opts ? shmem_opts : "",
> args->opts_target ? args->opts_target : "",
> ignore_stderr);
> - *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target);
> + *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target, false);
> qtest_qmp_set_event_callback(*to,
> migrate_watch_for_events,
> &dst_state);
> --
> 1.8.3.1
>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 16/19] tests/qtest: defer connection
2024-12-19 15:46 ` Peter Xu
@ 2024-12-19 22:33 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-19 22:33 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/19/2024 10:46 AM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:20:08AM -0800, Steve Sistare wrote:
>> Add an option to defer making the connection to the monitor and qtest
>> sockets when calling qtest_init_with_env. The client makes the connection
>> later by calling qtest_connect_deferred and qtest_qmp_handshake.
>>
>> A test cannot specify port=0 for a deferred connection, because qmp_migrate
>> cannot query for the assigned port, because the monitor is not connected
>> yet. However, even if the test does not specify port=0, qmp_migrate ->
>> migrate_set_ports unconditionally queries connection parameters.
>> Modify migrate_set_ports to only query when port=0.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> Looks mostly good, nitpicks inline..
>
>> ---
>> tests/qtest/libqtest.c | 80 +++++++++++++++++++++++++----------------
>> tests/qtest/libqtest.h | 19 +++++++++-
>> tests/qtest/migration-helpers.c | 19 +++++-----
>> tests/qtest/migration-test.c | 4 +--
>> 4 files changed, 80 insertions(+), 42 deletions(-)
>>
>> diff --git a/tests/qtest/libqtest.c b/tests/qtest/libqtest.c
>> index 817fd7a..31c4032 100644
>> --- a/tests/qtest/libqtest.c
>> +++ b/tests/qtest/libqtest.c
>> @@ -75,6 +75,8 @@ struct QTestState
>> {
>> int fd;
>> int qmp_fd;
>> + int sock;
>> + int qmpsock;
>> pid_t qemu_pid; /* our child QEMU process */
>> int wstatus;
>> #ifdef _WIN32
>> @@ -442,18 +444,19 @@ static QTestState *G_GNUC_PRINTF(2, 3) qtest_spawn_qemu(const char *qemu_bin,
>> return s;
>> }
>>
>> +static char *qtest_socket_path(const char *suffix)
>> +{
>> + return g_strdup_printf("%s/qtest-%d.%s", g_get_tmp_dir(), getpid(), suffix);
>> +}
>> +
>> static QTestState *qtest_init_internal(const char *qemu_bin,
>> - const char *extra_args)
>> + const char *extra_args,
>> + bool defer_connect)
>
> Suggest to stick with positive logic naming.
>
> That is, s/defer_connect/do_connect/ or similar, then invert the values in
> callers.
Will do, and will rename qtest_connect_deferred -> qtest_connect.
Thanks for reviewing these last few patches.
>> {
>> QTestState *s;
>> int sock, qmpsock, i;
>> - gchar *socket_path;
>> - gchar *qmp_socket_path;
>> -
>> - socket_path = g_strdup_printf("%s/qtest-%d.sock",
>> - g_get_tmp_dir(), getpid());
>> - qmp_socket_path = g_strdup_printf("%s/qtest-%d.qmp",
>> - g_get_tmp_dir(), getpid());
>> + g_autofree gchar *socket_path = qtest_socket_path("sock");
>> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
>>
>> /*
>> * It's possible that if an earlier test run crashed it might
>> @@ -485,22 +488,17 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
>> qtest_client_set_rx_handler(s, qtest_client_socket_recv_line);
>> qtest_client_set_tx_handler(s, qtest_client_socket_send);
>>
>> - s->fd = socket_accept(sock);
>> - if (s->fd >= 0) {
>> - s->qmp_fd = socket_accept(qmpsock);
>> - }
>> - unlink(socket_path);
>> - unlink(qmp_socket_path);
>> - g_free(socket_path);
>> - g_free(qmp_socket_path);
>> -
>> - g_assert(s->fd >= 0 && s->qmp_fd >= 0);
>> -
>> s->rx = g_string_new("");
>> for (i = 0; i < MAX_IRQ; i++) {
>> s->irq_level[i] = false;
>> }
>>
>> + s->sock = sock;
>> + s->qmpsock = qmpsock;
>> + if (!defer_connect) {
>> + qtest_connect_deferred(s);
>
> Now qtest_connect_deferred() itself has nothing to do with the "defer"
> concept.. it is the helper to connect the sockets, so maybe better call it
> qtest_connect_socks(), or similar.
>
>> + }
>> +
>> /*
>> * Stopping QEMU for debugging is not supported on Windows.
>> *
>> @@ -515,34 +513,54 @@ static QTestState *qtest_init_internal(const char *qemu_bin,
>> }
>> #endif
>>
>> + return s;
>> +}
>> +
>> +void qtest_connect_deferred(QTestState *s)
>> +{
>> + g_autofree gchar *socket_path = qtest_socket_path("sock");
>> + g_autofree gchar *qmp_socket_path = qtest_socket_path("qmp");
>> +
>> + g_assert(s->sock >= 0 && s->qmpsock >= 0);
>> + s->fd = socket_accept(s->sock);
>> + if (s->fd >= 0) {
>> + s->qmp_fd = socket_accept(s->qmpsock);
>> + }
>> + unlink(socket_path);
>> + unlink(qmp_socket_path);
>> + g_assert(s->fd >= 0 && s->qmp_fd >= 0);
>> + s->sock = s->qmpsock = -1;
>> /* ask endianness of the target */
>> -
>> s->big_endian = qtest_query_target_endianness(s);
>> -
>> - return s;
>> }
>>
>> QTestState *qtest_init_without_qmp_handshake(const char *extra_args)
>> {
>> - return qtest_init_internal(qtest_qemu_binary(NULL), extra_args);
>> + return qtest_init_internal(qtest_qemu_binary(NULL), extra_args, false);
>> }
>>
>> -QTestState *qtest_init_with_env(const char *var, const char *extra_args)
>> +void qtest_qmp_handshake(QTestState *s)
>> {
>> - QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args);
>> - QDict *greeting;
>> -
>> /* Read the QMP greeting and then do the handshake */
>> - greeting = qtest_qmp_receive(s);
>> + QDict *greeting = qtest_qmp_receive(s);
>> qobject_unref(greeting);
>> qobject_unref(qtest_qmp(s, "{ 'execute': 'qmp_capabilities' }"));
>> +}
>>
>> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
>> + bool defer_connect)
>> +{
>> + QTestState *s = qtest_init_internal(qtest_qemu_binary(var), extra_args,
>> + defer_connect);
>> + if (!defer_connect) {
>> + qtest_qmp_handshake(s);
>> + }
>> return s;
>> }
>>
>> QTestState *qtest_init(const char *extra_args)
>> {
>> - return qtest_init_with_env(NULL, extra_args);
>> + return qtest_init_with_env(NULL, extra_args, false);
>> }
>>
>> QTestState *qtest_vinitf(const char *fmt, va_list ap)
>> @@ -1523,7 +1541,7 @@ static struct MachInfo *qtest_get_machines(const char *var)
>>
>> silence_spawn_log = !g_test_verbose();
>>
>> - qts = qtest_init_with_env(qemu_var, "-machine none");
>> + qts = qtest_init_with_env(qemu_var, "-machine none", false);
>> response = qtest_qmp(qts, "{ 'execute': 'query-machines' }");
>> g_assert(response);
>> list = qdict_get_qlist(response, "return");
>> @@ -1578,7 +1596,7 @@ static struct CpuModel *qtest_get_cpu_models(void)
>>
>> silence_spawn_log = !g_test_verbose();
>>
>> - qts = qtest_init_with_env(NULL, "-machine none");
>> + qts = qtest_init_with_env(NULL, "-machine none", false);
>> response = qtest_qmp(qts, "{ 'execute': 'query-cpu-definitions' }");
>> g_assert(response);
>> list = qdict_get_qlist(response, "return");
>> diff --git a/tests/qtest/libqtest.h b/tests/qtest/libqtest.h
>> index beb96b1..db76f2c 100644
>> --- a/tests/qtest/libqtest.h
>> +++ b/tests/qtest/libqtest.h
>> @@ -60,13 +60,15 @@ QTestState *qtest_init(const char *extra_args);
>> * @var: Environment variable from where to take the QEMU binary
>> * @extra_args: Other arguments to pass to QEMU. CAUTION: these
>> * arguments are subject to word splitting and shell evaluation.
>> + * @defer_connect: do not connect to qemu monitor and qtest socket.
>> *
>> * Like qtest_init(), but use a different environment variable for the
>> * QEMU binary.
>> *
>> * Returns: #QTestState instance.
>> */
>> -QTestState *qtest_init_with_env(const char *var, const char *extra_args);
>> +QTestState *qtest_init_with_env(const char *var, const char *extra_args,
>> + bool defer_connect);
>>
>> /**
>> * qtest_init_without_qmp_handshake:
>> @@ -78,6 +80,21 @@ QTestState *qtest_init_with_env(const char *var, const char *extra_args);
>> QTestState *qtest_init_without_qmp_handshake(const char *extra_args);
>>
>> /**
>> + * qtest_connect_deferred:
>> + * @s: #QTestState instance to connect
>> + * Connect to qemu monitor and qtest socket, after deferring them in
>> + * qtest_init_with_env. Does not handshake with the monitor.
>> + */
>> +void qtest_connect_deferred(QTestState *s);
>> +
>> +/**
>> + * qtest_qmp_handshake:
>> + * @s: #QTestState instance to operate on.
>> + * Perform handshake after connecting to qemu monitor.
>> + */
>> +void qtest_qmp_handshake(QTestState *s);
>> +
>> +/**
>> * qtest_init_with_serial:
>> * @extra_args: other arguments to pass to QEMU. CAUTION: these
>> * arguments are subject to word splitting and shell evaluation.
>> diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
>> index 3f8ba7f..9f39401 100644
>> --- a/tests/qtest/migration-helpers.c
>> +++ b/tests/qtest/migration-helpers.c
>> @@ -127,25 +127,28 @@ migrate_get_connect_qdict(QTestState *who)
>>
>> static void migrate_set_ports(QTestState *to, QList *channel_list)
>> {
>> - QDict *addr;
>> + g_autoptr(QDict) addr = NULL;
>> QListEntry *entry;
>> const char *addr_port = NULL;
>>
>> - addr = migrate_get_connect_qdict(to);
>> -
>> QLIST_FOREACH_ENTRY(channel_list, entry) {
>> QDict *channel = qobject_to(QDict, qlist_entry_obj(entry));
>> QDict *addrdict = qdict_get_qdict(channel, "addr");
>>
>> - if (qdict_haskey(addrdict, "port") &&
>> - qdict_haskey(addr, "port") &&
>> - (strcmp(qdict_get_str(addrdict, "port"), "0") == 0)) {
>> + if (!qdict_haskey(addrdict, "port") ||
>> + strcmp(qdict_get_str(addrdict, "port"), "0")) {
>> + continue;
>> + }
>> +
>> + if (!addr) {
>> + addr = migrate_get_connect_qdict(to);
>
> May be good to add a comment above on why the query was done only lazily.
>
> Meanwhile this chunk of change can be separate; it's relevant to the defer
> idea but still pretty standalone change. Can be one small patch prior to
> this one, IMHO.
Will do both.
> Optional idea, can be for later: if QTestState can have the state showing
> whether the QMP is ready, we could already assert making sure the query
> happens only if the QMP is available.
Good idea, I'll add it now in a small patch. It already caught bugs as I
flipped defer_connect to do_connect :)
- Steve
>> + }
>> +
>> + if (qdict_haskey(addr, "port")) {
>> addr_port = qdict_get_str(addr, "port");
>> qdict_put_str(addrdict, "port", addr_port);
>> }
>> }
>> -
>> - qobject_unref(addr);
>> }
>>
>> bool migrate_watch_for_events(QTestState *who, const char *name,
>> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
>> index 64e1c50..b7001b0 100644
>> --- a/tests/qtest/migration-test.c
>> +++ b/tests/qtest/migration-test.c
>> @@ -844,7 +844,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
>> args->opts_source ? args->opts_source : "",
>> ignore_stderr);
>> if (!args->only_target) {
>> - *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source);
>> + *from = qtest_init_with_env(QEMU_ENV_SRC, cmd_source, false);
>> qtest_qmp_set_event_callback(*from,
>> migrate_watch_for_events,
>> &src_state);
>> @@ -865,7 +865,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
>> shmem_opts ? shmem_opts : "",
>> args->opts_target ? args->opts_target : "",
>> ignore_stderr);
>> - *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target);
>> + *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target, false);
>> qtest_qmp_set_event_callback(*to,
>> migrate_watch_for_events,
>> &dst_state);
>> --
>> 1.8.3.1
>>
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 17/19] tests/migration-test: defer connection
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (15 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 16/19] tests/qtest: defer connection Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-02 13:20 ` [PATCH V4 18/19] migration-test: cpr-transfer Steve Sistare
2024-12-02 13:20 ` [PATCH V4 19/19] migration: cpr-transfer documentation Steve Sistare
18 siblings, 0 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add an option to defer connection to the target monitor, needed by the
cpr-transfer test.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
---
tests/qtest/migration-test.c | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b7001b0..8bc665d 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -614,6 +614,9 @@ typedef struct {
* size is plugged in. If omitted, "-m %s" is used.
*/
const char *memory_backend;
+
+ /* Do not connect to target monitor and qtest sockets in qtest_init */
+ bool defer_target_connect;
} MigrateStart;
/*
@@ -733,6 +736,7 @@ static int test_migrate_start(QTestState **from, QTestState **to,
const char *machine_alias, *machine_opts = "";
g_autofree char *machine = NULL;
g_autofree char *memory_backend = NULL;
+ const char *events;
if (args->use_shmem) {
if (!g_file_test("/dev/shm", G_FILE_TEST_IS_DIR)) {
@@ -850,22 +854,31 @@ static int test_migrate_start(QTestState **from, QTestState **to,
&src_state);
}
+ /*
+ * If the monitor connection is deferred, enable events on the command line
+ * so none are missed. This is for testing only, do not set migration
+ * options like this in general.
+ */
+ events = args->defer_target_connect ? "-global migration.x-events=on" : "";
+
cmd_target = g_strdup_printf("-accel kvm%s -accel tcg "
"-machine %s,%s "
"-name target,debug-threads=on "
"%s "
"-serial file:%s/dest_serial "
"-incoming %s "
- "%s %s %s %s %s",
+ "%s %s %s %s %s %s",
kvm_opts ? kvm_opts : "",
machine, machine_opts,
memory_backend, tmpfs, uri,
+ events,
arch_opts ? arch_opts : "",
arch_target ? arch_target : "",
shmem_opts ? shmem_opts : "",
args->opts_target ? args->opts_target : "",
ignore_stderr);
- *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target, false);
+ *to = qtest_init_with_env(QEMU_ENV_DST, cmd_target,
+ args->defer_target_connect);
qtest_qmp_set_event_callback(*to,
migrate_watch_for_events,
&dst_state);
@@ -883,7 +896,9 @@ static int test_migrate_start(QTestState **from, QTestState **to,
* to mimic as closer as that.
*/
migrate_set_capability(*from, "events", true);
- migrate_set_capability(*to, "events", true);
+ if (!args->defer_target_connect) {
+ migrate_set_capability(*to, "events", true);
+ }
return 0;
}
@@ -1753,6 +1768,11 @@ static void test_precopy_common(MigrateCommon *args)
migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
+ if (args->start.defer_target_connect) {
+ qtest_connect_deferred(to);
+ qtest_qmp_handshake(to);
+ }
+
if (args->result != MIG_TEST_SUCCEED) {
bool allow_active = args->result == MIG_TEST_FAIL;
wait_for_migration_fail(from, allow_active);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* [PATCH V4 18/19] migration-test: cpr-transfer
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (16 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 17/19] tests/migration-test: " Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-18 21:03 ` Steven Sistare
2024-12-19 16:56 ` Peter Xu
2024-12-02 13:20 ` [PATCH V4 19/19] migration: cpr-transfer documentation Steve Sistare
18 siblings, 2 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Add a migration test for cpr-transfer mode. Defer the connection to the
target monitor, else the test hangs because in cpr-transfer mode QEMU does
not listen for monitor connections until we send the migrate command to
source QEMU.
To test -incoming defer, send a migrate incoming command to the target,
after sending the migrate command to the source, as required by
cpr-transfer mode.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
tests/qtest/migration-test.c | 72 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 71 insertions(+), 1 deletion(-)
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 8bc665d..4eb641c 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1729,6 +1729,7 @@ static void test_precopy_common(MigrateCommon *args)
{
QTestState *from, *to;
void *data_hook = NULL;
+ const char *connect_uri;
if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) {
return;
@@ -1766,11 +1767,16 @@ static void test_precopy_common(MigrateCommon *args)
goto finish;
}
- migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
+ /* If has channels, then connect_uri is only used for listen defer */
+ connect_uri = args->connect_channels ? NULL : args->connect_uri;
+ migrate_qmp(from, to, connect_uri, args->connect_channels, "{}");
if (args->start.defer_target_connect) {
qtest_connect_deferred(to);
qtest_qmp_handshake(to);
+ if (!strcmp(args->listen_uri, "defer")) {
+ migrate_incoming_qmp(to, args->connect_uri, "{}");
+ }
}
if (args->result != MIG_TEST_SUCCEED) {
@@ -2415,6 +2421,66 @@ static void test_multifd_file_mapped_ram_fdset_dio(void)
}
#endif /* !_WIN32 */
+static void *test_mode_transfer_start(QTestState *from, QTestState *to)
+{
+ migrate_set_parameter_str(from, "mode", "cpr-transfer");
+ return NULL;
+}
+
+/*
+ * cpr-transfer mode cannot use the target monitor prior to starting the
+ * migration, and cannot connect synchronously to the monitor, so defer
+ * the target connection.
+ */
+static void test_mode_transfer_common(bool incoming_defer)
+{
+ g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
+ g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
+ g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
+
+ const char *opts = "-machine aux-ram-share=on -nodefaults";
+ g_autofree char *opts_target = g_strdup_printf(
+ "-incoming \\{\\\'channel-type\\\':\\\'cpr\\\',"
+ "\\\'addr\\\':\\{\\\'transport\\\':\\\'socket\\\',"
+ "\\\'type\\\':\\\'unix\\\',\\\'path\\\':\\\'%s\\\'\\}\\} %s",
+ cpr_path, opts);
+
+ g_autofree char *channels = g_strdup_printf(
+ "[ { 'channel-type': 'main',"
+ " 'addr': { 'transport': 'socket',"
+ " 'type': 'unix',"
+ " 'path': '%s' } },"
+ " { 'channel-type': 'cpr',"
+ " 'addr': { 'transport': 'socket',"
+ " 'type': 'unix',"
+ " 'path': '%s' } } ]",
+ mig_path, cpr_path);
+
+ MigrateCommon args = {
+ .start.opts_source = opts,
+ .start.opts_target = opts_target,
+ .start.defer_target_connect = true,
+ .start.memory_backend = "-object memory-backend-memfd,id=pc.ram,size=%s"
+ " -machine memory-backend=pc.ram",
+ .listen_uri = incoming_defer ? "defer" : uri,
+ .connect_uri = incoming_defer ? uri : NULL,
+ .connect_channels = channels,
+ .start_hook = test_mode_transfer_start,
+ };
+
+ test_precopy_common(&args);
+}
+
+static void test_mode_transfer(void)
+{
+ test_mode_transfer_common(NULL);
+}
+
+static void test_mode_transfer_defer(void)
+{
+ test_mode_transfer_common(true);
+}
+
static void test_precopy_tcp_plain(void)
{
MigrateCommon args = {
@@ -3905,6 +3971,10 @@ int main(int argc, char **argv)
migration_test_add("/migration/mode/reboot", test_mode_reboot);
}
+ migration_test_add("/migration/mode/transfer", test_mode_transfer);
+ migration_test_add("/migration/mode/transfer/defer",
+ test_mode_transfer_defer);
+
migration_test_add("/migration/precopy/file/mapped-ram",
test_precopy_file_mapped_ram);
migration_test_add("/migration/precopy/file/mapped-ram/live",
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 18/19] migration-test: cpr-transfer
2024-12-02 13:20 ` [PATCH V4 18/19] migration-test: cpr-transfer Steve Sistare
@ 2024-12-18 21:03 ` Steven Sistare
2024-12-19 16:56 ` Peter Xu
1 sibling, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 21:03 UTC (permalink / raw)
To: Peter Xu, Fabiano Rosas
Cc: David Hildenbrand, Marcel Apfelbaum, Eduardo Habkost,
Philippe Mathieu-Daude, Paolo Bonzini, Daniel P. Berrange,
Markus Armbruster, qemu-devel
If someone is bored, this needs review, else let it ride until V5.
- Steve
On 12/2/2024 8:20 AM, Steve Sistare wrote:
> Add a migration test for cpr-transfer mode. Defer the connection to the
> target monitor, else the test hangs because in cpr-transfer mode QEMU does
> not listen for monitor connections until we send the migrate command to
> source QEMU.
>
> To test -incoming defer, send a migrate incoming command to the target,
> after sending the migrate command to the source, as required by
> cpr-transfer mode.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> tests/qtest/migration-test.c | 72 +++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 71 insertions(+), 1 deletion(-)
>
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 8bc665d..4eb641c 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -1729,6 +1729,7 @@ static void test_precopy_common(MigrateCommon *args)
> {
> QTestState *from, *to;
> void *data_hook = NULL;
> + const char *connect_uri;
>
> if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) {
> return;
> @@ -1766,11 +1767,16 @@ static void test_precopy_common(MigrateCommon *args)
> goto finish;
> }
>
> - migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
> + /* If has channels, then connect_uri is only used for listen defer */
> + connect_uri = args->connect_channels ? NULL : args->connect_uri;
> + migrate_qmp(from, to, connect_uri, args->connect_channels, "{}");
>
> if (args->start.defer_target_connect) {
> qtest_connect_deferred(to);
> qtest_qmp_handshake(to);
> + if (!strcmp(args->listen_uri, "defer")) {
> + migrate_incoming_qmp(to, args->connect_uri, "{}");
> + }
> }
>
> if (args->result != MIG_TEST_SUCCEED) {
> @@ -2415,6 +2421,66 @@ static void test_multifd_file_mapped_ram_fdset_dio(void)
> }
> #endif /* !_WIN32 */
>
> +static void *test_mode_transfer_start(QTestState *from, QTestState *to)
> +{
> + migrate_set_parameter_str(from, "mode", "cpr-transfer");
> + return NULL;
> +}
> +
> +/*
> + * cpr-transfer mode cannot use the target monitor prior to starting the
> + * migration, and cannot connect synchronously to the monitor, so defer
> + * the target connection.
> + */
> +static void test_mode_transfer_common(bool incoming_defer)
> +{
> + g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
> + g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
> + g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
> +
> + const char *opts = "-machine aux-ram-share=on -nodefaults";
> + g_autofree char *opts_target = g_strdup_printf(
> + "-incoming \\{\\\'channel-type\\\':\\\'cpr\\\',"
> + "\\\'addr\\\':\\{\\\'transport\\\':\\\'socket\\\',"
> + "\\\'type\\\':\\\'unix\\\',\\\'path\\\':\\\'%s\\\'\\}\\} %s",
> + cpr_path, opts);
> +
> + g_autofree char *channels = g_strdup_printf(
> + "[ { 'channel-type': 'main',"
> + " 'addr': { 'transport': 'socket',"
> + " 'type': 'unix',"
> + " 'path': '%s' } },"
> + " { 'channel-type': 'cpr',"
> + " 'addr': { 'transport': 'socket',"
> + " 'type': 'unix',"
> + " 'path': '%s' } } ]",
> + mig_path, cpr_path);
> +
> + MigrateCommon args = {
> + .start.opts_source = opts,
> + .start.opts_target = opts_target,
> + .start.defer_target_connect = true,
> + .start.memory_backend = "-object memory-backend-memfd,id=pc.ram,size=%s"
> + " -machine memory-backend=pc.ram",
> + .listen_uri = incoming_defer ? "defer" : uri,
> + .connect_uri = incoming_defer ? uri : NULL,
> + .connect_channels = channels,
> + .start_hook = test_mode_transfer_start,
> + };
> +
> + test_precopy_common(&args);
> +}
> +
> +static void test_mode_transfer(void)
> +{
> + test_mode_transfer_common(NULL);
> +}
> +
> +static void test_mode_transfer_defer(void)
> +{
> + test_mode_transfer_common(true);
> +}
> +
> static void test_precopy_tcp_plain(void)
> {
> MigrateCommon args = {
> @@ -3905,6 +3971,10 @@ int main(int argc, char **argv)
> migration_test_add("/migration/mode/reboot", test_mode_reboot);
> }
>
> + migration_test_add("/migration/mode/transfer", test_mode_transfer);
> + migration_test_add("/migration/mode/transfer/defer",
> + test_mode_transfer_defer);
> +
> migration_test_add("/migration/precopy/file/mapped-ram",
> test_precopy_file_mapped_ram);
> migration_test_add("/migration/precopy/file/mapped-ram/live",
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 18/19] migration-test: cpr-transfer
2024-12-02 13:20 ` [PATCH V4 18/19] migration-test: cpr-transfer Steve Sistare
2024-12-18 21:03 ` Steven Sistare
@ 2024-12-19 16:56 ` Peter Xu
2024-12-19 22:34 ` Steven Sistare
1 sibling, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-19 16:56 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:20:10AM -0800, Steve Sistare wrote:
> Add a migration test for cpr-transfer mode. Defer the connection to the
> target monitor, else the test hangs because in cpr-transfer mode QEMU does
> not listen for monitor connections until we send the migrate command to
> source QEMU.
>
> To test -incoming defer, send a migrate incoming command to the target,
> after sending the migrate command to the source, as required by
> cpr-transfer mode.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> tests/qtest/migration-test.c | 72 +++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 71 insertions(+), 1 deletion(-)
>
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 8bc665d..4eb641c 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -1729,6 +1729,7 @@ static void test_precopy_common(MigrateCommon *args)
> {
> QTestState *from, *to;
> void *data_hook = NULL;
> + const char *connect_uri;
>
> if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) {
> return;
> @@ -1766,11 +1767,16 @@ static void test_precopy_common(MigrateCommon *args)
> goto finish;
> }
>
> - migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
> + /* If has channels, then connect_uri is only used for listen defer */
> + connect_uri = args->connect_channels ? NULL : args->connect_uri;
> + migrate_qmp(from, to, connect_uri, args->connect_channels, "{}");
This smells like abuse.
If the test case sets connect_uri only because of below...
>
> if (args->start.defer_target_connect) {
> qtest_connect_deferred(to);
> qtest_qmp_handshake(to);
> + if (!strcmp(args->listen_uri, "defer")) {
> + migrate_incoming_qmp(to, args->connect_uri, "{}");
... here, then IMHO it's abusing connect_uri to start service incoming
ports.
We do have solution for "delay" incoming, right? Shouldn't we use
migrate_get_connect_uri() instead, then never set connect_uri in
cpr-transfer tests?
> + }
> }
>
> if (args->result != MIG_TEST_SUCCEED) {
> @@ -2415,6 +2421,66 @@ static void test_multifd_file_mapped_ram_fdset_dio(void)
> }
> #endif /* !_WIN32 */
>
> +static void *test_mode_transfer_start(QTestState *from, QTestState *to)
> +{
> + migrate_set_parameter_str(from, "mode", "cpr-transfer");
> + return NULL;
> +}
> +
> +/*
> + * cpr-transfer mode cannot use the target monitor prior to starting the
> + * migration, and cannot connect synchronously to the monitor, so defer
> + * the target connection.
> + */
> +static void test_mode_transfer_common(bool incoming_defer)
> +{
> + g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
> + g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
> + g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
> +
> + const char *opts = "-machine aux-ram-share=on -nodefaults";
> + g_autofree char *opts_target = g_strdup_printf(
> + "-incoming \\{\\\'channel-type\\\':\\\'cpr\\\',"
> + "\\\'addr\\\':\\{\\\'transport\\\':\\\'socket\\\',"
> + "\\\'type\\\':\\\'unix\\\',\\\'path\\\':\\\'%s\\\'\\}\\} %s",
> + cpr_path, opts);
Nobody will be able to change this easily.. Maybe use g_strescape()?
> +
> + g_autofree char *channels = g_strdup_printf(
> + "[ { 'channel-type': 'main',"
> + " 'addr': { 'transport': 'socket',"
> + " 'type': 'unix',"
> + " 'path': '%s' } },"
> + " { 'channel-type': 'cpr',"
> + " 'addr': { 'transport': 'socket',"
> + " 'type': 'unix',"
> + " 'path': '%s' } } ]",
> + mig_path, cpr_path);
> +
> + MigrateCommon args = {
> + .start.opts_source = opts,
> + .start.opts_target = opts_target,
> + .start.defer_target_connect = true,
> + .start.memory_backend = "-object memory-backend-memfd,id=pc.ram,size=%s"
> + " -machine memory-backend=pc.ram",
> + .listen_uri = incoming_defer ? "defer" : uri,
> + .connect_uri = incoming_defer ? uri : NULL,
> + .connect_channels = channels,
> + .start_hook = test_mode_transfer_start,
> + };
> +
> + test_precopy_common(&args);
> +}
> +
> +static void test_mode_transfer(void)
> +{
> + test_mode_transfer_common(NULL);
> +}
> +
> +static void test_mode_transfer_defer(void)
> +{
> + test_mode_transfer_common(true);
> +}
> +
> static void test_precopy_tcp_plain(void)
> {
> MigrateCommon args = {
> @@ -3905,6 +3971,10 @@ int main(int argc, char **argv)
> migration_test_add("/migration/mode/reboot", test_mode_reboot);
> }
>
> + migration_test_add("/migration/mode/transfer", test_mode_transfer);
> + migration_test_add("/migration/mode/transfer/defer",
> + test_mode_transfer_defer);
> +
> migration_test_add("/migration/precopy/file/mapped-ram",
> test_precopy_file_mapped_ram);
> migration_test_add("/migration/precopy/file/mapped-ram/live",
> --
> 1.8.3.1
>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 18/19] migration-test: cpr-transfer
2024-12-19 16:56 ` Peter Xu
@ 2024-12-19 22:34 ` Steven Sistare
2024-12-20 15:41 ` Peter Xu
0 siblings, 1 reply; 78+ messages in thread
From: Steven Sistare @ 2024-12-19 22:34 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/19/2024 11:56 AM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:20:10AM -0800, Steve Sistare wrote:
>> Add a migration test for cpr-transfer mode. Defer the connection to the
>> target monitor, else the test hangs because in cpr-transfer mode QEMU does
>> not listen for monitor connections until we send the migrate command to
>> source QEMU.
>>
>> To test -incoming defer, send a migrate incoming command to the target,
>> after sending the migrate command to the source, as required by
>> cpr-transfer mode.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> tests/qtest/migration-test.c | 72 +++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 71 insertions(+), 1 deletion(-)
>>
>> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
>> index 8bc665d..4eb641c 100644
>> --- a/tests/qtest/migration-test.c
>> +++ b/tests/qtest/migration-test.c
>> @@ -1729,6 +1729,7 @@ static void test_precopy_common(MigrateCommon *args)
>> {
>> QTestState *from, *to;
>> void *data_hook = NULL;
>> + const char *connect_uri;
>>
>> if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) {
>> return;
>> @@ -1766,11 +1767,16 @@ static void test_precopy_common(MigrateCommon *args)
>> goto finish;
>> }
>>
>> - migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
>> + /* If has channels, then connect_uri is only used for listen defer */
>> + connect_uri = args->connect_channels ? NULL : args->connect_uri;
>> + migrate_qmp(from, to, connect_uri, args->connect_channels, "{}");
>
> This smells like abuse.
>
> If the test case sets connect_uri only because of below...
>
>>
>> if (args->start.defer_target_connect) {
>> qtest_connect_deferred(to);
>> qtest_qmp_handshake(to);
>> + if (!strcmp(args->listen_uri, "defer")) {
>> + migrate_incoming_qmp(to, args->connect_uri, "{}");
>
> ... here, then IMHO it's abusing connect_uri to start service incoming
> ports.
>
> We do have solution for "delay" incoming, right? Shouldn't we use
> migrate_get_connect_uri() instead, then never set connect_uri in
> cpr-transfer tests?
We cannot use migrate_get_connect_uri() to get the URI to pass to
migrate_incoming_qmp, because the migrate_incoming_qmp sets the URI
returned by query-migrate. chicken-and-egg problem.
I'll add channels support to migrate_incoming_qmp, like migrate_qmp.
The cpr-transfer test will set listen_uri and connect_channels but
will not set connect_uri.
>> + }
>> }
>>
>> if (args->result != MIG_TEST_SUCCEED) {
>> @@ -2415,6 +2421,66 @@ static void test_multifd_file_mapped_ram_fdset_dio(void)
>> }
>> #endif /* !_WIN32 */
>>
>> +static void *test_mode_transfer_start(QTestState *from, QTestState *to)
>> +{
>> + migrate_set_parameter_str(from, "mode", "cpr-transfer");
>> + return NULL;
>> +}
>> +
>> +/*
>> + * cpr-transfer mode cannot use the target monitor prior to starting the
>> + * migration, and cannot connect synchronously to the monitor, so defer
>> + * the target connection.
>> + */
>> +static void test_mode_transfer_common(bool incoming_defer)
>> +{
>> + g_autofree char *cpr_path = g_strdup_printf("%s/cpr.sock", tmpfs);
>> + g_autofree char *mig_path = g_strdup_printf("%s/migsocket", tmpfs);
>> + g_autofree char *uri = g_strdup_printf("unix:%s", mig_path);
>> +
>> + const char *opts = "-machine aux-ram-share=on -nodefaults";
>> + g_autofree char *opts_target = g_strdup_printf(
>> + "-incoming \\{\\\'channel-type\\\':\\\'cpr\\\',"
>> + "\\\'addr\\\':\\{\\\'transport\\\':\\\'socket\\\',"
>> + "\\\'type\\\':\\\'unix\\\',\\\'path\\\':\\\'%s\\\'\\}\\} %s",
>> + cpr_path, opts);
>
> Nobody will be able to change this easily.. Maybe use g_strescape()?
Agreed. Fortunately incoming now accepts dotted keys after a suggestion from
Markus, so this can be:
-incoming cpr,addr.transport=socket,addr.type=unix,addr.path=%s
- Steve
>> +
>> + g_autofree char *channels = g_strdup_printf(
>> + "[ { 'channel-type': 'main',"
>> + " 'addr': { 'transport': 'socket',"
>> + " 'type': 'unix',"
>> + " 'path': '%s' } },"
>> + " { 'channel-type': 'cpr',"
>> + " 'addr': { 'transport': 'socket',"
>> + " 'type': 'unix',"
>> + " 'path': '%s' } } ]",
>> + mig_path, cpr_path);
>> +
>> + MigrateCommon args = {
>> + .start.opts_source = opts,
>> + .start.opts_target = opts_target,
>> + .start.defer_target_connect = true,
>> + .start.memory_backend = "-object memory-backend-memfd,id=pc.ram,size=%s"
>> + " -machine memory-backend=pc.ram",
>> + .listen_uri = incoming_defer ? "defer" : uri,
>> + .connect_uri = incoming_defer ? uri : NULL,
>> + .connect_channels = channels,
>> + .start_hook = test_mode_transfer_start,
>> + };
>> +
>> + test_precopy_common(&args);
>> +}
>> +
>> +static void test_mode_transfer(void)
>> +{
>> + test_mode_transfer_common(NULL);
>> +}
>> +
>> +static void test_mode_transfer_defer(void)
>> +{
>> + test_mode_transfer_common(true);
>> +}
>> +
>> static void test_precopy_tcp_plain(void)
>> {
>> MigrateCommon args = {
>> @@ -3905,6 +3971,10 @@ int main(int argc, char **argv)
>> migration_test_add("/migration/mode/reboot", test_mode_reboot);
>> }
>>
>> + migration_test_add("/migration/mode/transfer", test_mode_transfer);
>> + migration_test_add("/migration/mode/transfer/defer",
>> + test_mode_transfer_defer);
>> +
>> migration_test_add("/migration/precopy/file/mapped-ram",
>> test_precopy_file_mapped_ram);
>> migration_test_add("/migration/precopy/file/mapped-ram/live",
>> --
>> 1.8.3.1
>>
>
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 18/19] migration-test: cpr-transfer
2024-12-19 22:34 ` Steven Sistare
@ 2024-12-20 15:41 ` Peter Xu
0 siblings, 0 replies; 78+ messages in thread
From: Peter Xu @ 2024-12-20 15:41 UTC (permalink / raw)
To: Steven Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Thu, Dec 19, 2024 at 05:34:59PM -0500, Steven Sistare wrote:
> > > @@ -1766,11 +1767,16 @@ static void test_precopy_common(MigrateCommon *args)
> > > goto finish;
> > > }
> > > - migrate_qmp(from, to, args->connect_uri, args->connect_channels, "{}");
> > > + /* If has channels, then connect_uri is only used for listen defer */
> > > + connect_uri = args->connect_channels ? NULL : args->connect_uri;
> > > + migrate_qmp(from, to, connect_uri, args->connect_channels, "{}");
> >
> > This smells like abuse.
> >
> > If the test case sets connect_uri only because of below...
> >
> > > if (args->start.defer_target_connect) {
> > > qtest_connect_deferred(to);
> > > qtest_qmp_handshake(to);
> > > + if (!strcmp(args->listen_uri, "defer")) {
> > > + migrate_incoming_qmp(to, args->connect_uri, "{}");
> >
> > ... here, then IMHO it's abusing connect_uri to start service incoming
> > ports.
> >
> > We do have solution for "delay" incoming, right? Shouldn't we use
> > migrate_get_connect_uri() instead, then never set connect_uri in
> > cpr-transfer tests?
>
> We cannot use migrate_get_connect_uri() to get the URI to pass to
> migrate_incoming_qmp, because the migrate_incoming_qmp sets the URI
> returned by query-migrate. chicken-and-egg problem.
Oh yes, stupid me.
>
> I'll add channels support to migrate_incoming_qmp, like migrate_qmp.
> The cpr-transfer test will set listen_uri and connect_channels but
> will not set connect_uri.
That's still a lightweight abuse, but better than connect_uri indeed.
Hopefully cpr is the only one that uses defer_target_connect, so yeah we
can go with it at least for now..
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* [PATCH V4 19/19] migration: cpr-transfer documentation
2024-12-02 13:19 [PATCH V4 00/19] Live update: cpr-transfer Steve Sistare
` (17 preceding siblings ...)
2024-12-02 13:20 ` [PATCH V4 18/19] migration-test: cpr-transfer Steve Sistare
@ 2024-12-02 13:20 ` Steve Sistare
2024-12-18 21:03 ` Steven Sistare
2024-12-19 17:02 ` Peter Xu
18 siblings, 2 replies; 78+ messages in thread
From: Steve Sistare @ 2024-12-02 13:20 UTC (permalink / raw)
To: qemu-devel
Cc: Peter Xu, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster, Steve Sistare
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
docs/devel/migration/CPR.rst | 176 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 174 insertions(+), 2 deletions(-)
diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst
index 63c3647..a8a57c0 100644
--- a/docs/devel/migration/CPR.rst
+++ b/docs/devel/migration/CPR.rst
@@ -5,7 +5,7 @@ CPR is the umbrella name for a set of migration modes in which the
VM is migrated to a new QEMU instance on the same host. It is
intended for use when the goal is to update host software components
that run the VM, such as QEMU or even the host kernel. At this time,
-cpr-reboot is the only available mode.
+the cpr-reboot and cpr-transfer modes are available.
Because QEMU is restarted on the same host, with access to the same
local devices, CPR is allowed in certain cases where normal migration
@@ -53,7 +53,7 @@ RAM is copied to the migration URI.
Outgoing:
* Set the migration mode parameter to ``cpr-reboot``.
* Set the ``x-ignore-shared`` capability if desired.
- * Issue the ``migrate`` command. It is recommended the the URI be a
+ * Issue the ``migrate`` command. It is recommended the URI be a
``file`` type, but one can use other types such as ``exec``,
provided the command captures all the data from the outgoing side,
and provides all the data to the incoming side.
@@ -145,3 +145,175 @@ Caveats
cpr-reboot mode may not be used with postcopy, background-snapshot,
or COLO.
+
+cpr-transfer mode
+-----------------
+
+This mode allows the user to transfer a guest to a new QEMU instance
+on the same host with minimal guest pause time, by preserving guest
+RAM in place, albeit with new virtual addresses in new QEMU.
+
+The user starts new QEMU on the same host as old QEMU, with the
+same arguments as old QEMU, plus the ``-incoming`` option. The user
+issues the migrate command to old QEMU, which stops the VM, saves
+state to the migration channels, and enters the postmigrate state.
+Execution resumes in new QEMU.
+
+This mode requires a second migration channel of type "cpr" in the
+channel arguments on the outgoing side. The channel must be a type,
+such as unix socket, that supports SCM_RIGHTS. However, the cpr
+channel cannot be added to the list of channels for a migrate-incoming
+command, because it must be read before new QEMU opens a monitor.
+Instead, the user passes the channel as a second ``-incoming``
+command-line argument to new QEMU using JSON syntax.
+
+Usage
+^^^^^
+
+Memory backend objects must have the ``share=on`` attribute.
+
+The VM must be started with the ``-machine aux-ram-share=on``
+option. This causes implicit RAM blocks (those not described by
+a memory-backend object) to be allocated by mmap'ing a memfd.
+Examples include VGA and ROM.
+
+Outgoing:
+ * Set the migration mode parameter to ``cpr-transfer``.
+ * Issue the ``migrate`` command, containing a main channel and
+ a cpr channel.
+
+Incoming:
+ * Start new QEMU with two ``-incoming`` options.
+ * If the VM was running when the outgoing ``migrate`` command was
+ issued, then QEMU automatically resumes VM execution.
+
+Caveats
+^^^^^^^
+
+cpr-transfer mode may not be used with postcopy, background-snapshot,
+or COLO.
+
+memory-backend-epc is not supported.
+
+The main incoming migration channel cannot be a file type.
+
+If the main incoming migration channel is a tcp type, then the port
+cannot be 0 (meaning dynamically choose a port).
+
+When using ``-incoming defer``, you must issue the migrate command to
+old QEMU before issuing any monitor commands to new QEMU, because new
+QEMU blocks waiting to read from the cpr channel before starting its
+monitor, and old QEMU does not write to the channel until the migrate
+command is issued. However, new QEMU does not open and read the
+main migration channel until you issue the migrate incoming command.
+
+Example 1: incoming channel
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In these examples, we simply restart the same version of QEMU, but
+in a real scenario one would start new QEMU on the incoming side.
+Note that new QEMU does not print the monitor prompt until the migrate
+command has been issued to old QEMU. The outgoing side uses QMP because
+HMP cannot specify a CPR channel. Some QMP responses are omitted for
+brevity.
+
+::
+
+ Outgoing: Incoming:
+
+ # qemu-kvm -qmp stdio
+ -object memory-backend-file,id=ram0,size=4G,
+ mem-path=/dev/shm/ram0,share=on -m 4G
+ -machine aux-ram-share=on
+ ...
+ # qemu-kvm -monitor stdio
+ -incoming tcp:0:44444
+ -incoming '{"channel-type": "cpr",
+ "addr": { "transport": "socket",
+ "type": "unix", "path": "cpr.sock"}}'
+ ...
+ {"execute":"qmp_capabilities"}
+
+ {"execute": "query-status"}
+ {"return": {"status": "running",
+ "running": true}}
+
+ {"execute":"migrate-set-parameters",
+ "arguments":{"mode":"cpr-transfer"}}
+
+ {"execute": "migrate", "arguments": { "channels": [
+ {"channel-type": "main",
+ "addr": { "transport": "socket", "type": "inet",
+ "host": "0", "port": "44444" }},
+ {"channel-type": "cpr",
+ "addr": { "transport": "socket", "type": "unix",
+ "path": "cpr.sock" }}]}}
+
+ QEMU 10.0.50 monitor
+ (qemu) info status
+ VM status: running
+
+ {"execute": "query-status"}
+ {"return": {"status": "postmigrate",
+ "running": false}}
+
+Example 2: incoming defer
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This example uses ``-incoming defer`` to hot plug a device before
+accepting the main migration channel. Again note you must issue the
+migrate command to old QEMU before you can issue any monitor
+commands to new QEMU.
+
+
+::
+
+ Outgoing: Incoming:
+
+ # qemu-kvm -qmp stdio
+ -object memory-backend-file,id=ram0,size=4G,
+ mem-path=/dev/shm/ram0,share=on -m 4G
+ -machine aux-ram-share=on
+ ...
+ # qemu-kvm -monitor stdio
+ -incoming defer
+ -incoming '{"channel-type": "cpr",
+ "addr": { "transport": "socket",
+ "type": "unix", "path": "cpr.sock"}}'
+ ...
+ {"execute":"qmp_capabilities"}
+
+ {"execute": "device_add",
+ "arguments": {"driver": "pcie-root-port"}}
+
+ {"execute":"migrate-set-parameters",
+ "arguments":{"mode":"cpr-transfer"}}
+
+ {"execute": "migrate", "arguments": { "channels": [
+ {"channel-type": "main",
+ "addr": { "transport": "socket", "type": "inet",
+ "host": "0", "port": "44444" }},
+ {"channel-type": "cpr",
+ "addr": { "transport": "socket", "type": "unix",
+ "path": "cpr.sock" }}]}}
+
+ QEMU 10.0.50 monitor
+ (qemu) info status
+ VM status: paused (inmigrate)
+ (qemu) device_add pcie-root-port
+ (qemu) migrate_incoming tcp:0:44444
+ (qemu) info status
+ VM status: running
+
+ {"execute": "query-status"}
+ {"return": {"status": "postmigrate",
+ "running": false}}
+
+Futures
+^^^^^^^
+
+cpr-transfer mode is based on a capability to transfer open file
+descriptors from old to new QEMU. In the future, descriptors for
+vfio, iommufd, vhost, and char devices could be transferred,
+preserving those devices and their kernel state without interruption,
+even if they do not explicitly support live migration.
--
1.8.3.1
^ permalink raw reply related [flat|nested] 78+ messages in thread
* Re: [PATCH V4 19/19] migration: cpr-transfer documentation
2024-12-02 13:20 ` [PATCH V4 19/19] migration: cpr-transfer documentation Steve Sistare
@ 2024-12-18 21:03 ` Steven Sistare
2024-12-19 17:02 ` Peter Xu
1 sibling, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-18 21:03 UTC (permalink / raw)
To: Peter Xu, Fabiano Rosas, Markus Armbruster
Cc: David Hildenbrand, Marcel Apfelbaum, Eduardo Habkost,
Philippe Mathieu-Daude, Paolo Bonzini, Daniel P. Berrange,
qemu-devel
If someone is bored, this needs review, else let it ride until V5.
- Steve
On 12/2/2024 8:20 AM, Steve Sistare wrote:
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> docs/devel/migration/CPR.rst | 176 ++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 174 insertions(+), 2 deletions(-)
>
> diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst
> index 63c3647..a8a57c0 100644
> --- a/docs/devel/migration/CPR.rst
> +++ b/docs/devel/migration/CPR.rst
> @@ -5,7 +5,7 @@ CPR is the umbrella name for a set of migration modes in which the
> VM is migrated to a new QEMU instance on the same host. It is
> intended for use when the goal is to update host software components
> that run the VM, such as QEMU or even the host kernel. At this time,
> -cpr-reboot is the only available mode.
> +the cpr-reboot and cpr-transfer modes are available.
>
> Because QEMU is restarted on the same host, with access to the same
> local devices, CPR is allowed in certain cases where normal migration
> @@ -53,7 +53,7 @@ RAM is copied to the migration URI.
> Outgoing:
> * Set the migration mode parameter to ``cpr-reboot``.
> * Set the ``x-ignore-shared`` capability if desired.
> - * Issue the ``migrate`` command. It is recommended the the URI be a
> + * Issue the ``migrate`` command. It is recommended the URI be a
> ``file`` type, but one can use other types such as ``exec``,
> provided the command captures all the data from the outgoing side,
> and provides all the data to the incoming side.
> @@ -145,3 +145,175 @@ Caveats
>
> cpr-reboot mode may not be used with postcopy, background-snapshot,
> or COLO.
> +
> +cpr-transfer mode
> +-----------------
> +
> +This mode allows the user to transfer a guest to a new QEMU instance
> +on the same host with minimal guest pause time, by preserving guest
> +RAM in place, albeit with new virtual addresses in new QEMU.
> +
> +The user starts new QEMU on the same host as old QEMU, with the
> +same arguments as old QEMU, plus the ``-incoming`` option. The user
> +issues the migrate command to old QEMU, which stops the VM, saves
> +state to the migration channels, and enters the postmigrate state.
> +Execution resumes in new QEMU.
> +
> +This mode requires a second migration channel type "cpr" in the
> +channel arguments on the outgoing side. The channel must be a type,
> +such as unix socket, that supports SCM_RIGHTS. However, the cpr
> +channel cannot be added to the list of channels for a migrate-incoming
> +command, because it must be read before new QEMU opens a monitor.
> +Instead, the user passes the channel as a second -incoming
> +command-line argument to new QEMU using JSON syntax.
> +
> +Usage
> +^^^^^
> +
> +Memory backend objects must have the ``share=on`` attribute.
> +
> +The VM must be started with the ``-machine aux-ram-share=on``
> +option. This causes implicit RAM blocks (those not described by
> +a memory-backend object) to be allocated by mmap'ing a memfd.
> +Examples include VGA and ROM.
> +
> +Outgoing:
> + * Set the migration mode parameter to ``cpr-transfer``.
> + * Issue the ``migrate`` command, containing a main channel and
> + a cpr channel.
> +
> +Incoming:
> + * Start new QEMU with two ``-incoming`` options.
> + * If the VM was running when the outgoing ``migrate`` command was
> + issued, then QEMU automatically resumes VM execution.
> +
> +Caveats
> +^^^^^^^
> +
> +cpr-transfer mode may not be used with postcopy, background-snapshot,
> +or COLO.
> +
> +memory-backend-epc is not supported.
> +
> +The main incoming migration channel cannot be a file type.
> +
> +If the main incoming migration channel is a tcp type, then the port
> +cannot be 0 (meaning dynamically choose a port).
> +
> +When using ``-incoming defer``, you must issue the migrate command to
> +old QEMU before issuing any monitor commands to new QEMU, because new
> +QEMU blocks waiting to read from the cpr channel before starting its
> +monitor, and old QEMU does not write to the channel until the migrate
> +command is issued. However, new QEMU does not open and read the
> +main migration channel until you issue the migrate incoming command.
> +
> +Example 1: incoming channel
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +In these examples, we simply restart the same version of QEMU, but
> +in a real scenario one would start new QEMU on the incoming side.
> +Note that new QEMU does not print the monitor prompt until old QEMU
> +has issued the migrate command. The outgoing side uses QMP because
> +HMP cannot specify a CPR channel. Some QMP responses are omitted for
> +brevity.
> +
> +::
> +
> + Outgoing: Incoming:
> +
> + # qemu-kvm -qmp stdio
> + -object memory-backend-file,id=ram0,size=4G,
> + mem-path=/dev/shm/ram0,share=on -m 4G
> + -machine aux-ram-share=on
> + ...
> + # qemu-kvm -monitor stdio
> + -incoming tcp:0:44444
> + -incoming '{"channel-type": "cpr",
> + "addr": { "transport": "socket",
> + "type": "unix", "path": "cpr.sock"}}'
> + ...
> + {"execute":"qmp_capabilities"}
> +
> + {"execute": "query-status"}
> + {"return": {"status": "running",
> + "running": true}}
> +
> + {"execute":"migrate-set-parameters",
> + "arguments":{"mode":"cpr-transfer"}}
> +
> + {"execute": "migrate", "arguments": { "channels": [
> + {"channel-type": "main",
> + "addr": { "transport": "socket", "type": "inet",
> + "host": "0", "port": "44444" }},
> + {"channel-type": "cpr",
> + "addr": { "transport": "socket", "type": "unix",
> + "path": "cpr.sock" }}]}}
> +
> + QEMU 10.0.50 monitor
> + (qemu) info status
> + VM status: running
> +
> + {"execute": "query-status"}
> + {"return": {"status": "postmigrate",
> + "running": false}}
> +
> +Example 2: incoming defer
> +^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +This example uses ``-incoming defer`` to hot plug a device before
> +accepting the main migration channel. Again note you must issue the
> +migrate command to old QEMU before you can issue any monitor
> +commands to new QEMU.
> +
> +
> +::
> +
> + Outgoing: Incoming:
> +
> + # qemu-kvm -monitor stdio
> + -object memory-backend-file,id=ram0,size=4G,
> + mem-path=/dev/shm/ram0,share=on -m 4G
> + -machine aux-ram-share=on
> + ...
> + # qemu-kvm -monitor stdio
> + -incoming defer
> + -incoming '{"channel-type": "cpr",
> + "addr": { "transport": "socket",
> + "type": "unix", "path": "cpr.sock"}}'
> + ...
> + {"execute":"qmp_capabilities"}
> +
> + {"execute": "device_add",
> + "arguments": {"driver": "pcie-root-port"}}
> +
> + {"execute":"migrate-set-parameters",
> + "arguments":{"mode":"cpr-transfer"}}
> +
> + {"execute": "migrate", "arguments": { "channels": [
> + {"channel-type": "main",
> + "addr": { "transport": "socket", "type": "inet",
> + "host": "0", "port": "44444" }},
> + {"channel-type": "cpr",
> + "addr": { "transport": "socket", "type": "unix",
> + "path": "cpr.sock" }}]}}
> +
> + QEMU 10.0.50 monitor
> + (qemu) info status
> + VM status: paused (inmigrate)
> + (qemu) device_add pcie-root-port
> + (qemu) migrate_incoming tcp:0:44444
> + (qemu) info status
> + VM status: running
> +
> + {"execute": "query-status"}
> + {"return": {"status": "postmigrate",
> + "running": false}}
> +
> +Futures
> +^^^^^^^
> +
> +cpr-transfer mode is based on a capability to transfer open file
> +descriptors from old to new QEMU. In the future, descriptors for
> +vfio, iommufd, vhost, and char devices could be transferred,
> +preserving those devices and their kernel state without interruption,
> +even if they do not explicitly support live migration.
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 19/19] migration: cpr-transfer documentation
2024-12-02 13:20 ` [PATCH V4 19/19] migration: cpr-transfer documentation Steve Sistare
2024-12-18 21:03 ` Steven Sistare
@ 2024-12-19 17:02 ` Peter Xu
2024-12-19 22:35 ` Steven Sistare
1 sibling, 1 reply; 78+ messages in thread
From: Peter Xu @ 2024-12-19 17:02 UTC (permalink / raw)
To: Steve Sistare
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On Mon, Dec 02, 2024 at 05:20:11AM -0800, Steve Sistare wrote:
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
I think this one is already good enough, so:
Reviewed-by: Peter Xu <peterx@redhat.com>
But still, a few comments inline.
> ---
> docs/devel/migration/CPR.rst | 176 ++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 174 insertions(+), 2 deletions(-)
>
> diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst
> index 63c3647..a8a57c0 100644
> --- a/docs/devel/migration/CPR.rst
> +++ b/docs/devel/migration/CPR.rst
> @@ -5,7 +5,7 @@ CPR is the umbrella name for a set of migration modes in which the
> VM is migrated to a new QEMU instance on the same host. It is
> intended for use when the goal is to update host software components
> that run the VM, such as QEMU or even the host kernel. At this time,
> -cpr-reboot is the only available mode.
> +the cpr-reboot and cpr-transfer modes are available.
>
> Because QEMU is restarted on the same host, with access to the same
> local devices, CPR is allowed in certain cases where normal migration
> @@ -53,7 +53,7 @@ RAM is copied to the migration URI.
> Outgoing:
> * Set the migration mode parameter to ``cpr-reboot``.
> * Set the ``x-ignore-shared`` capability if desired.
> - * Issue the ``migrate`` command. It is recommended the the URI be a
> + * Issue the ``migrate`` command. It is recommended the URI be a
> ``file`` type, but one can use other types such as ``exec``,
> provided the command captures all the data from the outgoing side,
> and provides all the data to the incoming side.
> @@ -145,3 +145,175 @@ Caveats
>
> cpr-reboot mode may not be used with postcopy, background-snapshot,
> or COLO.
> +
> +cpr-transfer mode
> +-----------------
> +
> +This mode allows the user to transfer a guest to a new QEMU instance
> +on the same host with minimal guest pause time, by preserving guest
> +RAM in place, albeit with new virtual addresses in new QEMU.
IMHO it's important to have some words discussing the difference
vs. ignore-shared, because from the above it's indistinguishable from it.
The important bit could be that cpr-transfer allows pinning. If that's too
internal a concept, we could mention "unlike an ignore-shared migration,
cpr-transfer will start to enable local migrations to work seamlessly with
all kinds of device assignments like VFIO, vDPA and so on".
> +
> +The user starts new QEMU on the same host as old QEMU, with the
> +same arguments as old QEMU, plus the ``-incoming`` option. The user
> +issues the migrate command to old QEMU, which stops the VM, saves
> +state to the migration channels, and enters the postmigrate state.
> +Execution resumes in new QEMU.
> +
> +This mode requires a second migration channel type "cpr" in the
> +channel arguments on the outgoing side. The channel must be a type,
> +such as unix socket, that supports SCM_RIGHTS. However, the cpr
> +channel cannot be added to the list of channels for a migrate-incoming
> +command, because it must be read before new QEMU opens a monitor.
> +Instead, the user passes the channel as a second -incoming
> +command-line argument to new QEMU using JSON syntax.
> +
> +Usage
> +^^^^^
> +
> +Memory backend objects must have the ``share=on`` attribute.
> +
> +The VM must be started with the ``-machine aux-ram-share=on``
> +option. This causes implicit RAM blocks (those not described by
> +a memory-backend object) to be allocated by mmap'ing a memfd.
> +Examples include VGA and ROM.
> +
> +Outgoing:
> + * Set the migration mode parameter to ``cpr-transfer``.
> + * Issue the ``migrate`` command, containing a main channel and
> + a cpr channel.
> +
> +Incoming:
> + * Start new QEMU with two ``-incoming`` options.
> + * If the VM was running when the outgoing ``migrate`` command was
> + issued, then QEMU automatically resumes VM execution.
> +
> +Caveats
> +^^^^^^^
> +
> +cpr-transfer mode may not be used with postcopy, background-snapshot,
> +or COLO.
Maybe we can even remove this line. It's not like someone could think
about supporting any of the above; they just don't apply in the cpr context.
> +
> +memory-backend-epc is not supported.
> +
> +The main incoming migration channel cannot be a file type.
> +
> +If the main incoming migration channel is a tcp type, then the port
> +cannot be 0 (meaning dynamically choose a port).
> +
> +When using ``-incoming defer``, you must issue the migrate command to
> +old QEMU before issuing any monitor commands to new QEMU, because new
> +QEMU blocks waiting to read from the cpr channel before starting its
> +monitor, and old QEMU does not write to the channel until the migrate
> +command is issued. However, new QEMU does not open and read the
> +main migration channel until you issue the migrate incoming command.
> +
> +Example 1: incoming channel
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +In these examples, we simply restart the same version of QEMU, but
> +in a real scenario one would start new QEMU on the incoming side.
> +Note that new QEMU does not print the monitor prompt until old QEMU
> +has issued the migrate command. The outgoing side uses QMP because
> +HMP cannot specify a CPR channel. Some QMP responses are omitted for
> +brevity.
> +
> +::
> +
> + Outgoing: Incoming:
> +
> + # qemu-kvm -qmp stdio
> + -object memory-backend-file,id=ram0,size=4G,
> + mem-path=/dev/shm/ram0,share=on -m 4G
> + -machine aux-ram-share=on
> + ...
> + # qemu-kvm -monitor stdio
> + -incoming tcp:0:44444
> + -incoming '{"channel-type": "cpr",
> + "addr": { "transport": "socket",
> + "type": "unix", "path": "cpr.sock"}}'
> + ...
> + {"execute":"qmp_capabilities"}
> +
> + {"execute": "query-status"}
> + {"return": {"status": "running",
> + "running": true}}
> +
> + {"execute":"migrate-set-parameters",
> + "arguments":{"mode":"cpr-transfer"}}
> +
> + {"execute": "migrate", "arguments": { "channels": [
> + {"channel-type": "main",
> + "addr": { "transport": "socket", "type": "inet",
> + "host": "0", "port": "44444" }},
> + {"channel-type": "cpr",
> + "addr": { "transport": "socket", "type": "unix",
> + "path": "cpr.sock" }}]}}
> +
> + QEMU 10.0.50 monitor
> + (qemu) info status
> + VM status: running
> +
> + {"execute": "query-status"}
> + {"return": {"status": "postmigrate",
> + "running": false}}
> +
> +Example 2: incoming defer
> +^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +This example uses ``-incoming defer`` to hot plug a device before
> +accepting the main migration channel. Again note you must issue the
> +migrate command to old QEMU before you can issue any monitor
> +commands to new QEMU.
> +
> +
> +::
> +
> + Outgoing: Incoming:
> +
> + # qemu-kvm -monitor stdio
> + -object memory-backend-file,id=ram0,size=4G,
> + mem-path=/dev/shm/ram0,share=on -m 4G
> + -machine aux-ram-share=on
> + ...
> + # qemu-kvm -monitor stdio
> + -incoming defer
> + -incoming '{"channel-type": "cpr",
> + "addr": { "transport": "socket",
> + "type": "unix", "path": "cpr.sock"}}'
> + ...
> + {"execute":"qmp_capabilities"}
> +
> + {"execute": "device_add",
> + "arguments": {"driver": "pcie-root-port"}}
> +
> + {"execute":"migrate-set-parameters",
> + "arguments":{"mode":"cpr-transfer"}}
> +
> + {"execute": "migrate", "arguments": { "channels": [
> + {"channel-type": "main",
> + "addr": { "transport": "socket", "type": "inet",
> + "host": "0", "port": "44444" }},
> + {"channel-type": "cpr",
> + "addr": { "transport": "socket", "type": "unix",
> + "path": "cpr.sock" }}]}}
> +
> + QEMU 10.0.50 monitor
> + (qemu) info status
> + VM status: paused (inmigrate)
> + (qemu) device_add pcie-root-port
> + (qemu) migrate_incoming tcp:0:44444
> + (qemu) info status
> + VM status: running
> +
> + {"execute": "query-status"}
> + {"return": {"status": "postmigrate",
> + "running": false}}
> +
> +Futures
> +^^^^^^^
> +
> +cpr-transfer mode is based on a capability to transfer open file
> +descriptors from old to new QEMU. In the future, descriptors for
> +vfio, iommufd, vhost, and char devices could be transferred,
> +preserving those devices and their kernel state without interruption,
> +even if they do not explicitly support live migration.
> --
> 1.8.3.1
>
--
Peter Xu
^ permalink raw reply [flat|nested] 78+ messages in thread
* Re: [PATCH V4 19/19] migration: cpr-transfer documentation
2024-12-19 17:02 ` Peter Xu
@ 2024-12-19 22:35 ` Steven Sistare
0 siblings, 0 replies; 78+ messages in thread
From: Steven Sistare @ 2024-12-19 22:35 UTC (permalink / raw)
To: Peter Xu
Cc: qemu-devel, Fabiano Rosas, David Hildenbrand, Marcel Apfelbaum,
Eduardo Habkost, Philippe Mathieu-Daude, Paolo Bonzini,
Daniel P. Berrange, Markus Armbruster
On 12/19/2024 12:02 PM, Peter Xu wrote:
> On Mon, Dec 02, 2024 at 05:20:11AM -0800, Steve Sistare wrote:
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>
> I think this one is already good enough, so:
>
> Reviewed-by: Peter Xu <peterx@redhat.com>
>
> But still, a few comments inline.
>
>> ---
>> docs/devel/migration/CPR.rst | 176 ++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 174 insertions(+), 2 deletions(-)
>>
>> diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst
>> index 63c3647..a8a57c0 100644
>> --- a/docs/devel/migration/CPR.rst
>> +++ b/docs/devel/migration/CPR.rst
>> @@ -5,7 +5,7 @@ CPR is the umbrella name for a set of migration modes in which the
>> VM is migrated to a new QEMU instance on the same host. It is
>> intended for use when the goal is to update host software components
>> that run the VM, such as QEMU or even the host kernel. At this time,
>> -cpr-reboot is the only available mode.
>> +the cpr-reboot and cpr-transfer modes are available.
>>
>> Because QEMU is restarted on the same host, with access to the same
>> local devices, CPR is allowed in certain cases where normal migration
>> @@ -53,7 +53,7 @@ RAM is copied to the migration URI.
>> Outgoing:
>> * Set the migration mode parameter to ``cpr-reboot``.
>> * Set the ``x-ignore-shared`` capability if desired.
>> - * Issue the ``migrate`` command. It is recommended the the URI be a
>> + * Issue the ``migrate`` command. It is recommended the URI be a
>> ``file`` type, but one can use other types such as ``exec``,
>> provided the command captures all the data from the outgoing side,
>> and provides all the data to the incoming side.
>> @@ -145,3 +145,175 @@ Caveats
>>
>> cpr-reboot mode may not be used with postcopy, background-snapshot,
>> or COLO.
>> +
>> +cpr-transfer mode
>> +-----------------
>> +
>> +This mode allows the user to transfer a guest to a new QEMU instance
>> +on the same host with minimal guest pause time, by preserving guest
>> +RAM in place, albeit with new virtual addresses in new QEMU.
>
> IMHO it's important to have some words discussing the diff
> v.s. ignore-shared, because from above it's undistinguishable from it.
>
> The important bit could be that cpr-transfer allows pinning. If that's too
> internal a concept, we could mention "unlike an ignore-shared migration,
> cpr-transfer will start to enable local migrations to work seamlessly with
> all kinds of device assignments like VFIO, vDPA and so on".
OK, I'll add something like that.
>> +
>> +The user starts new QEMU on the same host as old QEMU, with the
>> +same arguments as old QEMU, plus the ``-incoming`` option. The user
>> +issues the migrate command to old QEMU, which stops the VM, saves
>> +state to the migration channels, and enters the postmigrate state.
>> +Execution resumes in new QEMU.
>> +
>> +This mode requires a second migration channel type "cpr" in the
>> +channel arguments on the outgoing side. The channel must be a type,
>> +such as unix socket, that supports SCM_RIGHTS. However, the cpr
>> +channel cannot be added to the list of channels for a migrate-incoming
>> +command, because it must be read before new QEMU opens a monitor.
>> +Instead, the user passes the channel as a second -incoming
>> +command-line argument to new QEMU using JSON syntax.
>> +
>> +Usage
>> +^^^^^
>> +
>> +Memory backend objects must have the ``share=on`` attribute.
>> +
>> +The VM must be started with the ``-machine aux-ram-share=on``
>> +option. This causes implicit RAM blocks (those not described by
>> +a memory-backend object) to be allocated by mmap'ing a memfd.
>> +Examples include VGA and ROM.
>> +
>> +Outgoing:
>> + * Set the migration mode parameter to ``cpr-transfer``.
>> + * Issue the ``migrate`` command, containing a main channel and
>> + a cpr channel.
>> +
>> +Incoming:
>> + * Start new QEMU with two ``-incoming`` options.
>> + * If the VM was running when the outgoing ``migrate`` command was
>> + issued, then QEMU automatically resumes VM execution.
>> +
>> +Caveats
>> +^^^^^^^
>> +
>> +cpr-transfer mode may not be used with postcopy, background-snapshot,
>> +or COLO.
>
> Maybe we can even remove this line. It's not like someone could think
> about supporting any of above; they just don't apply in cpr context.
The same caveat is listed for cpr-reboot mode. We should either delete both
or keep both. But IMO many aspects of migration are confusing for beginners,
and more detail rather than less would be helpful.
- Steve
>> +
>> +memory-backend-epc is not supported.
>> +
>> +The main incoming migration channel cannot be a file type.
>> +
>> +If the main incoming migration channel is a tcp type, then the port
>> +cannot be 0 (meaning dynamically choose a port).
>> +
>> +When using ``-incoming defer``, you must issue the migrate command to
>> +old QEMU before issuing any monitor commands to new QEMU, because new
>> +QEMU blocks waiting to read from the cpr channel before starting its
>> +monitor, and old QEMU does not write to the channel until the migrate
>> +command is issued. However, new QEMU does not open and read the
>> +main migration channel until you issue the migrate incoming command.
>> +
>> +Example 1: incoming channel
>> +^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +In these examples, we simply restart the same version of QEMU, but
>> +in a real scenario one would start new QEMU on the incoming side.
>> +Note that new QEMU does not print the monitor prompt until old QEMU
>> +has issued the migrate command. The outgoing side uses QMP because
>> +HMP cannot specify a CPR channel. Some QMP responses are omitted for
>> +brevity.
>> +
>> +::
>> +
>> + Outgoing: Incoming:
>> +
>> + # qemu-kvm -qmp stdio
>> + -object memory-backend-file,id=ram0,size=4G,
>> + mem-path=/dev/shm/ram0,share=on -m 4G
>> + -machine aux-ram-share=on
>> + ...
>> + # qemu-kvm -monitor stdio
>> + -incoming tcp:0:44444
>> + -incoming '{"channel-type": "cpr",
>> + "addr": { "transport": "socket",
>> + "type": "unix", "path": "cpr.sock"}}'
>> + ...
>> + {"execute":"qmp_capabilities"}
>> +
>> + {"execute": "query-status"}
>> + {"return": {"status": "running",
>> + "running": true}}
>> +
>> + {"execute":"migrate-set-parameters",
>> + "arguments":{"mode":"cpr-transfer"}}
>> +
>> + {"execute": "migrate", "arguments": { "channels": [
>> + {"channel-type": "main",
>> + "addr": { "transport": "socket", "type": "inet",
>> + "host": "0", "port": "44444" }},
>> + {"channel-type": "cpr",
>> + "addr": { "transport": "socket", "type": "unix",
>> + "path": "cpr.sock" }}]}}
>> +
>> + QEMU 10.0.50 monitor
>> + (qemu) info status
>> + VM status: running
>> +
>> + {"execute": "query-status"}
>> + {"return": {"status": "postmigrate",
>> + "running": false}}
>> +
>> +Example 2: incoming defer
>> +^^^^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +This example uses ``-incoming defer`` to hot plug a device before
>> +accepting the main migration channel. Again note you must issue the
>> +migrate command to old QEMU before you can issue any monitor
>> +commands to new QEMU.
>> +
>> +
>> +::
>> +
>> + Outgoing: Incoming:
>> +
>> + # qemu-kvm -monitor stdio
>> + -object memory-backend-file,id=ram0,size=4G,
>> + mem-path=/dev/shm/ram0,share=on -m 4G
>> + -machine aux-ram-share=on
>> + ...
>> + # qemu-kvm -monitor stdio
>> + -incoming defer
>> + -incoming '{"channel-type": "cpr",
>> + "addr": { "transport": "socket",
>> + "type": "unix", "path": "cpr.sock"}}'
>> + ...
>> + {"execute":"qmp_capabilities"}
>> +
>> + {"execute": "device_add",
>> + "arguments": {"driver": "pcie-root-port"}}
>> +
>> + {"execute":"migrate-set-parameters",
>> + "arguments":{"mode":"cpr-transfer"}}
>> +
>> + {"execute": "migrate", "arguments": { "channels": [
>> + {"channel-type": "main",
>> + "addr": { "transport": "socket", "type": "inet",
>> + "host": "0", "port": "44444" }},
>> + {"channel-type": "cpr",
>> + "addr": { "transport": "socket", "type": "unix",
>> + "path": "cpr.sock" }}]}}
>> +
>> + QEMU 10.0.50 monitor
>> + (qemu) info status
>> + VM status: paused (inmigrate)
>> + (qemu) device_add pcie-root-port
>> + (qemu) migrate_incoming tcp:0:44444
>> + (qemu) info status
>> + VM status: running
>> +
>> + {"execute": "query-status"}
>> + {"return": {"status": "postmigrate",
>> + "running": false}}
>> +
>> +Futures
>> +^^^^^^^
>> +
>> +cpr-transfer mode is based on a capability to transfer open file
>> +descriptors from old to new QEMU. In the future, descriptors for
>> +vfio, iommufd, vhost, and char devices could be transferred,
>> +preserving those devices and their kernel state without interruption,
>> +even if they do not explicitly support live migration.
>> --
>> 1.8.3.1
>>
>
^ permalink raw reply [flat|nested] 78+ messages in thread