From: David Francis <David.Francis@amd.com>
To: <criu@lists.linux.dev>
Cc: <tvrtko.ursulin@igalia.com>, David Francis <David.Francis@amd.com>
Subject: [PATCH 2/3] plugin/amdgpu: Add topology dump file
Date: Fri, 10 Apr 2026 10:45:08 -0400 [thread overview]
Message-ID: <20260410144509.738903-3-David.Francis@amd.com> (raw)
In-Reply-To: <20260410144509.738903-1-David.Francis@amd.com>
The state of the source topology (the GPUs, CPUs, and links
between them) is saved by the plugin as part of kfd dump.
If there is no kfd dump, we need to save the topology anyway.
Do so in new file amdgpu-topology.img.
Signed-off-by: David Francis <David.Francis@amd.com>
---
plugins/amdgpu/amdgpu_plugin.c | 84 ++++++++++++++++++++++++++---
plugins/amdgpu/amdgpu_plugin_drm.c | 64 ++++++++++++++++++++--
plugins/amdgpu/amdgpu_plugin_util.h | 9 ++++
plugins/amdgpu/criu-amdgpu.proto | 5 ++
4 files changed, 151 insertions(+), 11 deletions(-)
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 89ab10dac..1e9785440 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -91,6 +91,9 @@ int current_pid;
*/
bool parallel_disabled = false;
+bool kfd_dump_complete = false;
+bool amdgpu_topology_dump_complete = false;
+
pthread_t parallel_thread = 0;
int parallel_thread_result = 0;
/**************************************************************************************************/
@@ -189,9 +192,14 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDevi
devinfo->node_id = node->id;
if (NODE_IS_GPU(node)) {
- devinfo->gpu_id = maps_get_dest_gpu(maps, node->gpu_id);
- if (!devinfo->gpu_id)
- continue;
+ if (maps) {
+ devinfo->gpu_id = maps_get_dest_gpu(maps, node->gpu_id);
+ if (!devinfo->gpu_id)
+ continue;
+ } else {
+ devinfo->gpu_id = node->gpu_id;
+ }
+
devinfo->simd_count = node->simd_count;
devinfo->mem_banks_count = node->mem_banks_count;
@@ -238,9 +246,13 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDevi
if (!iolink->valid)
continue;
- list_for_each_entry(node2, &sys->nodes, listm_system)
- if (node2->id == iolink->node_to_id && maps_get_dest_gpu(maps, node2->gpu_id) != 0)
- link_to_present_node = true;
+ if (maps) {
+ list_for_each_entry(node2, &sys->nodes, listm_system)
+ if (node2->id == iolink->node_to_id && maps_get_dest_gpu(maps, node2->gpu_id) != 0)
+ link_to_present_node = true;
+ } else {
+ link_to_present_node = true;
+ }
if (!link_to_present_node)
continue;
@@ -386,6 +398,11 @@ int amdgpu_plugin_init(int stage)
maps_init(&checkpoint_maps);
maps_init(&restore_maps);
+ if (stage == CR_PLUGIN_STAGE__DUMP) {
+ kfd_dump_complete = false;
+ amdgpu_topology_dump_complete = false;
+ }
+
if (stage == CR_PLUGIN_STAGE__RESTORE) {
if (has_children(root_item)) {
pr_info("Parallel restore disabled\n");
@@ -1552,6 +1569,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
if (ret)
goto exit;
+ kfd_dump_complete = true;
if (!plugin_added_to_inventory) {
ret = add_inventory_plugin(CR_PLUGIN_DESC.name);
if (ret) {
@@ -1908,6 +1926,60 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id);
+ if (list_empty(&restore_maps.cpu_maps) && list_empty(&restore_maps.gpu_maps)) {
+ AmdgpuDevinfo *ad;
+
+ pr_info("No restore maps found, making them from topology file\n");
+
+ img_fp = open_img_file(IMG_AMDGPU_TOPOLOGY_FILE, false, &img_size, true);
+ if (!img_fp) {
+ pr_err("Failed to find either kfd or amdgpu src topology information\n");
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ buf = xmalloc(img_size);
+ if (!buf) {
+ pr_err("Failed to allocate memory\n");
+ fclose(img_fp);
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ ret = read_fp(img_fp, buf, img_size);
+ if (ret) {
+ pr_err("Unable to read from %s\n", IMG_AMDGPU_TOPOLOGY_FILE);
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ ad = amdgpu_devinfo__unpack(NULL, img_size, buf);
+ if (ad == NULL) {
+ pr_err("Unable to parse the amdgpu topology message\n");
+ fclose(img_fp);
+ ret = -EINVAL;
+ goto exit;
+ }
+ fclose(img_fp);
+
+ ret = devinfo_to_topology(ad->device_entries, ad->num_of_devices, &src_topology);
+ if (ret) {
+ pr_err("Failed to convert amdgpu device information to topology\n");
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ ret = topology_parse(&dest_topology, "Local");
+ if (ret) {
+ pr_err("Failed to parse local system topology\n");
+ goto exit;
+ }
+
+ ret = set_restore_gpu_maps(&src_topology, &dest_topology, &restore_maps);
+ if (ret) {
+ pr_err("Failed to map GPUs\n");
+ goto exit;
+ }
+ }
+
target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id);
if (!target_gpu_id) {
fd = -ENODEV;
diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c
index c1dfb2dd4..a4c650753 100644
--- a/plugins/amdgpu/amdgpu_plugin_drm.c
+++ b/plugins/amdgpu/amdgpu_plugin_drm.c
@@ -467,11 +467,65 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
return -ENODEV;
}
- /* Get the GPU_ID of the DRM device */
- rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
- if (!rd->gpu_id) {
- pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id);
- return -ENODEV;
+ if (kfd_dump_complete) {
+ /* Get the GPU_ID of the DRM device */
+ rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
+ if (!rd->gpu_id) {
+ pr_err("Failed to find valid gpu_id for the device = %d\n", tp_node->gpu_id);
+ return -ENODEV;
+ }
+ } else {
+ rd->gpu_id = tp_node->gpu_id;
+
+ if (!amdgpu_topology_dump_complete) {
+ AmdgpuDevinfo *ad = NULL;
+ unsigned char *buf;
+
+ ad = xmalloc(sizeof(*ad));
+ if (!ad) {
+ pr_err("Failed to allocate devinfo\n");
+ return -ENOMEM;
+ }
+ amdgpu_devinfo__init(ad);
+
+ ad->num_of_devices = src_topology.num_nodes;
+
+ ad->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * ad->num_of_devices);
+ if (!ad->device_entries) {
+ pr_err("Failed to allocate device_entries\n");
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < ad->num_of_devices; i++) {
+ KfdDeviceEntry *entry = xzalloc(sizeof(*entry));
+
+ if (!entry) {
+ pr_err("Failed to allocate entry\n");
+ return -ENOMEM;
+ }
+
+ kfd_device_entry__init(entry);
+
+ ad->device_entries[i] = entry;
+ ad->n_device_entries++;
+ }
+
+ topology_to_devinfo(&src_topology, NULL, ad->device_entries);
+
+ len = amdgpu_devinfo__get_packed_size(ad);
+
+ buf = xmalloc(len);
+ if (!buf) {
+ pr_err("Failed to allocate memory to store protobuf\n");
+ return -ENOMEM;
+ }
+
+ amdgpu_devinfo__pack(ad, buf);
+
+ ret = write_img_file(IMG_AMDGPU_TOPOLOGY_FILE, buf, len);
+ xfree(buf);
+ if (ret) {
+ pr_err("Failed to write image file %s\n", IMG_AMDGPU_TOPOLOGY_FILE);
+ return -EINVAL;
+ }
+
+ amdgpu_topology_dump_complete = true;
+ }
}
len = criu_render_node__get_packed_size(rd);
diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h
index 69b98a31c..ccfe30b49 100644
--- a/plugins/amdgpu/amdgpu_plugin_util.h
+++ b/plugins/amdgpu/amdgpu_plugin_util.h
@@ -2,6 +2,7 @@
#define __AMDGPU_PLUGIN_UTIL_H__
#include <libdrm/amdgpu.h>
+#include "criu-amdgpu.pb-c.h"
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
@@ -59,6 +60,9 @@
/* Name of file having serialized data of DRM device buffer objects (BOs) */
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
+/* Name of file containing the source device topology (generated only if IMG_KFD_FILE is not) */
+#define IMG_AMDGPU_TOPOLOGY_FILE "amdgpu-topology.img"
+
/* Helper macros to Checkpoint and Restore a ROCm file */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
#define HSAKMT_SHM "/hsakmt_shared_mem"
@@ -115,6 +119,9 @@ extern bool kfd_vram_size_check;
extern bool kfd_numa_check;
extern bool kfd_capability_check;
+extern bool kfd_dump_complete;
+extern bool amdgpu_topology_dump_complete;
+
int read_fp(FILE *fp, void *buf, const size_t buf_len);
int write_fp(FILE *fp, const void *buf, const size_t buf_len);
int read_file(const char *file_path, void *buf, const size_t buf_len);
@@ -142,4 +149,6 @@ int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
int serve_out_dmabuf_fd(int handle, int fd);
+int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries);
+
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */
diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto
index 7682a8f21..6e44e22aa 100644
--- a/plugins/amdgpu/criu-amdgpu.proto
+++ b/plugins/amdgpu/criu-amdgpu.proto
@@ -93,3 +93,8 @@ message criu_render_node {
message criu_dmabuf_node {
required uint32 gem_handle = 1;
}
+
+message amdgpu_devinfo {
+ required uint32 num_of_devices = 1;
+ repeated kfd_device_entry device_entries = 2;
+}
--
2.34.1
next prev parent reply other threads:[~2026-04-10 14:45 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-10 14:45 [PATCH 0/3] Patches to allow amdgpu restore without kfd file David Francis
2026-04-10 14:45 ` [PATCH 1/3] plugin/amdgpu: Add plugin to inventory even if there are no vmas David Francis
2026-04-10 15:46 ` Tvrtko Ursulin
2026-04-10 16:15 ` Francis, David
2026-04-10 14:45 ` David Francis [this message]
2026-04-16 14:54 ` [PATCH 2/3] plugin/amdgpu: Add topology dump file Tvrtko Ursulin
2026-04-16 18:26 ` Francis, David
2026-04-17 9:07 ` Tvrtko Ursulin
2026-04-17 13:21 ` Francis, David
2026-04-22 13:58 ` Tvrtko Ursulin
2026-04-10 14:45 ` [PATCH 3/3] plugins/amdgpu: Make next_fd without kfd David Francis
2026-04-10 15:42 ` [PATCH 0/3] Patches to allow amdgpu restore without kfd file Tvrtko Ursulin
2026-04-10 15:46 ` Francis, David
2026-04-10 16:04 ` Tvrtko Ursulin
2026-04-10 16:12 ` Francis, David
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260410144509.738903-3-David.Francis@amd.com \
--to=david.francis@amd.com \
--cc=criu@lists.linux.dev \
--cc=tvrtko.ursulin@igalia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox