CRIU (Checkpoint/Restore in Userspace) mailing list
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
To: criu@lists.linux.dev
Cc: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>,
	David Francis <David.Francis@amd.com>
Subject: [PATCH v2 18/23] plugins/amdgpu: Move drm file dump and restore into helpers
Date: Fri, 10 Apr 2026 19:55:09 +0100	[thread overview]
Message-ID: <20260410185514.51153-19-tvrtko.ursulin@igalia.com> (raw)
In-Reply-To: <20260410185514.51153-1-tvrtko.ursulin@igalia.com>

Given that dump and restore of KFD vs DRM fork into completely parallel
flows inside the respective plugin callbacks, let's split the DRM paths out
by extracting them into helpers. This de-clutters the code flow and makes it
obvious which part is done where.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Reviewed-By: David Francis <David.Francis@amd.com>
---
 plugins/amdgpu/amdgpu_plugin.c        | 275 ++++++++++++++------------
 plugins/amdgpu/amdgpu_plugin_dmabuf.c |   2 +
 2 files changed, 146 insertions(+), 131 deletions(-)

diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index f00b60453c6c..2e3f997266c0 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -1397,6 +1397,30 @@ exit:
 	return ret;
 }
 
+static int amdgpu_plugin_dump_drm_file(int fd, int id, struct stat *st)
+{
+	int ret;
+
+	/* This is RenderD dumper plugin, for now just save renderD
+	 * minor number to be used during restore. In later phases this
+	 * needs to save more data for video decode etc.
+	 */
+	ret = amdgpu_plugin_drm_dump_file(fd, id, st);
+	if (ret)
+		return ret;
+
+	ret = record_dumped_fd(fd, true);
+	if (ret)
+		return ret;
+
+	ret = try_dump_dmabuf_list();
+
+	if (!ret)
+		ret = amdgpu_add_to_inventory();
+
+	return ret;
+}
+
 int amdgpu_plugin_dump_file(int fd, int id)
 {
 	struct kfd_ioctl_criu_args args = { 0 };
@@ -1435,35 +1459,14 @@ int amdgpu_plugin_dump_file(int fd, int id)
 		pr_perror("Failed to get dmabuf info");
 		return -1;
 	}
-	if (ret == 0) {
-		pr_info("Dumping dmabuf fd = %d\n", fd);
-		return amdgpu_plugin_dmabuf_dump(fd, id);
-	}
-
-	if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
-
-		/* This is RenderD dumper plugin, for now just save renderD
-		 * minor number to be used during restore. In later phases this
-		 * needs to save more data for video decode etc.
-		 */
-		ret = amdgpu_plugin_drm_dump_file(fd, id, &st);
-		if (ret)
-			return ret;
-
-		ret = record_dumped_fd(fd, true);
-		if (ret)
-			return ret;
-
-		ret = try_dump_dmabuf_list();
-
-		if (!ret)
-			ret = amdgpu_add_to_inventory();
-
-		return ret;
-	}
 
 	pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev));
 
+	if (ret == 0)
+		return amdgpu_plugin_dmabuf_dump(fd, id);
+	else if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0)
+		return amdgpu_plugin_dump_drm_file(fd, id, &st);
+
 	/* KFD only allows ioctl calls from the same process that opened the KFD file descriptor.
 	 * The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with
 	 * CAP_CHECKPOINT_RESTORE/CAP_SYS_ADMIN. So kernel_supports_criu() needs to open its own file descriptor to
@@ -1835,12 +1838,124 @@ exit:
 	return ret;
 }
 
+static int amdgpu_plugin_restore_drm_file(int id, bool *retry_needed)
+{
+	char img_path[PATH_MAX];
+	struct tp_node *tp_node;
+	uint32_t target_gpu_id;
+	CriuRenderNode *rd;
+	unsigned char *buf;
+	size_t img_size;
+	FILE *img_fp;
+	int fd, ret;
+
+	/* This is restorer plugin for renderD nodes. Criu doesn't guarantee that they will
+	 * be called before the plugin is called for kfd file descriptor.
+	 * TODO: Currently, this code will only work if this function is called for /dev/kfd
+	 * first as we assume restore_maps is already filled. Need to fix this later.
+	 */
+	snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
+
+	img_fp = open_img_file(img_path, false, &img_size, true);
+	if (!img_fp) {
+		ret = amdgpu_plugin_dmabuf_restore(id);
+		if (ret == 1) {
+			/* This is a dmabuf fd, but the corresponding buffer object that was
+			 * exported to make it has not yet been restored. Need to try again
+			 * later when the buffer object exists, so it can be re-exported.
+			 */
+			*retry_needed = true;
+			return 0;
+		}
+		return ret;
+	}
+	pr_info("Restoring RenderD %s\n", img_path);
+	pr_debug("RenderD Image file size:%ld\n", img_size);
+	buf = xmalloc(img_size);
+	if (!buf) {
+		pr_perror("Failed to allocate memory");
+		return -ENOMEM;
+	}
+
+	ret = read_fp(img_fp, buf, img_size);
+	if (ret) {
+		pr_perror("Unable to read from %s", img_path);
+		xfree(buf);
+		return -1;
+	}
+
+	rd = criu_render_node__unpack(NULL, img_size, buf);
+	if (rd == NULL) {
+		pr_perror("Unable to parse the RenderD message %d", id);
+		xfree(buf);
+		fclose(img_fp);
+		return -1;
+	}
+	fclose(img_fp);
+
+	pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id);
+
+	target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id);
+	if (!target_gpu_id) {
+		fd = -ENODEV;
+		goto fail;
+	}
+
+	tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
+	if (!tp_node) {
+		fd = -ENODEV;
+		goto fail;
+	}
+
+	pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
+
+	fd = node_get_drm_render_device(tp_node);
+	if (fd < 0) {
+		pr_err("Failed to open render device (minor:%d) - %s\n",
+		       tp_node->drm_render_minor, strerror(-fd));
+		goto fail;
+	}
+
+	ret = amdgpu_plugin_drm_restore_file(fd, rd);
+	if (ret == 1)
+		*retry_needed = true;
+	if (ret < 0) {
+		fd = ret;
+		goto fail;
+	}
+fail:
+	criu_render_node__free_unpacked(rd, NULL);
+	xfree(buf);
+	/*
+	 * We need to use the file descriptor used to create the BOs for mmap later, otherwise the kernel DRM
+	 * drivers will not allow the mmap. Therefore, we keep a copy of the file descriptor (stored in tp_node)
+	 * so that we can return it in amdgpu_plugin_update_vmamap later. Also, CRIU core will dup and close the
+	 * returned fd after this function returns, and this will make our fd invalid. So we return a dup'ed
+	 * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
+	 * tp_node.
+	 */
+
+	if (fd < 0)
+		return fd;
+
+	if (!(*retry_needed)) {
+		fd = dup(fd);
+		if (fd == -1) {
+			pr_perror("unable to duplicate the render fd");
+			return -1;
+		}
+		return fd;
+	}
+
+	return 0;
+
+}
+
 int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 {
 	int ret = 0, fd;
 	char img_path[PATH_MAX];
 	unsigned char *buf;
-	CriuRenderNode *rd;
 	CriuKfd *e = NULL;
 	struct kfd_ioctl_criu_args args = { 0 };
 	size_t img_size;
@@ -1856,110 +1971,8 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 	snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
 
 	img_fp = open_img_file(img_path, false, &img_size, false);
-	if (!img_fp) {
-		struct tp_node *tp_node;
-		uint32_t target_gpu_id;
-
-		/* This is restorer plugin for renderD nodes. Criu doesn't guarantee that they will
-		 * be called before the plugin is called for kfd file descriptor.
-		 * TODO: Currently, this code will only work if this function is called for /dev/kfd
-		 * first as we assume restore_maps is already filled. Need to fix this later.
-		 */
-		snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
-
-		img_fp = open_img_file(img_path, false, &img_size, true);
-		if (!img_fp) {
-			ret = amdgpu_plugin_dmabuf_restore(id);
-			if (ret == 1) {
-				/* This is a dmabuf fd, but the corresponding buffer object that was
-				 * exported to make it has not yet been restored. Need to try again
-				 * later when the buffer object exists, so it can be re-exported.
-				 */
-				*retry_needed = true;
-				return 0;
-			}
-			return ret;
-		}
-		pr_info("Restoring RenderD %s\n", img_path);
-		pr_debug("RenderD Image file size:%ld\n", img_size);
-		buf = xmalloc(img_size);
-		if (!buf) {
-			pr_perror("Failed to allocate memory");
-			return -ENOMEM;
-		}
-
-		ret = read_fp(img_fp, buf, img_size);
-		if (ret) {
-			pr_perror("Unable to read from %s", img_path);
-			xfree(buf);
-			return -1;
-		}
-
-		rd = criu_render_node__unpack(NULL, img_size, buf);
-		if (rd == NULL) {
-			pr_perror("Unable to parse the RenderD message %d", id);
-			xfree(buf);
-			fclose(img_fp);
-			return -1;
-		}
-		fclose(img_fp);
-
-		pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id);
-
-		target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id);
-		if (!target_gpu_id) {
-			fd = -ENODEV;
-			goto fail;
-		}
-
-		tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
-		if (!tp_node) {
-			fd = -ENODEV;
-			goto fail;
-		}
-
-		pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
-
-		fd = node_get_drm_render_device(tp_node);
-		if (fd < 0) {
-			pr_err("Failed to open render device (minor:%d) - %s\n",
-			       tp_node->drm_render_minor, strerror(-fd));
-			return -1;
-		}
-
-		ret = amdgpu_plugin_drm_restore_file(fd, rd);
-		if (ret == 1)
-			*retry_needed = true;
-		if (ret < 0) {
-			fd = ret;
-			goto fail;
-		}
-	fail:
-		criu_render_node__free_unpacked(rd, NULL);
-		xfree(buf);
-		/*
-		 * We need to use the file descriptor used to create the BOs for mmap later, otherwise the kernel DRM
-		 * drivers will not allow the mmap. Therefore, we keep a copy of the file descriptor (stored in tp_node)
-		 * so that we can return it in amdgpu_plugin_update_vmamap later. Also, CRIU core will dup and close the
-		 * returned fd after this function returns, and this will make our fd invalid. So we return a dup'ed
-		 * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
-		 * tp_node.
-		 */
-
-		if (fd < 0)
-			return fd;
-
-		if (!(*retry_needed)) {
-			fd = dup(fd);
-			if (fd == -1) {
-				pr_perror("unable to duplicate the render fd");
-				return -1;
-			}
-			return fd;
-		}
-
-		return 0;
-	}
+	if (!img_fp)
+		return amdgpu_plugin_restore_drm_file(id, retry_needed);
 
 	fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
 	if (fd < 0) {
diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c
index 312c9f95d048..00181d46ca5c 100644
--- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c
+++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c
@@ -164,6 +164,8 @@ int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
 {
 	int ret;
 
+	pr_info("Dumping dmabuf fd %d\n", dmabuf_fd);
+
 	ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id);
 	if (ret == -EAGAIN) {
 		struct dmabuf *b = xmalloc(sizeof(*b));
-- 
2.52.0


  parent reply	other threads:[~2026-04-10 18:55 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-10 18:54 [PATCH v2 00/23] Amdgpu plugin cleanups and fixes Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 01/23] plugins/amgdpu: Fix one error message Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 02/23] plugins/amdgpu: Remove unused current_pid global variable Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 03/23] plugins/amdgpu: Remove unused new_minor from struct vma_metadata Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 04/23] plugins/amdgpu: Fix drm pages size header Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 05/23] plugins/amdgpu: Fix logging of failures to open files during restore init Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 06/23] plugins/amdgpu: Propagate failure to save buffer object content Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 07/23] plugins/amdgpu: Close the directory when image probing fails Tvrtko Ursulin
2026-04-10 18:54 ` [PATCH v2 08/23] plugins/amdgpu: Close dma-buf image file if the read fails Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 09/23] plugins/amdgpu: Flatten amdgpu_restore_init a bit Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 10/23] plugins/amdgpu: Add error handling for seek operations Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 11/23] plugins/amdgpu: Consolidate vm_info collection Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 12/23] plugins/amdgpu: Remove plugin_log_msg() Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 13/23] plugins/amdgpu: Reduce amount of debug logging a little bit Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 14/23] plugins/amdgpu: Do not eat the errno in kmtIoctl Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 15/23] plugins/amdgpu: Fix open_drm_render_device() Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 16/23] plugins/amdgpu: Check sdma operation type early and once Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 17/23] plugins/amdgpu: Add plugin to inventory even if process has no vmas Tvrtko Ursulin
2026-04-10 18:55 ` Tvrtko Ursulin [this message]
2026-04-10 18:55 ` [PATCH v2 19/23] plugins/amdgpu: Use the load_img helper in drm file restore Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 20/23] plugins/amdgpu: Convert away from libc buffered file IO Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 21/23] plugins/amdgpu: Use save_vma_updates for all call sites Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 22/23] plugins/amdgpu: amdgpu_plugin_drm_restore_file() does not need to use libdrm Tvrtko Ursulin
2026-04-10 18:55 ` [PATCH v2 23/23] plugins/amdgpu: Fix remaining wrong usages of pr_perror Tvrtko Ursulin
2026-04-13 18:23 ` [PATCH v2 00/23] Amdgpu plugin cleanups and fixes Andrei Vagin
2026-04-13 19:47   ` Tvrtko Ursulin
2026-04-13 20:03     ` Andrei Vagin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260410185514.51153-19-tvrtko.ursulin@igalia.com \
    --to=tvrtko.ursulin@igalia.com \
    --cc=David.Francis@amd.com \
    --cc=criu@lists.linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox