AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Felix Kuehling <felix.kuehling@amd.com>
To: Al Viro <viro@zeniv.linux.org.uk>,
	amd-gfx@lists.freedesktop.org, "Deucher,
	Alexander" <Alexander.Deucher@amd.com>
Cc: linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH 2/2][RFC] amdkfd CRIU fixes
Date: Tue, 4 Jun 2024 14:16:00 -0400	[thread overview]
Message-ID: <1cd7980e-1cfa-4470-b712-48d9d2658435@amd.com> (raw)
In-Reply-To: <20240604021456.GB3053943@ZenIV>


On 2024-06-03 22:14, Al Viro wrote:
> Instead of trying to use close_fd() on failure exits, just have
> criu_get_prime_handle() store the file reference without inserting
> it into descriptor table.
>
> Then, once the callers are past the last failure exit, they can go
> and either insert all those file references into the corresponding
> slots of descriptor table, or drop all those file references and
> free the unused descriptors.
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Thank you for the patches and the explanation. One minor nit-pick 
inline. With that fixed, this patch is

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>

I can apply this patch to amd-staging-drm-next, if you want. See one 
comment inline ...


> ---
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index fdf171ad4a3c..3f129e1c0daa 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -36,7 +36,6 @@
>   #include <linux/mman.h>
>   #include <linux/ptrace.h>
>   #include <linux/dma-buf.h>
> -#include <linux/fdtable.h>
>   #include <linux/processor.h>
>   #include "kfd_priv.h"
>   #include "kfd_device_queue_manager.h"
> @@ -1857,7 +1856,8 @@ static uint32_t get_process_num_bos(struct kfd_process *p)
>   }
>   
>   static int criu_get_prime_handle(struct kgd_mem *mem,
> -				 int flags, u32 *shared_fd)
> +				 int flags, u32 *shared_fd,
> +				 struct file **file)
>   {
>   	struct dma_buf *dmabuf;
>   	int ret;
> @@ -1868,13 +1868,14 @@ static int criu_get_prime_handle(struct kgd_mem *mem,
>   		return ret;
>   	}
>   
> -	ret = dma_buf_fd(dmabuf, flags);
> +	ret = get_unused_fd_flags(flags);
>   	if (ret < 0) {
>   		pr_err("dmabuf create fd failed, ret:%d\n", ret);
>   		goto out_free_dmabuf;
>   	}
>   
>   	*shared_fd = ret;
> +	*file = dmabuf->file;
>   	return 0;
>   
>   out_free_dmabuf:
> @@ -1882,6 +1883,24 @@ static int criu_get_prime_handle(struct kgd_mem *mem,
>   	return ret;
>   }
>   
> +static void commit_files(struct file **files,
> +			 struct kfd_criu_bo_bucket *bo_buckets,
> +			 unsigned int count,
> +			 int err)
> +{
> +	while (count--) {
> +		struct file *file = files[count];
> +		if (!file)

checkpatch.pl would complain here without an empty line after the 
variable definition.

Regards,
   Felix


> +			continue;
> +		if (err) {
> +			fput(file);
> +			put_unused_fd(bo_buckets[count].dmabuf_fd);
> +		} else {
> +			fd_install(bo_buckets[count].dmabuf_fd, file);
> +		}
> +	}
> +}
> +
>   static int criu_checkpoint_bos(struct kfd_process *p,
>   			       uint32_t num_bos,
>   			       uint8_t __user *user_bos,
> @@ -1890,6 +1909,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
>   {
>   	struct kfd_criu_bo_bucket *bo_buckets;
>   	struct kfd_criu_bo_priv_data *bo_privs;
> +	struct file **files = NULL;
>   	int ret = 0, pdd_index, bo_index = 0, id;
>   	void *mem;
>   
> @@ -1903,6 +1923,12 @@ static int criu_checkpoint_bos(struct kfd_process *p,
>   		goto exit;
>   	}
>   
> +	files = kvzalloc(num_bos * sizeof(struct file *), GFP_KERNEL);
> +	if (!files) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
>   	for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
>   		struct kfd_process_device *pdd = p->pdds[pdd_index];
>   		struct amdgpu_bo *dumper_bo;
> @@ -1950,7 +1976,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
>   				ret = criu_get_prime_handle(kgd_mem,
>   						bo_bucket->alloc_flags &
>   						KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
> -						&bo_bucket->dmabuf_fd);
> +						&bo_bucket->dmabuf_fd, &files[bo_index]);
>   				if (ret)
>   					goto exit;
>   			} else {
> @@ -2001,12 +2027,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
>   	*priv_offset += num_bos * sizeof(*bo_privs);
>   
>   exit:
> -	while (ret && bo_index--) {
> -		if (bo_buckets[bo_index].alloc_flags
> -		    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> -			close_fd(bo_buckets[bo_index].dmabuf_fd);
> -	}
> -
> +	commit_files(files, bo_buckets, bo_index, ret);
> +	kvfree(files);
>   	kvfree(bo_buckets);
>   	kvfree(bo_privs);
>   	return ret;
> @@ -2358,7 +2380,8 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
>   
>   static int criu_restore_bo(struct kfd_process *p,
>   			   struct kfd_criu_bo_bucket *bo_bucket,
> -			   struct kfd_criu_bo_priv_data *bo_priv)
> +			   struct kfd_criu_bo_priv_data *bo_priv,
> +			   struct file **file)
>   {
>   	struct kfd_process_device *pdd;
>   	struct kgd_mem *kgd_mem;
> @@ -2410,7 +2433,7 @@ static int criu_restore_bo(struct kfd_process *p,
>   	if (bo_bucket->alloc_flags
>   	    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
>   		ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
> -					    &bo_bucket->dmabuf_fd);
> +					    &bo_bucket->dmabuf_fd, file);
>   		if (ret)
>   			return ret;
>   	} else {
> @@ -2427,6 +2450,7 @@ static int criu_restore_bos(struct kfd_process *p,
>   {
>   	struct kfd_criu_bo_bucket *bo_buckets = NULL;
>   	struct kfd_criu_bo_priv_data *bo_privs = NULL;
> +	struct file **files = NULL;
>   	int ret = 0;
>   	uint32_t i = 0;
>   
> @@ -2440,6 +2464,12 @@ static int criu_restore_bos(struct kfd_process *p,
>   	if (!bo_buckets)
>   		return -ENOMEM;
>   
> +	files = kvzalloc(args->num_bos * sizeof(struct file *), GFP_KERNEL);
> +	if (!files) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
>   	ret = copy_from_user(bo_buckets, (void __user *)args->bos,
>   			     args->num_bos * sizeof(*bo_buckets));
>   	if (ret) {
> @@ -2465,7 +2495,7 @@ static int criu_restore_bos(struct kfd_process *p,
>   
>   	/* Create and map new BOs */
>   	for (; i < args->num_bos; i++) {
> -		ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
> +		ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i], &files[i]);
>   		if (ret) {
>   			pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
>   			goto exit;
> @@ -2480,11 +2510,8 @@ static int criu_restore_bos(struct kfd_process *p,
>   		ret = -EFAULT;
>   
>   exit:
> -	while (ret && i--) {
> -		if (bo_buckets[i].alloc_flags
> -		   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> -			close_fd(bo_buckets[i].dmabuf_fd);
> -	}
> +	commit_files(files, bo_buckets, i, ret);
> +	kvfree(files);
>   	kvfree(bo_buckets);
>   	kvfree(bo_privs);
>   	return ret;

  reply	other threads:[~2024-06-04 18:16 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-04  2:12 [PATCHES][RFC][CFT] close_fd() misuses in amdgpu Al Viro
2024-06-04  2:13 ` [PATCH 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf() Al Viro
2024-06-04 18:08   ` Felix Kuehling
2024-06-04 19:10     ` Al Viro
2024-06-04 19:17       ` [PATCH v2 " Al Viro
2024-06-05  9:14     ` [PATCH " Christian König
2024-06-06 21:57       ` Felix Kuehling
2024-06-04  2:14 ` [PATCH 2/2][RFC] amdkfd CRIU fixes Al Viro
2024-06-04 18:16   ` Felix Kuehling [this message]
2024-06-04 19:20     ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1cd7980e-1cfa-4470-b712-48d9d2658435@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox