Re: [PATCH v4] drm/amdkfd: Handle errors from svm validate and map

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: James Zhu <jamesz@amd.com>
To: Philip Yang <Philip.Yang@amd.com>, amd-gfx@lists.freedesktop.org
Cc: alex.sierra@amd.com, Felix.Kuehling@amd.com, james.zhu@amd.com
Subject: Re: [PATCH v4] drm/amdkfd: Handle errors from svm validate and map
Date: Mon, 25 Sep 2023 09:39:25 -0400	[thread overview]
Message-ID: <f1f4748a-fd05-339e-01a4-d1eae0541207@amd.com> (raw)
In-Reply-To: <20230920154530.18588-1-Philip.Yang@amd.com>

[-- Attachment #1: Type: text/plain, Size: 6680 bytes --]

Tested-by: James Zhu <James.Zhu@amd.com> for this patch


James zhu
On 2023-09-20 11:45, Philip Yang wrote:
> If a new range is split into multiple pranges with max_svm_range_pages
> alignment and added to update_list, svm validate and map should keep
> going after an error to make sure the prange->mapped_to_gpu flag is up
> to date for the whole range.
>
> svm validate and map sets prange->mapped_to_gpu after mapping to GPUs
> successfully, and otherwise clears the prange->mapped_to_gpu flag (for
> the update mapping case) instead of setting an error flag, so we can
> remove the redundant error flag to simplify the code.
>
> Refactor to remove goto and update prange->mapped_to_gpu flag inside
> svm_range_lock, to guarantee we always evict queues or unmap from GPUs
> if there are invalid ranges.
>
> After svm validate and map returns error -EAGAIN, the caller's retry
> will update the mapping for the whole range again.
>
> Fixes: c22b04407097 ("drm/amdkfd: flag added to handle errors from svm validate and map")
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 80 +++++++++++++---------------
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
>   2 files changed, 38 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index fb55cf80d74e..0b6a70171320 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -827,7 +827,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
>   		}
>   	}
>   
> -	return !prange->is_error_flag;
> +	return true;
>   }
>   
>   /**
> @@ -1680,7 +1680,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>   
>   	start = prange->start << PAGE_SHIFT;
>   	end = (prange->last + 1) << PAGE_SHIFT;
> -	for (addr = start; addr < end && !r; ) {
> +	for (addr = start; !r && addr < end; ) {
>   		struct hmm_range *hmm_range;
>   		struct vm_area_struct *vma;
>   		unsigned long next;
> @@ -1689,62 +1689,57 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>   		bool readonly;
>   
>   		vma = vma_lookup(mm, addr);
> -		if (!vma) {
> +		if (vma) {
> +			readonly = !(vma->vm_flags & VM_WRITE);
> +
> +			next = min(vma->vm_end, end);
> +			npages = (next - addr) >> PAGE_SHIFT;
> +			WRITE_ONCE(p->svms.faulting_task, current);
> +			r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
> +						       readonly, owner, NULL,
> +						       &hmm_range);
> +			WRITE_ONCE(p->svms.faulting_task, NULL);
> +			if (r) {
> +				pr_debug("failed %d to get svm range pages\n", r);
> +				if (r == -EBUSY)
> +					r = -EAGAIN;
> +			}
> +		} else {
>   			r = -EFAULT;
> -			goto unreserve_out;
> -		}
> -		readonly = !(vma->vm_flags & VM_WRITE);
> -
> -		next = min(vma->vm_end, end);
> -		npages = (next - addr) >> PAGE_SHIFT;
> -		WRITE_ONCE(p->svms.faulting_task, current);
> -		r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
> -					       readonly, owner, NULL,
> -					       &hmm_range);
> -		WRITE_ONCE(p->svms.faulting_task, NULL);
> -		if (r) {
> -			pr_debug("failed %d to get svm range pages\n", r);
> -			if (r == -EBUSY)
> -				r = -EAGAIN;
> -			goto unreserve_out;
>   		}
>   
> -		offset = (addr - start) >> PAGE_SHIFT;
> -		r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
> -				      hmm_range->hmm_pfns);
> -		if (r) {
> -			pr_debug("failed %d to dma map range\n", r);
> -			goto unreserve_out;
> +		if (!r) {
> +			offset = (addr - start) >> PAGE_SHIFT;
> +			r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
> +					      hmm_range->hmm_pfns);
> +			if (r)
> +				pr_debug("failed %d to dma map range\n", r);
>   		}
>   
>   		svm_range_lock(prange);
> -		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
> +		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
>   			pr_debug("hmm update the range, need validate again\n");
>   			r = -EAGAIN;
> -			goto unlock_out;
>   		}
> -		if (!list_empty(&prange->child_list)) {
> +
> +		if (!r && !list_empty(&prange->child_list)) {
>   			pr_debug("range split by unmap in parallel, validate again\n");
>   			r = -EAGAIN;
> -			goto unlock_out;
>   		}
>   
> -		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> -					  ctx->bitmap, wait, flush_tlb);
> +		if (!r)
> +			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> +						  ctx->bitmap, wait, flush_tlb);
> +
> +		if (!r && next == end)
> +			prange->mapped_to_gpu = true;
>   
> -unlock_out:
>   		svm_range_unlock(prange);
>   
>   		addr = next;
>   	}
>   
> -	if (addr == end)
> -		prange->mapped_to_gpu = true;
> -
> -unreserve_out:
>   	svm_range_unreserve_bos(ctx);
> -
> -	prange->is_error_flag = !!r;
>   	if (!r)
>   		prange->validate_timestamp = ktime_get_boottime();
>   
> @@ -2113,7 +2108,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
>   		next = interval_tree_iter_next(node, start, last);
>   		next_start = min(node->last, last) + 1;
>   
> -		if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
> +		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
> +		    prange->mapped_to_gpu) {
>   			/* nothing to do */
>   		} else if (node->start < start || node->last > last) {
>   			/* node intersects the update range and its attributes
> @@ -3526,7 +3522,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>   	struct svm_range *next;
>   	bool update_mapping = false;
>   	bool flush_tlb;
> -	int r = 0;
> +	int r, ret = 0;
>   
>   	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
>   		 p->pasid, &p->svms, start, start + size - 1, size);
> @@ -3614,7 +3610,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>   out_unlock_range:
>   		mutex_unlock(&prange->migrate_mutex);
>   		if (r)
> -			break;
> +			ret = r;
>   	}
>   
>   	dynamic_svm_range_dump(svms);
> @@ -3627,7 +3623,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>   	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
>   		 &p->svms, start, start + size - 1, r);
>   
> -	return r;
> +	return ret ? ret : r;
>   }
>   
>   static int
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 5fd958a97a28..c528df1d0ba2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -133,7 +133,6 @@ struct svm_range {
>   	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
>   	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
>   	bool				mapped_to_gpu;
> -	bool				is_error_flag;
>   };
>   
>   static inline void svm_range_lock(struct svm_range *prange)

[-- Attachment #2: Type: text/html, Size: 7458 bytes --]

     prev parent reply	other threads:[~2023-09-25 13:39 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-13 15:16 [PATCH] drm/amdkfd: handle errors from svm validate and map Philip Yang
2023-09-13 16:14 ` Felix Kuehling
2023-09-13 17:24   ` Philip Yang
2023-09-13 17:33   ` Philip Yang
2023-09-13 18:27     ` Felix Kuehling
2023-09-15 13:28 ` [PATCH v2] " Philip Yang
2023-09-15 21:06   ` Chen, Xiaogang
2023-09-15 21:20     ` Philip Yang
2023-09-15 21:33       ` Chen, Xiaogang
2023-09-18 13:27         ` Philip Yang
2023-09-19 14:21 ` [PATCH v3] drm/amdkfd: Handle " Philip Yang
2023-09-19 21:15   ` Felix Kuehling
2023-09-20 14:20     ` Philip Yang
2023-09-20 14:35       ` Felix Kuehling
2023-09-20 15:38         ` Philip Yang
2023-09-20 15:45 ` [PATCH v4] " Philip Yang
2023-09-21 19:32   ` Felix Kuehling
2023-09-25 13:39   ` James Zhu [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f1f4748a-fd05-339e-01a4-d1eae0541207@amd.com \
    --to=jamesz@amd.com \
    --cc=Felix.Kuehling@amd.com \
    --cc=Philip.Yang@amd.com \
    --cc=alex.sierra@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=james.zhu@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox