All of lore.kernel.org
 help / color / mirror / Atom feed
From: Felix Kuehling <felix.kuehling@amd.com>
To: Philip Yang <Philip.Yang@amd.com>, amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v5 5/11] drm/amdkfd: Add user queue eviction restore SMI event
Date: Thu, 30 Jun 2022 10:36:08 -0400	[thread overview]
Message-ID: <26efbe54-e5f9-cf94-ab6b-e4fe5a261eef@amd.com> (raw)
In-Reply-To: <20220628145020.26925-6-Philip.Yang@amd.com>

Am 2022-06-28 um 10:50 schrieb Philip Yang:
> Output user queue eviction and restore event. User queue eviction may be
> triggered by svm or userptr MMU notifier, TTM eviction, device suspend
> and CRIU checkpoint and restore.
>
> User queue restore may be rescheduled if eviction happens again while
> restore.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 12 ++++---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 15 ++++++--
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 35 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  4 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  6 ++--
>   9 files changed, 69 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b25b41f50213..73bf8b5f2aa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
>   }
>   #endif
>   /* KGD2KFD callbacks */
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm);
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
>   int kgd2kfd_resume_mm(struct mm_struct *mm);
>   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>   						struct dma_fence *fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 5ba9070d8722..6a7e045ddcc5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -32,6 +32,7 @@
>   #include "amdgpu_dma_buf.h"
>   #include <uapi/linux/kfd_ioctl.h>
>   #include "amdgpu_xgmi.h"
> +#include "kfd_smi_events.h"
>   
>   /* Userptr restore delay, just long enough to allow consecutive VM
>    * changes to accumulate
> @@ -2381,7 +2382,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
>   	evicted_bos = atomic_inc_return(&process_info->evicted_bos);
>   	if (evicted_bos == 1) {
>   		/* First eviction, stop the queues */
> -		r = kgd2kfd_quiesce_mm(mm);
> +		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
>   		if (r)
>   			pr_err("Failed to quiesce KFD\n");
>   		schedule_delayed_work(&process_info->restore_userptr_work,
> @@ -2655,13 +2656,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>   
>   unlock_out:
>   	mutex_unlock(&process_info->lock);
> -	mmput(mm);
> -	put_task_struct(usertask);
>   
>   	/* If validation failed, reschedule another attempt */
> -	if (evicted_bos)
> +	if (evicted_bos) {
>   		schedule_delayed_work(&process_info->restore_userptr_work,
>   			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
> +
> +		kfd_smi_event_queue_restore_rescheduled(mm);
> +	}
> +	mmput(mm);
> +	put_task_struct(usertask);
>   }
>   
>   /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index a0246b4bae6b..6abfe10229a2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2428,7 +2428,7 @@ static int criu_restore(struct file *filep,
>   	 * Set the process to evicted state to avoid running any new queues before all the memory
>   	 * mappings are ready.
>   	 */
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
>   	if (ret)
>   		goto exit_unlock;
>   
> @@ -2547,7 +2547,7 @@ static int criu_process_info(struct file *filep,
>   		goto err_unlock;
>   	}
>   
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
>   	if (ret)
>   		goto err_unlock;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c8fee0dbfdcb..6ec0e9f0927d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>   	spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
>   }
>   
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
>   {
>   	struct kfd_process *p;
>   	int r;
> @@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>   		return -ESRCH;
>   
>   	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> -	r = kfd_process_evict_queues(p);
> +	r = kfd_process_evict_queues(p, trigger);
>   
>   	kfd_unref_process(p);
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 59ba50ce54d3..b9e7e9c52853 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -946,7 +946,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
>   }
>   
>   void kfd_unref_process(struct kfd_process *p);
> -int kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
>   int kfd_process_restore_queues(struct kfd_process *p);
>   void kfd_suspend_all_processes(void);
>   int kfd_resume_all_processes(void);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a13e60d48b73..fc38a4d81420 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -43,6 +43,7 @@ struct mm_struct;
>   #include "kfd_device_queue_manager.h"
>   #include "kfd_iommu.h"
>   #include "kfd_svm.h"
> +#include "kfd_smi_events.h"
>   
>   /*
>    * List of struct kfd_process (field kfd_process).
> @@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
>    * Eviction is reference-counted per process-device. This means multiple
>    * evictions from different sources can be nested safely.
>    */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
>   {
>   	int r = 0;
>   	int i;
> @@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
> +					     trigger);
> +
>   		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
>   							    &pdd->qpd);
>   		/* evict return -EIO if HWS is hang or asic is resetting, in this case
> @@ -1769,6 +1773,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
>   
>   		if (n_evicted == 0)
>   			break;
> +
> +		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
>   		if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
>   							      &pdd->qpd))
>   			pr_err("Failed to restore queues\n");
> @@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
>   		r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
>   							      &pdd->qpd);
>   		if (r) {
> @@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
>   	flush_delayed_work(&p->restore_work);
>   
>   	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>   	if (!ret) {
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> @@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
>   		cancel_delayed_work_sync(&p->eviction_work);
>   		cancel_delayed_work_sync(&p->restore_work);
>   
> -		if (kfd_process_evict_queues(p))
> +		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ec4d278c2a47..3917c38204d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
>   			  from, to, trigger);
>   }
>   
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> +				  uint32_t trigger)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
> +			  "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> +			  dev->id, trigger);
> +}
> +
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
> +			  "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> +			  dev->id);
> +}
> +
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> +{
> +	struct kfd_process *p;
> +	int i;
> +
> +	p = kfd_lookup_process_by_mm(mm);
> +	if (!p)
> +		return;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
> +				  KFD_SMI_EVENT_QUEUE_RESTORE,
> +				  "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> +				  p->lead_thread->pid, pdd->dev->id, 'R');
> +	}
> +	kfd_unref_process(p);
> +}
> +
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
>   {
>   	struct kfd_smi_client *client;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index ec5d74a2fef4..b23292637239 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
>   void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
>   			     unsigned long start, unsigned long end,
>   			     uint32_t from, uint32_t to, uint32_t trigger);
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> +				  uint32_t trigger);
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 5cead2a0e819..ddc1e4651919 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1730,14 +1730,16 @@ static void svm_range_restore_work(struct work_struct *work)
>   	mutex_unlock(&svms->lock);
>   	mmap_write_unlock(mm);
>   	mutex_unlock(&process_info->lock);
> -	mmput(mm);
>   
>   	/* If validation failed, reschedule another attempt */
>   	if (evicted_ranges) {
>   		pr_debug("reschedule to restore svm range\n");
>   		schedule_delayed_work(&svms->restore_work,
>   			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
> +
> +		kfd_smi_event_queue_restore_rescheduled(mm);
>   	}
> +	mmput(mm);
>   }
>   
>   /**
> @@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
>   			 prange->svms, prange->start, prange->last);
>   
>   		/* First eviction, stop the queues */
> -		r = kgd2kfd_quiesce_mm(mm);
> +		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
>   		if (r)
>   			pr_debug("failed to quiesce KFD\n");
>   

  reply	other threads:[~2022-06-30 14:36 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-28 14:50 [PATCH v5 0/11] HMM profiler interface Philip Yang
2022-06-28 14:50 ` [PATCH v5 1/11] drm/amdkfd: Add KFD SMI event IDs and triggers Philip Yang
2022-06-30 14:46   ` Felix Kuehling
2022-06-28 14:50 ` [PATCH v5 2/11] drm/amdkfd: Enable per process SMI event Philip Yang
2022-06-28 14:50 ` [PATCH v5 3/11] drm/amdkfd: Add GPU recoverable fault " Philip Yang
2022-06-30 14:19   ` Felix Kuehling
2022-06-30 14:53     ` philip yang
2022-06-28 14:50 ` [PATCH v5 4/11] drm/amdkfd: Add migration " Philip Yang
2022-06-30 14:29   ` Felix Kuehling
2022-06-28 14:50 ` [PATCH v5 5/11] drm/amdkfd: Add user queue eviction restore " Philip Yang
2022-06-30 14:36   ` Felix Kuehling [this message]
2022-06-28 14:50 ` [PATCH v5 6/11] drm/amdkfd: Add unmap from GPU " Philip Yang
2022-06-30 14:39   ` Felix Kuehling
2022-06-28 14:50 ` [PATCH v5 7/11] drm/amdkfd: Asynchronously free smi_client Philip Yang
2022-06-30 14:45   ` Felix Kuehling
2022-06-28 14:50 ` [PATCH v5 8/11] drm/amdkfd: Bump KFD API version for SMI profiling event Philip Yang
2022-06-30 14:45   ` Felix Kuehling
2022-06-28 14:50 ` [PATCH 9/11] libhsakmt: hsaKmtGetNodeProperties add gpu_id Philip Yang
2022-06-28 14:50 ` [PATCH 10/11] libhsakmt: add open SMI event handle Philip Yang
2022-06-28 14:50 ` [PATCH 11/11] ROCR-Runtime Basic SVM profiler Philip Yang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=26efbe54-e5f9-cf94-ab6b-e4fe5a261eef@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=Philip.Yang@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.