Re: [PATCH v8 09/11] drm/xe/exec: Switch hw engine group execution mode upon job submission

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Matthew Brost <matthew.brost@intel.com>
To: Francois Dugast <francois.dugast@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH v8 09/11] drm/xe/exec: Switch hw engine group execution mode upon job submission
Date: Fri, 9 Aug 2024 03:57:01 +0000	[thread overview]
Message-ID: <ZrWTjR2FifBvpgz4@DUT025-TGLU.fm.intel.com> (raw)
In-Reply-To: <20240808184220.1715625-10-francois.dugast@intel.com>

On Thu, Aug 08, 2024 at 08:40:27PM +0200, Francois Dugast wrote:
> If the job about to be submitted is a dma-fence job, update the current
> execution mode of the hw engine group. This triggers an immediate suspend
> of the exec queues running faulting long-running jobs.
> 
> If the job about to be submitted is a long-running job, kick a new worker
> used to resume the exec queues running faulting long-running jobs once
> the dma-fence jobs have completed.
> 
> v2: Kick the resume worker from exec IOCTL, switch to unordered workqueue,
>     destroy it after use (Matt Brost)
> 
> v3: Do not resume if no exec queue was suspended (Matt Brost)
> 
> v4: Squash commits (Matt Brost)
> 
> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_exec.c            | 20 +++++++-
>  drivers/gpu/drm/xe/xe_hw_engine_group.c | 62 ++++++++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_hw_engine_group.h |  4 ++
>  3 files changed, 84 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index f36980aa26e6..484acfbe0e61 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -14,6 +14,7 @@
>  #include "xe_bo.h"
>  #include "xe_device.h"
>  #include "xe_exec_queue.h"
> +#include "xe_hw_engine_group.h"
>  #include "xe_macros.h"
>  #include "xe_ring_ops_types.h"
>  #include "xe_sched_job.h"
> @@ -124,6 +125,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	bool write_locked, skip_retry = false;
>  	ktime_t end = 0;
>  	int err = 0;
> +	struct xe_hw_engine_group *group;
> +	enum xe_hw_engine_group_execution_mode mode, previous_mode;
>  
>  	if (XE_IOCTL_DBG(xe, args->extensions) ||
>  	    XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
> @@ -182,6 +185,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		}
>  	}
>  
> +	group = q->hwe->hw_engine_group;
> +	mode = xe_hw_engine_group_find_exec_mode(q);
> +
> +	if (mode == EXEC_MODE_DMA_FENCE) {
> +		err = xe_hw_engine_group_get_mode(group, mode, &previous_mode);
> +		if (err)
> +			goto err_syncs;
> +	}
> +
>  retry:
>  	if (!xe_vm_in_lr_mode(vm) && xe_vm_userptr_check_repin(vm)) {
>  		err = down_write_killable(&vm->lock);
> @@ -199,7 +211,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		downgrade_write(&vm->lock);
>  		write_locked = false;
>  		if (err)
> -			goto err_unlock_list;
> +			goto err_hw_exec_mode;
>  	}
>  
>  	if (!args->num_batch_buffer) {
> @@ -312,6 +324,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  		spin_unlock(&xe->ttm.lru_lock);
>  	}
>  
> +	if (mode == EXEC_MODE_LR)
> +		xe_hw_engine_group_resume_faulting_lr_jobs(group);
> +
>  err_repin:
>  	if (!xe_vm_in_lr_mode(vm))
>  		up_read(&vm->userptr.notifier_lock);
> @@ -324,6 +339,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	up_read(&vm->lock);
>  	if (err == -EAGAIN && !skip_retry)
>  		goto retry;
> +err_hw_exec_mode:
> +	if (mode == EXEC_MODE_DMA_FENCE)
> +		xe_hw_engine_group_put(group);
>  err_syncs:
>  	while (num_syncs--)
>  		xe_sync_entry_cleanup(&syncs[num_syncs]);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_group.c b/drivers/gpu/drm/xe/xe_hw_engine_group.c
> index e6c235119351..f966b8207c0c 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_group.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_group.c
> @@ -17,9 +17,36 @@ hw_engine_group_free(struct drm_device *drm, void *arg)
>  {
>  	struct xe_hw_engine_group *group = arg;
>  
> +	destroy_workqueue(group->resume_wq);
>  	kfree(group);
>  }
>  
> +static void
> +hw_engine_group_resume_lr_jobs_func(struct work_struct *w)
> +{
> +	struct xe_exec_queue *q;
> +	struct xe_hw_engine_group *group = container_of(w, struct xe_hw_engine_group, resume_work);
> +	int err;
> +	enum xe_hw_engine_group_execution_mode previous_mode;
> +
> +	err = xe_hw_engine_group_get_mode(group, EXEC_MODE_LR, &previous_mode);
> +	if (err)
> +		return;
> +
> +	if (previous_mode == EXEC_MODE_LR)
> +		goto put;
> +
> +	list_for_each_entry(q, &group->exec_queue_list, hw_engine_group_link) {
> +		if (!xe_vm_in_fault_mode(q->vm))
> +			continue;
> +
> +		q->ops->resume(q);
> +	}
> +
> +put:
> +	xe_hw_engine_group_put(group);
> +}
> +
>  static struct xe_hw_engine_group *
>  hw_engine_group_alloc(struct xe_device *xe)
>  {
> @@ -30,7 +57,12 @@ hw_engine_group_alloc(struct xe_device *xe)
>  	if (!group)
>  		return ERR_PTR(-ENOMEM);
>  
> +	group->resume_wq = alloc_workqueue("xe-resume-lr-jobs-wq", 0, 0);
> +	if (!group->resume_wq)
> +		return ERR_PTR(-ENOMEM);
> +
>  	init_rwsem(&group->mode_sem);
> +	INIT_WORK(&group->resume_work, hw_engine_group_resume_lr_jobs_func);
>  	INIT_LIST_HEAD(&group->exec_queue_list);
>  
>  	err = drmm_add_action_or_reset(&xe->drm, hw_engine_group_free, group);
> @@ -134,7 +166,7 @@ int xe_hw_engine_group_add_exec_queue(struct xe_hw_engine_group *group, struct x
>  		if (err)
>  			goto err_suspend;
>  
> -		queue_work(group->resume_wq, &group->resume_work);
> +		xe_hw_engine_group_resume_faulting_lr_jobs(group);
>  	}
>  
>  	list_add(&q->hw_engine_group_link, &group->exec_queue_list);
> @@ -167,6 +199,16 @@ void xe_hw_engine_group_del_exec_queue(struct xe_hw_engine_group *group, struct
>  	up_write(&group->mode_sem);
>  }
>  
> +/**
> + * xe_hw_engine_group_resume_faulting_lr_jobs() - Asynchronously resume the hw engine group's
> + * faulting LR jobs
> + * @group: The hw engine group
> + */
> +void xe_hw_engine_group_resume_faulting_lr_jobs(struct xe_hw_engine_group *group)
> +{
> +	queue_work(group->resume_wq, &group->resume_work);
> +}
> +
>  /**
>   * xe_hw_engine_group_suspend_faulting_lr_jobs() - Suspend the faulting LR jobs of this group
>   * @group: The hw engine group
> @@ -177,6 +219,7 @@ static int xe_hw_engine_group_suspend_faulting_lr_jobs(struct xe_hw_engine_group
>  {
>  	int err;
>  	struct xe_exec_queue *q;
> +	bool need_resume = false;
>  
>  	lockdep_assert_held_write(&group->mode_sem);
>  
> @@ -184,6 +227,7 @@ static int xe_hw_engine_group_suspend_faulting_lr_jobs(struct xe_hw_engine_group
>  		if (!xe_vm_in_fault_mode(q->vm))
>  			continue;
>  
> +		need_resume = true;
>  		q->ops->suspend(q);
>  	}
>  
> @@ -196,6 +240,9 @@ static int xe_hw_engine_group_suspend_faulting_lr_jobs(struct xe_hw_engine_group
>  			goto err_suspend;
>  	}
>  
> +	if (need_resume)
> +		xe_hw_engine_group_resume_faulting_lr_jobs(group);
> +
>  	return 0;
>  
>  err_suspend:
> @@ -310,3 +357,16 @@ __releases(&group->mode_sem)
>  {
>  	up_read(&group->mode_sem);
>  }
> +
> +/**
> + * xe_hw_engine_group_find_exec_mode() - Find the execution mode for this exec queue
> + * @q: The exec_queue
> + */
> +enum xe_hw_engine_group_execution_mode
> +xe_hw_engine_group_find_exec_mode(struct xe_exec_queue *q)
> +{
> +	if (xe_vm_in_lr_mode(q->vm))

s/xe_vm_in_lr_mode/xe_vm_in_fault_mode

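Without this, VMs in preempt-fence mode (xe_vm_in_preempt_fence_mode) will also kick the resume worker, even though the worker only resumes exec queues whose VM is in fault mode.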

Matt

> +		return EXEC_MODE_LR;
> +	else
> +		return EXEC_MODE_DMA_FENCE;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_group.h b/drivers/gpu/drm/xe/xe_hw_engine_group.h
> index e0deb7c7bb5b..797ee81acbf2 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_group.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_group.h
> @@ -22,4 +22,8 @@ int xe_hw_engine_group_get_mode(struct xe_hw_engine_group *group,
>  				enum xe_hw_engine_group_execution_mode *previous_mode);
>  void xe_hw_engine_group_put(struct xe_hw_engine_group *group);
>  
> +enum xe_hw_engine_group_execution_mode
> +xe_hw_engine_group_find_exec_mode(struct xe_exec_queue *q);
> +void xe_hw_engine_group_resume_faulting_lr_jobs(struct xe_hw_engine_group *group);
> +
>  #endif
> -- 
> 2.43.0
>

next prev parent reply	other threads:[~2024-08-09  3:58 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-08 18:40 [PATCH v8 00/11] Parallel submission of dma fence jobs and LR jobs with shared hardware resources Francois Dugast
2024-08-08 18:40 ` [PATCH v8 01/11] drm/xe/hw_engine_group: Introduce xe_hw_engine_group Francois Dugast
2024-08-08 18:40 ` [PATCH v8 02/11] drm/xe/guc_submit: Make suspend_wait interruptible Francois Dugast
2024-08-08 23:51   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 03/11] drm/xe/hw_engine_group: Register hw engine group's exec queues Francois Dugast
2024-08-09  3:36   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 04/11] drm/xe/hw_engine_group: Add helper to suspend faulting LR jobs Francois Dugast
2024-08-09  3:19   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 05/11] drm/xe/exec_queue: Remove duplicated code Francois Dugast
2024-08-08 18:40 ` [PATCH v8 06/11] drm/xe/exec_queue: Prepare last fence for hw engine group resume context Francois Dugast
2024-08-09  3:37   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 07/11] drm/xe/hw_engine_group: Add helper to wait for dma fence jobs Francois Dugast
2024-08-09  3:39   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 08/11] drm/xe/hw_engine_group: Ensure safe transition between execution modes Francois Dugast
2024-08-09  3:40   ` Matthew Brost
2024-08-08 18:40 ` [PATCH v8 09/11] drm/xe/exec: Switch hw engine group execution mode upon job submission Francois Dugast
2024-08-09  3:57   ` Matthew Brost [this message]
2024-08-08 18:40 ` [PATCH v8 10/11] drm/xe/vm: Remove restriction that all VMs must be faulting if one is Francois Dugast
2024-08-08 18:40 ` [PATCH v8 11/11] drm/xe/device: Remove unused xe_device::usm::num_vm_in_* Francois Dugast
2024-08-08 20:57 ` ✓ CI.Patch_applied: success for Parallel submission of dma fence jobs and LR jobs with shared hardware resources (rev8) Patchwork
2024-08-08 20:58 ` ✗ CI.checkpatch: warning " Patchwork
2024-08-08 20:59 ` ✓ CI.KUnit: success " Patchwork
2024-08-08 21:11 ` ✓ CI.Build: " Patchwork
2024-08-08 21:13 ` ✓ CI.Hooks: " Patchwork
2024-08-08 21:14 ` ✓ CI.checksparse: " Patchwork
2024-08-08 21:35 ` ✗ CI.BAT: failure " Patchwork
2024-08-09  1:29 ` ✗ CI.FULL: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZrWTjR2FifBvpgz4@DUT025-TGLU.fm.intel.com \
    --to=matthew.brost@intel.com \
    --cc=francois.dugast@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.