Intel-XE Archive on lore.kernel.org
From: Matthew Auld <matthew.auld@intel.com>
To: Matthew Brost <matthew.brost@intel.com>, intel-xe@lists.freedesktop.org
Subject: Re: [PATCH v4 07/34] drm/xe: Track LR jobs in DRM scheduler pending list
Date: Thu, 2 Oct 2025 17:14:51 +0100
Message-ID: <1af77dfb-1d01-485f-81f3-a464e6ca4f33@intel.com>
In-Reply-To: <20251002055402.1865880-8-matthew.brost@intel.com>

On 02/10/2025 06:53, Matthew Brost wrote:
> VF migration requires jobs to remain pending so they can be replayed
> after the VF comes back. Previously, LR job fences were intentionally
> signaled immediately after submission to avoid the risk of exporting
> them, as these fences do not naturally signal in a timely manner and
> could break dma-fence contracts. A side effect of this approach was that
> LR jobs were never added to the DRM scheduler’s pending list, preventing
> them from being tracked for later resubmission.
> 
> We now avoid signaling LR job fences and ensure they are never exported;
> Xe already guards against exporting these internal fences. With that
> guarantee in place, we can safely track LR jobs in the scheduler’s
> pending list so they are eligible for resubmission during VF
> post-migration recovery (and similar recovery paths).
> 
> An added benefit is that LR queues now gain the DRM scheduler’s built-in
> flow control over ring usage rather than rejecting new jobs in the exec
> IOCTL if the ring is full.
> 
> v2:
>   - Ensure DRM scheduler TDR doesn't run for LR jobs
>   - Stack variable for killed_or_banned_or_wedged
> v4:
>   - Clarify commit message (Tomasz)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_exec.c       | 12 ++-------
>   drivers/gpu/drm/xe/xe_exec_queue.c | 19 -------------
>   drivers/gpu/drm/xe/xe_exec_queue.h |  2 --
>   drivers/gpu/drm/xe/xe_guc_submit.c | 43 ++++++++++++++++++++----------
>   4 files changed, 31 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 83897950f0da..0dc27476832b 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -124,7 +124,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	struct xe_validation_ctx ctx;
>   	struct xe_sched_job *job;
>   	struct xe_vm *vm;
> -	bool write_locked, skip_retry = false;
> +	bool write_locked;
>   	int err = 0;
>   	struct xe_hw_engine_group *group;
>   	enum xe_hw_engine_group_execution_mode mode, previous_mode;
> @@ -266,12 +266,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		goto err_exec;
>   	}
>   
> -	if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) {
> -		err = -EWOULDBLOCK;	/* Aliased to -EAGAIN */
> -		skip_retry = true;
> -		goto err_exec;
> -	}
> -
>   	if (xe_exec_queue_uses_pxp(q)) {
>   		err = xe_vm_validate_protected(q->vm);
>   		if (err)
> @@ -328,8 +322,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		xe_sched_job_init_user_fence(job, &syncs[i]);
>   	}
>   
> -	if (xe_exec_queue_is_lr(q))
> -		q->ring_ops->emit_job(job);
>   	if (!xe_vm_in_lr_mode(vm))
>   		xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished);
>   	xe_sched_job_push(job);
> @@ -355,7 +347,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   		xe_validation_ctx_fini(&ctx);
>   err_unlock_list:
>   	up_read(&vm->lock);
> -	if (err == -EAGAIN && !skip_retry)
> +	if (err == -EAGAIN)
>   		goto retry;
>   err_hw_exec_mode:
>   	if (mode == EXEC_MODE_DMA_FENCE)
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 6bfaca424ca3..81f707d2c388 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -824,25 +824,6 @@ bool xe_exec_queue_is_lr(struct xe_exec_queue *q)
>   		!(q->flags & EXEC_QUEUE_FLAG_VM);
>   }
>   
> -static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q)
> -{
> -	return q->lrc[0]->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc[0]) - 1;
> -}
> -
> -/**
> - * xe_exec_queue_ring_full() - Whether an exec_queue's ring is full
> - * @q: The exec_queue
> - *
> - * Return: True if the exec_queue's ring is full, false otherwise.
> - */
> -bool xe_exec_queue_ring_full(struct xe_exec_queue *q)
> -{
> -	struct xe_lrc *lrc = q->lrc[0];
> -	s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES;
> -
> -	return xe_exec_queue_num_job_inflight(q) >= max_job;
> -}
> -
>   /**
>    * xe_exec_queue_is_idle() - Whether an exec_queue is idle.
>    * @q: The exec_queue
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 8821ceb838d0..a4dfbe858bda 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -64,8 +64,6 @@ static inline bool xe_exec_queue_uses_pxp(struct xe_exec_queue *q)
>   
>   bool xe_exec_queue_is_lr(struct xe_exec_queue *q);
>   
> -bool xe_exec_queue_ring_full(struct xe_exec_queue *q);
> -
>   bool xe_exec_queue_is_idle(struct xe_exec_queue *q);
>   
>   void xe_exec_queue_kill(struct xe_exec_queue *q);
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 13746f32b231..3a534d93505f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -851,30 +851,31 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
>   	struct xe_sched_job *job = to_xe_sched_job(drm_job);
>   	struct xe_exec_queue *q = job->q;
>   	struct xe_guc *guc = exec_queue_to_guc(q);
> -	struct dma_fence *fence = NULL;
> -	bool lr = xe_exec_queue_is_lr(q);
> +	bool lr = xe_exec_queue_is_lr(q), killed_or_banned_or_wedged =
> +		exec_queue_killed_or_banned_or_wedged(q);
>   
>   	xe_gt_assert(guc_to_gt(guc), !(exec_queue_destroyed(q) || exec_queue_pending_disable(q)) ||
>   		     exec_queue_banned(q) || exec_queue_suspended(q));
>   
>   	trace_xe_sched_job_run(job);
>   
> -	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
> +	if (!killed_or_banned_or_wedged && !xe_sched_job_is_error(job)) {
>   		if (!exec_queue_registered(q))
>   			register_exec_queue(q, GUC_CONTEXT_NORMAL);
> -		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
> -			q->ring_ops->emit_job(job);
> +		q->ring_ops->emit_job(job);
>   		submit_exec_queue(q);
>   	}
>   
> -	if (lr) {
> -		xe_sched_job_set_error(job, -EOPNOTSUPP);
> -		dma_fence_put(job->fence);	/* Drop ref from xe_sched_job_arm */
> -	} else {
> -		fence = job->fence;
> -	}
> +	/*
> +	 * We don't care about job-fence ordering in LR VMs because these fences
> +	 * are never exported; they are used solely to keep jobs on the pending
> +	 * list. Once a queue enters an error state, there's no need to track
> +	 * them.
> +	 */
> +	if (killed_or_banned_or_wedged && lr)
> +		xe_sched_job_set_error(job, -ECANCELED);
>   
> -	return fence;
> +	return job->fence;
>   }
>   
>   static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
> @@ -916,7 +917,8 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>   		xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n");
>   		xe_sched_submission_start(sched);
>   		xe_gt_reset_async(q->gt);
> -		xe_sched_tdr_queue_imm(sched);
> +		if (!xe_exec_queue_is_lr(q))
> +			xe_sched_tdr_queue_imm(sched);
>   		return;
>   	}
>   
> @@ -1008,6 +1010,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>   	struct xe_exec_queue *q = ge->q;
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   	struct xe_gpu_scheduler *sched = &ge->sched;
> +	struct xe_sched_job *job;
>   	bool wedged = false;
>   
>   	xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
> @@ -1058,7 +1061,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>   	if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
>   		xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
>   
> +	xe_hw_fence_irq_stop(q->fence_irq);
> +
>   	xe_sched_submission_start(sched);
> +
> +	spin_lock(&sched->base.job_list_lock);
> +	list_for_each_entry(job, &sched->base.pending_list, drm.list)
> +		xe_sched_job_set_error(job, -ECANCELED);
> +	spin_unlock(&sched->base.job_list_lock);
> +
> +	xe_hw_fence_irq_start(q->fence_irq);
>   }
>   
>   #define ADJUST_FIVE_PERCENT(__t)	mul_u64_u32_div(__t, 105, 100)
> @@ -1129,7 +1141,8 @@ static void enable_scheduling(struct xe_exec_queue *q)
>   		xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond");
>   		set_exec_queue_banned(q);
>   		xe_gt_reset_async(q->gt);
> -		xe_sched_tdr_queue_imm(&q->guc->sched);
> +		if (!xe_exec_queue_is_lr(q))
> +			xe_sched_tdr_queue_imm(&q->guc->sched);
>   	}
>   }
>   
> @@ -1187,6 +1200,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	int i = 0;
>   	bool wedged = false, skip_timeout_check;
>   
> +	xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));

Just some questions around guc_exec_queue_stop/start(). In queue_stop 
there is:

struct xe_sched_job *job = xe_sched_first_pending_job(sched);
bool ban = false;

if (job) {
	if ((xe_sched_job_started(job) &&
	     !xe_sched_job_completed(job)) ||
	    xe_sched_invalidate_job(job, 2)) {
		trace_xe_sched_job_ban(job);
		ban = true;
	}
} else if (xe_exec_queue_is_lr(q) &&
	   !xe_lrc_ring_is_idle(q->lrc[0])) {
	ban = true;
}

Do we still need this else-if branch, since the job path is now also taken 
for LR queues?

Also I guess first_pending_job() strikes again? If the first pending job has 
started and completed, but something else in the pending_list is still in 
flight, do those jobs get away clean? Not sure what happens if you skip the 
ban and then get as far as resubmit.
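
If we do want to consider more than the first job here, maybe something along 
the lines of the below? Just a rough sketch on my side, completely untested, 
reusing the helpers already in this function; I also haven't checked whether 
xe_sched_invalidate_job() is safe to call under job_list_lock:

/*
 * Rough idea: walk the whole pending_list instead of only the first pending
 * job, so jobs still in flight behind an already completed one are also
 * considered for the ban.
 */
struct xe_sched_job *job;
bool ban = false;

spin_lock(&sched->base.job_list_lock);
list_for_each_entry(job, &sched->base.pending_list, drm.list) {
	if ((xe_sched_job_started(job) &&
	     !xe_sched_job_completed(job)) ||
	    xe_sched_invalidate_job(job, 2)) {
		trace_xe_sched_job_ban(job);
		ban = true;
		break;
	}
}
spin_unlock(&sched->base.job_list_lock);

/*
 * Kept the LR ring check as a fallback, although per the question above it
 * may no longer be needed.
 */
if (!ban && xe_exec_queue_is_lr(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
	ban = true;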

> +
>   	/*
>   	 * TDR has fired before free job worker. Common if exec queue
>   	 * immediately closed after last fence signaled. Add back to pending

