Re: [PATCH 1/2] drm/xe: Fix error handling if PXP fails to start

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: John Harrison <john.c.harrison@intel.com>
To: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>,
	<intel-xe@lists.freedesktop.org>
Cc: Matthew Brost <matthew.brost@intel.com>
Subject: Re: [PATCH 1/2] drm/xe: Fix error handling if PXP fails to start
Date: Thu, 4 Sep 2025 13:45:38 -0700	[thread overview]
Message-ID: <fe1b455b-9917-4dec-8a16-94daa922263f@intel.com> (raw)
In-Reply-To: <20250818234639.2965656-3-daniele.ceraolospurio@intel.com>

On 8/18/2025 4:46 PM, Daniele Ceraolo Spurio wrote:
> Since the PXP start comes after __xe_exec_queue_init() has completed,
> we need to cleanup what was done in that function in case of a PXP
> start error.
> __xe_exec_queue_init calls the submission backend init() function,
> so we need to introduce an opposite for that. Unfortunately, while
> we already have a fini() function pointer, it is does perform other
Typo: "it is does" -- presumably "it does"?

> operations in addition to cleaning up what was done by the init().
> Therefore, for clarity, the existing fini() has been renamed to
> destroy(), while a new fini() has been added to only clean up what was
> done by the init(), with the latter being called by the former (via
> xe_exec_queue_fini).
It would be much easier to follow the changes if the rename were split
into a prep patch, so that the behaviour change patch contains only the
behaviour change.

John.

>
> Fixes: 72d479601d67 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues")
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: John Harrison <John.C.Harrison@Intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_exec_queue.c           | 24 ++++++---
>   drivers/gpu/drm/xe/xe_exec_queue_types.h     |  8 ++-
>   drivers/gpu/drm/xe/xe_execlist.c             | 25 ++++++----
>   drivers/gpu/drm/xe/xe_execlist_types.h       |  2 +-
>   drivers/gpu/drm/xe/xe_guc_exec_queue_types.h |  4 +-
>   drivers/gpu/drm/xe/xe_guc_submit.c           | 52 ++++++++++++--------
>   6 files changed, 74 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 2d10a53f701d..bce507c49517 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -199,6 +199,18 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q)
>   	return err;
>   }
>   
> +static void __xe_exec_queue_fini(struct xe_exec_queue *q)
> +{
> +	int i;
> +
> +	q->ops->fini(q);
> +
> +	for (i = 0; i < q->width; ++i)
> +		xe_lrc_put(q->lrc[i]);
> +
> +	return;
> +}
> +
>   struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
>   					   u32 logical_mask, u16 width,
>   					   struct xe_hw_engine *hwe, u32 flags,
> @@ -229,11 +241,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v
>   	if (xe_exec_queue_uses_pxp(q)) {
>   		err = xe_pxp_exec_queue_add(xe->pxp, q);
>   		if (err)
> -			goto err_post_alloc;
> +			goto err_post_init;
>   	}
>   
>   	return q;
>   
> +err_post_init:
> +	__xe_exec_queue_fini(q);
>   err_post_alloc:
>   	__xe_exec_queue_free(q);
>   	return ERR_PTR(err);
> @@ -331,13 +345,11 @@ void xe_exec_queue_destroy(struct kref *ref)
>   			xe_exec_queue_put(eq);
>   	}
>   
> -	q->ops->fini(q);
> +	q->ops->destroy(q);
>   }
>   
>   void xe_exec_queue_fini(struct xe_exec_queue *q)
>   {
> -	int i;
> -
>   	/*
>   	 * Before releasing our ref to lrc and xef, accumulate our run ticks
>   	 * and wakeup any waiters.
> @@ -346,9 +358,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
>   	if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal))
>   		wake_up_var(&q->xef->exec_queue.pending_removal);
>   
> -	for (i = 0; i < q->width; ++i)
> -		xe_lrc_put(q->lrc[i]);
> -
> +	__xe_exec_queue_fini(q);
>   	__xe_exec_queue_free(q);
>   }
>   
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index ba443a497b38..27b76cf9da89 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -181,8 +181,14 @@ struct xe_exec_queue_ops {
>   	int (*init)(struct xe_exec_queue *q);
>   	/** @kill: Kill inflight submissions for backend */
>   	void (*kill)(struct xe_exec_queue *q);
> -	/** @fini: Fini exec queue for submission backend */
> +	/** @fini: Undoes the init() for submission backend */
>   	void (*fini)(struct xe_exec_queue *q);
> +	/**
> +	 * @destroy: Destroy exec queue for submission backend. The backend
> +	 * function must call xe_exec_queue_fini() (which will in turn call the
> +	 * fini() backend function) to ensure the queue is properly cleaned up.
> +	 */
> +	void (*destroy)(struct xe_exec_queue *q);
>   	/** @set_priority: Set priority for exec queue */
>   	int (*set_priority)(struct xe_exec_queue *q,
>   			    enum xe_exec_queue_priority priority);
> diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
> index 788f56b066b6..f83d421ac9d3 100644
> --- a/drivers/gpu/drm/xe/xe_execlist.c
> +++ b/drivers/gpu/drm/xe/xe_execlist.c
> @@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q)
>   	return err;
>   }
>   
> -static void execlist_exec_queue_fini_async(struct work_struct *w)
> +static void execlist_exec_queue_fini(struct xe_exec_queue *q)
> +{
> +	struct xe_execlist_exec_queue *exl = q->execlist;
> +
> +	drm_sched_entity_fini(&exl->entity);
> +	drm_sched_fini(&exl->sched);
> +
> +	kfree(exl);
> +}
> +
> +static void execlist_exec_queue_destroy_async(struct work_struct *w)
>   {
>   	struct xe_execlist_exec_queue *ee =
> -		container_of(w, struct xe_execlist_exec_queue, fini_async);
> +		container_of(w, struct xe_execlist_exec_queue, destroy_async);
>   	struct xe_exec_queue *q = ee->q;
>   	struct xe_execlist_exec_queue *exl = q->execlist;
>   	struct xe_device *xe = gt_to_xe(q->gt);
> @@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w)
>   		list_del(&exl->active_link);
>   	spin_unlock_irqrestore(&exl->port->lock, flags);
>   
> -	drm_sched_entity_fini(&exl->entity);
> -	drm_sched_fini(&exl->sched);
> -	kfree(exl);
> -
>   	xe_exec_queue_fini(q);
>   }
>   
> @@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q)
>   	/* NIY */
>   }
>   
> -static void execlist_exec_queue_fini(struct xe_exec_queue *q)
> +static void execlist_exec_queue_destroy(struct xe_exec_queue *q)
>   {
> -	INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async);
> -	queue_work(system_unbound_wq, &q->execlist->fini_async);
> +	INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async);
> +	queue_work(system_unbound_wq, &q->execlist->destroy_async);
>   }
>   
>   static int execlist_exec_queue_set_priority(struct xe_exec_queue *q,
> @@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
>   	.init = execlist_exec_queue_init,
>   	.kill = execlist_exec_queue_kill,
>   	.fini = execlist_exec_queue_fini,
> +	.destroy = execlist_exec_queue_destroy,
>   	.set_priority = execlist_exec_queue_set_priority,
>   	.set_timeslice = execlist_exec_queue_set_timeslice,
>   	.set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,
> diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h
> index 415140936f11..92c4ba52db0c 100644
> --- a/drivers/gpu/drm/xe/xe_execlist_types.h
> +++ b/drivers/gpu/drm/xe/xe_execlist_types.h
> @@ -42,7 +42,7 @@ struct xe_execlist_exec_queue {
>   
>   	bool has_run;
>   
> -	struct work_struct fini_async;
> +	struct work_struct destroy_async;
>   
>   	enum xe_exec_queue_priority active_priority;
>   	struct list_head active_link;
> diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
> index a3f421e2adc0..c30c0e3ccbbb 100644
> --- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
> @@ -35,8 +35,8 @@ struct xe_guc_exec_queue {
>   	struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
>   	/** @lr_tdr: long running TDR worker */
>   	struct work_struct lr_tdr;
> -	/** @fini_async: do final fini async from this worker */
> -	struct work_struct fini_async;
> +	/** @destroy_async: do final destroy async from this worker */
> +	struct work_struct destroy_async;
>   	/** @resume_time: time of last resume */
>   	u64 resume_time;
>   	/** @state: GuC specific state for this xe_exec_queue */
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 860c07da598a..75208ea4d408 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1418,48 +1418,57 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	return DRM_GPU_SCHED_STAT_NO_HANG;
>   }
>   
> -static void __guc_exec_queue_fini_async(struct work_struct *w)
> +static void guc_exec_queue_fini(struct xe_exec_queue *q)
> +{
> +	struct xe_guc_exec_queue *ge = q->guc;
> +	struct xe_guc *guc = exec_queue_to_guc(q);
> +
> +	release_guc_id(guc, q);
> +	xe_sched_entity_fini(&ge->entity);
> +	xe_sched_fini(&ge->sched);
> +
> +	/*
> +	 * RCU free due sched being exported via DRM scheduler fences
> +	 * (timeline name).
> +	 */
> +	kfree_rcu(ge, rcu);
> +}
> +
> +static void __guc_exec_queue_destroy_async(struct work_struct *w)
>   {
>   	struct xe_guc_exec_queue *ge =
> -		container_of(w, struct xe_guc_exec_queue, fini_async);
> +		container_of(w, struct xe_guc_exec_queue, destroy_async);
>   	struct xe_exec_queue *q = ge->q;
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   
>   	xe_pm_runtime_get(guc_to_xe(guc));
>   	trace_xe_exec_queue_destroy(q);
>   
> -	release_guc_id(guc, q);
>   	if (xe_exec_queue_is_lr(q))
>   		cancel_work_sync(&ge->lr_tdr);
>   	/* Confirm no work left behind accessing device structures */
>   	cancel_delayed_work_sync(&ge->sched.base.work_tdr);
> -	xe_sched_entity_fini(&ge->entity);
> -	xe_sched_fini(&ge->sched);
>   
> -	/*
> -	 * RCU free due sched being exported via DRM scheduler fences
> -	 * (timeline name).
> -	 */
> -	kfree_rcu(ge, rcu);
>   	xe_exec_queue_fini(q);
> +
>   	xe_pm_runtime_put(guc_to_xe(guc));
>   }
>   
> -static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
> +static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
>   {
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   	struct xe_device *xe = guc_to_xe(guc);
>   
> -	INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
> +	INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
>   
>   	/* We must block on kernel engines so slabs are empty on driver unload */
>   	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
> -		__guc_exec_queue_fini_async(&q->guc->fini_async);
> +		__guc_exec_queue_destroy_async(&q->guc->destroy_async);
>   	else
> -		queue_work(xe->destroy_wq, &q->guc->fini_async);
> +		queue_work(xe->destroy_wq, &q->guc->destroy_async);
>   }
>   
> -static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
> +static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q)
>   {
>   	/*
>   	 * Might be done from within the GPU scheduler, need to do async as we
> @@ -1468,7 +1477,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
>   	 * this we and don't really care when everything is fini'd, just that it
>   	 * is.
>   	 */
> -	guc_exec_queue_fini_async(q);
> +	guc_exec_queue_destroy_async(q);
>   }
>   
>   static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
> @@ -1482,7 +1491,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
>   	if (exec_queue_registered(q))
>   		disable_scheduling_deregister(guc, q);
>   	else
> -		__guc_exec_queue_fini(guc, q);
> +		__guc_exec_queue_destroy(guc, q);
>   }
>   
>   static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
> @@ -1715,14 +1724,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
>   #define STATIC_MSG_CLEANUP	0
>   #define STATIC_MSG_SUSPEND	1
>   #define STATIC_MSG_RESUME	2
> -static void guc_exec_queue_fini(struct xe_exec_queue *q)
> +static void guc_exec_queue_destroy(struct xe_exec_queue *q)
>   {
>   	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
>   
>   	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
>   		guc_exec_queue_add_msg(q, msg, CLEANUP);
>   	else
> -		__guc_exec_queue_fini(exec_queue_to_guc(q), q);
> +		__guc_exec_queue_destroy(exec_queue_to_guc(q), q);
>   }
>   
>   static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
> @@ -1852,6 +1861,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
>   	.init = guc_exec_queue_init,
>   	.kill = guc_exec_queue_kill,
>   	.fini = guc_exec_queue_fini,
> +	.destroy = guc_exec_queue_destroy,
>   	.set_priority = guc_exec_queue_set_priority,
>   	.set_timeslice = guc_exec_queue_set_timeslice,
>   	.set_preempt_timeout = guc_exec_queue_set_preempt_timeout,
> @@ -1873,7 +1883,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>   		if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>   			xe_exec_queue_put(q);
>   		else if (exec_queue_destroyed(q))
> -			__guc_exec_queue_fini(guc, q);
> +			__guc_exec_queue_destroy(guc, q);
>   	}
>   	if (q->guc->suspend_pending) {
>   		set_exec_queue_suspended(q);
> @@ -2202,7 +2212,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
>   	if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>   		xe_exec_queue_put(q);
>   	else
> -		__guc_exec_queue_fini(guc, q);
> +		__guc_exec_queue_destroy(guc, q);
>   }
>   
>   int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)

next prev parent reply	other threads:[~2025-09-04 20:45 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-18 23:46 [PATCH 1/2] drm/xe: Fix error handling if PXP fails to start Daniele Ceraolo Spurio
2025-08-18 23:46 ` [PATCH 2/2] drm/xe: Allow error injection for xe_pxp_exec_queue_add Daniele Ceraolo Spurio
2025-09-04 20:46   ` John Harrison
2025-08-19  0:18 ` ✗ CI.checkpatch: warning for series starting with [1/2] drm/xe: Fix error handling if PXP fails to start Patchwork
2025-08-19  0:19 ` ✓ CI.KUnit: success " Patchwork
2025-08-19  0:52 ` ✓ Xe.CI.BAT: " Patchwork
2025-08-19 18:50 ` ✓ Xe.CI.Full: " Patchwork
2025-09-04 20:45 ` John Harrison [this message]
2025-09-04 20:55   ` [PATCH 1/2] " Daniele Ceraolo Spurio
2025-09-04 20:57     ` Daniele Ceraolo Spurio
2025-09-04 21:11       ` John Harrison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=fe1b455b-9917-4dec-8a16-94daa922263f@intel.com \
    --to=john.c.harrison@intel.com \
    --cc=daniele.ceraolospurio@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox