From: Matthew Brost <matthew.brost@intel.com>
To: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH 09/16] drm/xe/multi_queue: Handle tearing down of a multi queue
Date: Sat, 1 Nov 2025 17:39:10 -0700
Message-ID: <aQaoLnIMeoKzYtNA@lstrano-desk.jf.intel.com>
In-Reply-To: <20251031182936.1882062-10-niranjana.vishwanathapura@intel.com>
On Fri, Oct 31, 2025 at 11:29:29AM -0700, Niranjana Vishwanathapura wrote:
> All queues of a multi queue group use the primary queue of the group
> to interface with the GuC, so there is a dependency between the queues
> of the group. Hence, when the primary queue of a multi queue group is
> cleaned up, also trigger a cleanup of the secondary queues. During
> cleanup, stop and restart submission on all queues of the multi queue
> group to avoid any submission happening in parallel while a queue is
> being cleaned up.
>
> Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec_queue.c | 2 +
> drivers/gpu/drm/xe/xe_exec_queue_types.h | 4 +
> drivers/gpu/drm/xe/xe_guc_submit.c | 150 +++++++++++++++++++----
> 3 files changed, 134 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 98f8f1c7f13b..3c1bb4f10fd5 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -85,6 +85,7 @@ static void xe_exec_queue_group_cleanup(struct xe_exec_queue *q)
>
> xa_destroy(&group->xa);
> mutex_destroy(&group->lock);
> + mutex_destroy(&group->list_lock);
You init this lock in an earlier patch but destroy it in this one. Can
we get the init / destroy / instantiation into a single patch?
> xe_bo_unpin_map_no_vm(group->cgp_bo);
> kfree(group);
> }
> @@ -605,6 +606,7 @@ static int xe_exec_queue_group_init(struct xe_device *xe, struct xe_exec_queue *
>
> group->primary = q;
> group->cgp_bo = bo;
> + INIT_LIST_HEAD(&group->list);
> xa_init_flags(&group->xa, XA_FLAGS_ALLOC1);
> mutex_init(&group->lock);
> mutex_init(&group->list_lock);
group->list_lock is taken in the submission backend, which is entirely
in the reclaim path. Can we teach lockdep that this lock is taken in
the path of reclaim?

e.g., prime lockdep once at init time:

	/* Record reclaim -> list_lock so lockdep can flag inversions */
	fs_reclaim_acquire(GFP_KERNEL);
	might_lock(&group->list_lock);
	fs_reclaim_release(GFP_KERNEL);
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index dcb55b069ed8..e64b6588923e 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -51,6 +51,8 @@ struct xe_exec_queue_group {
> struct xe_bo *cgp_bo;
> /** @xa: xarray to store LRCs */
> struct xarray xa;
> + /** @list: List of all secondary queues in the group */
> + struct list_head list;
> /** @list_lock: Secondary queue list lock */
> struct mutex list_lock;
> /** @sync_pending: CGP_SYNC_DONE g2h response pending */
> @@ -140,6 +142,8 @@ struct xe_exec_queue {
> struct {
> /** @multi_queue.group: Queue group information */
> struct xe_exec_queue_group *group;
> + /** @multi_queue.link: Link into group's secondary queues list */
> + struct list_head link;
> /** @multi_queue.priority: Queue priority within the multi-queue group */
> enum xe_multi_queue_priority priority;
> /** @multi_queue.pos: Position of queue within the multi-queue group */
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index b84a0be2eefe..87c13feb2cef 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -920,6 +920,81 @@ static void wq_item_append(struct xe_exec_queue *q)
> parallel_write(xe, map, wq_desc.tail, q->guc->wqi_tail);
> }
>
> +static void xe_guc_exec_queue_submission_start(struct xe_exec_queue *q)
> +{
> + /*
> + * If the exec queue is part of a multi queue group, then start submission
> + * on all queues of the multi queue group.
> + */
> + if (xe_exec_queue_is_multi_queue(q)) {
> + struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
> + struct xe_exec_queue_group *group = q->multi_queue.group;
> + struct xe_exec_queue *eq;
> +
> + xe_sched_submission_start(&primary->guc->sched);
> +
> + mutex_lock(&group->list_lock);
> + list_for_each_entry(eq, &group->list, multi_queue.link)
> + xe_sched_submission_start(&eq->guc->sched);
> + mutex_unlock(&group->list_lock);
> + } else {
> + xe_sched_submission_start(&q->guc->sched);
> + }
> +}
> +
> +static void xe_guc_exec_queue_submission_stop(struct xe_exec_queue *q)
> +{
> + /*
> + * If the exec queue is part of a multi queue group, then stop submission
> + * on all queues of the multi queue group.
> + */
> + if (xe_exec_queue_is_multi_queue(q)) {
> + struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
> + struct xe_exec_queue_group *group = q->multi_queue.group;
> + struct xe_exec_queue *eq;
> +
> + xe_sched_submission_stop(&primary->guc->sched);
> +
> + mutex_lock(&group->list_lock);
> + list_for_each_entry(eq, &group->list, multi_queue.link)
> + xe_sched_submission_stop(&eq->guc->sched);
> + mutex_unlock(&group->list_lock);
> + } else {
> + xe_sched_submission_stop(&q->guc->sched);
> + }
> +}
> +
> +static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> +{
> + struct xe_guc *guc = exec_queue_to_guc(q);
> + struct xe_device *xe = guc_to_xe(guc);
> +
> + /** to wakeup xe_wait_user_fence ioctl if exec queue is reset */
> + wake_up_all(&xe->ufence_wq);
> +
> + if (xe_exec_queue_is_lr(q))
> + queue_work(guc_to_gt(guc)->ordered_wq, &q->guc->lr_tdr);
> + else
> + xe_sched_tdr_queue_imm(&q->guc->sched);
> +}
> +
> +static void xe_guc_exec_queue_trigger_secondary_cleanup(struct xe_exec_queue *q)
> +{
> + struct xe_exec_queue *primary = xe_exec_queue_multi_queue_primary(q);
> + struct xe_exec_queue_group *group = q->multi_queue.group;
> + struct xe_exec_queue *eq;
> +
> + mutex_lock(&group->list_lock);
> + list_for_each_entry(eq, &group->list, multi_queue.link) {
> + if (exec_queue_reset(primary))
Do we need to propagate banned or killed too?

Also, what happens if a secondary queue is reset or one of its jobs
times out? Does that affect any of the other LRCs in the group?
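e.g., a sketch of the propagation I have in mind (assuming
set_exec_queue_banned() / set_exec_queue_killed() helpers exist,
following the pattern of set_exec_queue_reset()):

	/* Mirror the primary's terminal state onto each secondary */
	if (exec_queue_reset(primary))
		set_exec_queue_reset(eq);
	if (exec_queue_banned(primary))
		set_exec_queue_banned(eq);
	if (exec_queue_killed(primary))
		set_exec_queue_killed(eq);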
> + set_exec_queue_reset(eq);
> +
> + if (!exec_queue_banned(eq))
> + xe_guc_exec_queue_trigger_cleanup(eq);
> + }
> + mutex_unlock(&group->list_lock);
> +}
> +
> #define RESUME_PENDING ~0x0ull
> static void submit_exec_queue(struct xe_exec_queue *q, struct xe_sched_job *job)
> {
> @@ -1098,20 +1173,6 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
> G2H_LEN_DW_DEREGISTER_CONTEXT, 2);
> }
>
> -static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> -{
> - struct xe_guc *guc = exec_queue_to_guc(q);
> - struct xe_device *xe = guc_to_xe(guc);
> -
> - /** to wakeup xe_wait_user_fence ioctl if exec queue is reset */
> - wake_up_all(&xe->ufence_wq);
> -
> - if (xe_exec_queue_is_lr(q))
> - queue_work(guc_to_gt(guc)->ordered_wq, &q->guc->lr_tdr);
> - else
> - xe_sched_tdr_queue_imm(&q->guc->sched);
> -}
> -
> /**
> * xe_guc_submit_wedge() - Wedge GuC submission
> * @guc: the GuC object
> @@ -1185,8 +1246,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> if (!exec_queue_killed(q))
> wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
>
> - /* Kill the run_job / process_msg entry points */
> - xe_sched_submission_stop(sched);
> + /*
> + * Kill the run_job / process_msg entry points.
> + * As this function is serialized across exec queues, it is safe to
> + * stop and restart submission on all queues of a multi queue group.
> + */
> + xe_guc_exec_queue_submission_stop(q);
>
> /*
> * Engine state now mostly stable, disable scheduling / deregister if
> @@ -1222,7 +1287,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> q->guc->id);
> xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
> q->guc->id);
> - xe_sched_submission_start(sched);
> + xe_guc_exec_queue_submission_start(q);
> xe_gt_reset_async(q->gt);
> return;
> }
> @@ -1233,7 +1298,11 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>
> xe_hw_fence_irq_stop(q->fence_irq);
>
> - xe_sched_submission_start(sched);
> + xe_guc_exec_queue_submission_start(q);
> +
> + /* Trigger cleanup of secondary queues of multi queue group */
> + if (xe_exec_queue_is_multi_queue_primary(q))
> + xe_guc_exec_queue_trigger_secondary_cleanup(q);
>
> spin_lock(&sched->base.job_list_lock);
> list_for_each_entry(job, &sched->base.pending_list, drm.list)
> @@ -1392,8 +1461,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> vf_recovery(guc))
> return DRM_GPU_SCHED_STAT_NO_HANG;
>
> - /* Kill the run_job entry point */
> - xe_sched_submission_stop(sched);
> + /*
> + * Kill the run_job entry point.
> + * As this function is serialized across exec queues, it is safe to
> + * stop and restart submission on all queues of a multi queue group.
> + */
> + xe_guc_exec_queue_submission_stop(q);
>
I don't know where to stick this comment, but disable_scheduling()
looks like a pure software thing for secondary queues. We currently
need the LRC to not be running to accurately sample the timestamp - I
think we could fix that part, Umesh would likely know for sure. But
until then I'm pretty sure we'd need to disable scheduling on the
primary to get an accurate sample of a secondary queue's LRC timestamp.
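e.g., a very rough sketch of what I mean (assuming the existing
disable_scheduling() helper is safe to call on the primary from a
secondary's timeout handler, which would need to be verified):

	/* Secondary LRCs only stop ticking once the primary is descheduled */
	if (xe_exec_queue_is_multi_queue_secondary(q))
		disable_scheduling(xe_exec_queue_multi_queue_primary(q),
				   false);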
> /* Must check all state after stopping scheduler */
> skip_timeout_check = exec_queue_reset(q) ||
> @@ -1552,7 +1625,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> * fences that are complete
> */
> xe_sched_add_pending_job(sched, job);
> - xe_sched_submission_start(sched);
> + xe_guc_exec_queue_submission_start(q);
>
> xe_guc_exec_queue_trigger_cleanup(q);
>
> @@ -1565,6 +1638,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> /* Start fence signaling */
> xe_hw_fence_irq_start(q->fence_irq);
>
> + /* Trigger cleanup of secondary queues of multi queue group */
> + if (xe_exec_queue_is_multi_queue_primary(q))
> + xe_guc_exec_queue_trigger_secondary_cleanup(q);
> +
I'd stick this part next to the xe_guc_exec_queue_trigger_cleanup(q)
call above.
> return DRM_GPU_SCHED_STAT_RESET;
>
> sched_enable:
> @@ -1576,7 +1653,11 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> * but there is not currently an easy way to do in DRM scheduler. With
> * some thought, do this in a follow up.
> */
> - xe_sched_submission_start(sched);
> + xe_guc_exec_queue_submission_start(q);
> +
> + /* Trigger cleanup of secondary queues of multi queue group */
> + if (xe_exec_queue_is_multi_queue_primary(q))
> + xe_guc_exec_queue_trigger_secondary_cleanup(q);
I don't think you need to trigger a cleanup here - this is not a hang
situation, rather a false timeout.
Matt
> handle_vf_resume:
> return DRM_GPU_SCHED_STAT_NO_HANG;
> }
> @@ -1607,6 +1688,14 @@ static void __guc_exec_queue_destroy_async(struct work_struct *w)
> xe_pm_runtime_get(guc_to_xe(guc));
> trace_xe_exec_queue_destroy(q);
>
> + if (xe_exec_queue_is_multi_queue_secondary(q)) {
> + struct xe_exec_queue_group *group = q->multi_queue.group;
> +
> + mutex_lock(&group->list_lock);
> + list_del(&q->multi_queue.link);
> + mutex_unlock(&group->list_lock);
> + }
> +
> if (xe_exec_queue_is_lr(q))
> cancel_work_sync(&ge->lr_tdr);
> /* Confirm no work left behind accessing device structures */
> @@ -1897,6 +1986,19 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
>
> xe_exec_queue_assign_name(q, q->guc->id);
>
> + /*
> + * Maintain secondary queues of the multi queue group in a list
> + * for handling dependencies across the queues in the group.
> + */
> + if (xe_exec_queue_is_multi_queue_secondary(q)) {
> + struct xe_exec_queue_group *group = q->multi_queue.group;
> +
> + INIT_LIST_HEAD(&q->multi_queue.link);
> + mutex_lock(&group->list_lock);
> + list_add_tail(&q->multi_queue.link, &group->list);
> + mutex_unlock(&group->list_lock);
> + }
> +
> trace_xe_exec_queue_create(q);
>
> return 0;
> @@ -2125,6 +2227,10 @@ static void guc_exec_queue_resume(struct xe_exec_queue *q)
>
> static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)
> {
> + if (xe_exec_queue_is_multi_queue_secondary(q) &&
> + guc_exec_queue_reset_status(xe_exec_queue_multi_queue_primary(q)))
> + return true;
> +
> return exec_queue_reset(q) || exec_queue_killed_or_banned_or_wedged(q);
> }
>
> --
> 2.43.0
>