public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Matthew Brost <matthew.brost@intel.com>
To: Raag Jadav <raag.jadav@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <rodrigo.vivi@intel.com>,
	<thomas.hellstrom@linux.intel.com>, <riana.tauro@intel.com>,
	<michal.wajdeczko@intel.com>, <matthew.d.roper@intel.com>,
	<michal.winiarski@intel.com>, <matthew.auld@intel.com>,
	<maarten@lankhorst.se>
Subject: Re: [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit()
Date: Fri, 27 Feb 2026 10:06:21 -0800	[thread overview]
Message-ID: <aaHdHSyXmlix/rkl@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <20260227170049.3418863-7-raag.jadav@intel.com>

On Fri, Feb 27, 2026 at 10:30:46PM +0530, Raag Jadav wrote:
> In preparation for use cases which require re-initializing the LRC after
> PCIe FLR, introduce the xe_lrc_reinit() helper. The LRC bo already exists,
> but since its contents are in VRAM, they are lost on PCIe FLR. Recreate
> the ring context as part of re-initialization.
> 
> Signed-off-by: Raag Jadav <raag.jadav@intel.com>
> ---
> v2: Re-initialize migrate context (Matthew Brost)
> ---
>  drivers/gpu/drm/xe/xe_lrc.c | 149 +++++++++++++++++++++---------------
>  drivers/gpu/drm/xe/xe_lrc.h |   2 +
>  2 files changed, 90 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 84360fcdf743..9fc8720f62ca 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -1438,65 +1438,16 @@ void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_pri
>  	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
>  }
>  
> -static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> -		       struct xe_vm *vm, void *replay_state, u32 ring_size,
> -		       u16 msix_vec,
> -		       u32 init_flags)
> +static int xe_lrc_init_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> +			   void *replay_state, u16 msix_vec, u32 init_flags)
>  {
>  	struct xe_gt *gt = hwe->gt;
> -	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> -	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
>  	struct xe_tile *tile = gt_to_tile(gt);
>  	struct xe_device *xe = gt_to_xe(gt);
> -	struct xe_bo *seqno_bo;
>  	struct iosys_map map;
>  	u32 arb_enable;
> -	u32 bo_flags;
>  	int err;
>  
> -	kref_init(&lrc->refcount);
> -	lrc->gt = gt;
> -	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> -	lrc->size = lrc_size;
> -	lrc->flags = 0;
> -	lrc->ring.size = ring_size;
> -	lrc->ring.tail = 0;
> -
> -	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> -		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> -		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> -	}
> -
> -	if (xe_gt_has_indirect_ring_state(gt))
> -		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> -
> -	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> -		   XE_BO_FLAG_GGTT_INVALIDATE;
> -
> -	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> -		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> -
> -	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
> -					    bo_size,
> -					    ttm_bo_type_kernel,
> -					    bo_flags, false);
> -	if (IS_ERR(lrc->bo))
> -		return PTR_ERR(lrc->bo);
> -
> -	seqno_bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> -					     ttm_bo_type_kernel,
> -					     XE_BO_FLAG_GGTT |
> -					     XE_BO_FLAG_GGTT_INVALIDATE |
> -					     XE_BO_FLAG_SYSTEM, false);
> -	if (IS_ERR(seqno_bo)) {
> -		err = PTR_ERR(seqno_bo);
> -		goto err_lrc_finish;
> -	}
> -	lrc->seqno_bo = seqno_bo;
> -
> -	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> -			     hwe->fence_irq, hwe->name);
> -
>  	/*
>  	 * Init Per-Process of HW status Page, LRC / context state to known
>  	 * values. If there's already a primed default_lrc, just copy it, otherwise
> @@ -1508,7 +1459,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
>  		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
>  				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
> -				 lrc_size - LRC_PPHWSP_SIZE);
> +				 lrc->size - LRC_PPHWSP_SIZE);
>  		if (replay_state)
>  			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
>  					 replay_state, lrc->replay_size);
> @@ -1516,21 +1467,16 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  		void *init_data = empty_lrc_data(hwe);
>  
>  		if (!init_data) {
> -			err = -ENOMEM;
> -			goto err_lrc_finish;
> +			return -ENOMEM;
>  		}
>  
> -		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
> +		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
>  		kfree(init_data);
>  	}
>  
> -	if (vm) {
> +	if (vm)
>  		xe_lrc_set_ppgtt(lrc, vm);
>  
> -		if (vm->xef)
> -			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> -	}
> -
>  	if (xe_device_has_msix(xe)) {
>  		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
>  				     xe_memirq_status_ptr(&tile->memirq, hwe));
> @@ -1602,12 +1548,93 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  
>  	err = setup_wa_bb(lrc, hwe);
>  	if (err)
> -		goto err_lrc_finish;
> +		return err;
>  
>  	err = setup_indirect_ctx(lrc, hwe);
> +
> +	return err;
> +}
> +
> +/**
> + * xe_lrc_reinit() - Re-initialize LRC
> + * @lrc: Pointer to the LRC
> + * @hwe: Hardware Engine
> + * @vm: The VM (address space)
> + * @replay_state: GPU hang replay state
> + * @msix_vec: MSI-X interrupt vector (for platforms that support it)
> + * @init_flags: LRC initialization flags
> + *
> + * Returns: 0 on success, negative error code otherwise.
> + */
> +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> +		  void *replay_state, u16 msix_vec, u32 init_flags)
> +{

I think you likely want to set lrc->ring.tail = 0 here (or in
xe_lrc_init_ctx), right? Alternatively, you could set both
INDIRECT_CTX_RING_HEAD and INDIRECT_CTX_RING_TAIL to lrc->ring.tail in
xe_lrc_init_ctx.

Consider the case where a bunch of work has run on the migration queue
and lrc->ring.tail ends up in the middle of the ring, and then
xe_lrc_reinit is called. The next submission on the LRC will execute the
instructions between 0 (INDIRECT_CTX_RING_HEAD is set to zero in
xe_lrc_init_ctx) and the lrc->ring.tail value at the time xe_lrc_reinit
was invoked, which will be stale or invalid if VRAM was clobbered.

I would have expected this to show up in testing if you ran something
like:

xe_exec_basic;
echo 1 > /sys/bus/pci/devices/<BDF>/reset;
xe_exec_basic;

Otherwise, adding xe_lrc_init_ctx is a good cleanup regardless of Xe PCIe
FLR, so if we work out the above, feel free to post this as an independent
patch which we can merge ahead of Xe PCIe FLR.

Matt

> +	return xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> +}
> +
> +static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> +		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
> +{
> +	struct xe_gt *gt = hwe->gt;
> +	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> +	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> +	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_device *xe = gt_to_xe(gt);
> +	struct xe_bo *bo;
> +	u32 bo_flags;
> +	int err;
> +
> +	kref_init(&lrc->refcount);
> +	lrc->gt = gt;
> +	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> +	lrc->size = lrc_size;
> +	lrc->flags = 0;
> +	lrc->ring.size = ring_size;
> +	lrc->ring.tail = 0;
> +
> +	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> +		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> +		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> +	}
> +
> +	if (xe_gt_has_indirect_ring_state(gt))
> +		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> +
> +	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> +		   XE_BO_FLAG_GGTT_INVALIDATE;
> +
> +	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> +		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> +
> +	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
> +				       ttm_bo_type_kernel,
> +				       bo_flags, false);
> +	if (IS_ERR(lrc->bo))
> +		return PTR_ERR(lrc->bo);
> +
> +	lrc->bo = bo;
> +
> +	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> +				       ttm_bo_type_kernel,
> +				       XE_BO_FLAG_GGTT |
> +				       XE_BO_FLAG_GGTT_INVALIDATE |
> +				       XE_BO_FLAG_SYSTEM, false);
> +	if (IS_ERR(bo)) {
> +		err = PTR_ERR(bo);
> +		goto err_lrc_finish;
> +	}
> +	lrc->seqno_bo = bo;
> +
> +	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> +			     hwe->fence_irq, hwe->name);
> +
> +	err = xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
>  	if (err)
>  		goto err_lrc_finish;
>  
> +	if (vm && vm->xef)
> +		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> +
>  	return 0;
>  
>  err_lrc_finish:
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index 3e500004f1ae..af31de8df408 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -52,6 +52,8 @@ struct xe_lrc_snapshot {
>  
>  struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
>  			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
> +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> +		  void *replay_state, u16 msix_vec, u32 init_flags);
>  void xe_lrc_destroy(struct kref *ref);
>  
>  /**
> -- 
> 2.43.0
> 

  reply	other threads:[~2026-02-27 18:06 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-27 17:00 [PATCH v2 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-02-27 17:00 ` [PATCH v2 1/9] drm/xe/uc_fw: Allow re-initializing firmware Raag Jadav
2026-02-27 17:00 ` [PATCH v2 2/9] drm/xe/gt: Introduce FLR helpers Raag Jadav
2026-02-27 17:00 ` [PATCH v2 3/9] drm/xe/irq: Introduce xe_irq_disable() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 4/9] drm/xe: Introduce xe_device_assert_lmem_ready() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 5/9] drm/xe/bo_evict: Introduce xe_bo_restore_map() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit() Raag Jadav
2026-02-27 18:06   ` Matthew Brost [this message]
2026-02-28  5:11     ` Raag Jadav
2026-02-27 17:00 ` [PATCH v2 7/9] drm/xe/exec_queue: Introduce xe_exec_queue_reinit() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 8/9] drm/xe/migrate: Introduce xe_migrate_reinit() Raag Jadav
2026-02-27 18:32   ` Matthew Brost
2026-02-28  5:12     ` Raag Jadav
2026-03-03  5:29       ` Raag Jadav
2026-02-27 17:00 ` [PATCH v2 9/9] drm/xe/pci: Introduce PCIe FLR Raag Jadav
2026-02-27 17:49   ` Vivi, Rodrigo
2026-02-28  5:24     ` Raag Jadav
2026-03-02 16:58       ` Rodrigo Vivi
2026-03-02 19:37     ` Laguna, Lukasz
2026-02-27 17:50 ` [PATCH v2 0/9] Introduce Xe " Vivi, Rodrigo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aaHdHSyXmlix/rkl@lstrano-desk.jf.intel.com \
    --to=matthew.brost@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=maarten@lankhorst.se \
    --cc=matthew.auld@intel.com \
    --cc=matthew.d.roper@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=michal.winiarski@intel.com \
    --cc=raag.jadav@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox