Re: [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit()

public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed

From: Raag Jadav <raag.jadav@intel.com>
To: Matthew Brost <matthew.brost@intel.com>
Cc: intel-xe@lists.freedesktop.org, rodrigo.vivi@intel.com,
	thomas.hellstrom@linux.intel.com, riana.tauro@intel.com,
	michal.wajdeczko@intel.com, matthew.d.roper@intel.com,
	michal.winiarski@intel.com, matthew.auld@intel.com,
	maarten@lankhorst.se
Subject: Re: [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit()
Date: Sat, 28 Feb 2026 06:11:22 +0100	[thread overview]
Message-ID: <aaJ4-gjgZBj0V7BA@black.igk.intel.com> (raw)
In-Reply-To: <aaHdHSyXmlix/rkl@lstrano-desk.jf.intel.com>

On Fri, Feb 27, 2026 at 10:06:21AM -0800, Matthew Brost wrote:
> On Fri, Feb 27, 2026 at 10:30:46PM +0530, Raag Jadav wrote:
> > In preparation of usecases which require re-initializing LRC after PCIe
> > FLR, introduce xe_lrc_reinit() helper. The LRC bo already exists but
> > since its contents are on VRAM, they are lost on PCIe FLR. Recreate
> > ring context as part of re-initialization.
> > 
> > Signed-off-by: Raag Jadav <raag.jadav@intel.com>
> > ---
> > v2: Re-initialize migrate context (Matthew Brost)
> > ---
> >  drivers/gpu/drm/xe/xe_lrc.c | 149 +++++++++++++++++++++---------------
> >  drivers/gpu/drm/xe/xe_lrc.h |   2 +
> >  2 files changed, 90 insertions(+), 61 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > index 84360fcdf743..9fc8720f62ca 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > @@ -1438,65 +1438,16 @@ void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_pri
> >  	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
> >  }
> >  
> > -static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > -		       struct xe_vm *vm, void *replay_state, u32 ring_size,
> > -		       u16 msix_vec,
> > -		       u32 init_flags)
> > +static int xe_lrc_init_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > +			   void *replay_state, u16 msix_vec, u32 init_flags)
> >  {
> >  	struct xe_gt *gt = hwe->gt;
> > -	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > -	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> >  	struct xe_tile *tile = gt_to_tile(gt);
> >  	struct xe_device *xe = gt_to_xe(gt);
> > -	struct xe_bo *seqno_bo;
> >  	struct iosys_map map;
> >  	u32 arb_enable;
> > -	u32 bo_flags;
> >  	int err;
> >  
> > -	kref_init(&lrc->refcount);
> > -	lrc->gt = gt;
> > -	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > -	lrc->size = lrc_size;
> > -	lrc->flags = 0;
> > -	lrc->ring.size = ring_size;
> > -	lrc->ring.tail = 0;
> > -
> > -	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > -		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > -		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > -	}
> > -
> > -	if (xe_gt_has_indirect_ring_state(gt))
> > -		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > -
> > -	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > -		   XE_BO_FLAG_GGTT_INVALIDATE;
> > -
> > -	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > -		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > -
> > -	lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
> > -					    bo_size,
> > -					    ttm_bo_type_kernel,
> > -					    bo_flags, false);
> > -	if (IS_ERR(lrc->bo))
> > -		return PTR_ERR(lrc->bo);
> > -
> > -	seqno_bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > -					     ttm_bo_type_kernel,
> > -					     XE_BO_FLAG_GGTT |
> > -					     XE_BO_FLAG_GGTT_INVALIDATE |
> > -					     XE_BO_FLAG_SYSTEM, false);
> > -	if (IS_ERR(seqno_bo)) {
> > -		err = PTR_ERR(seqno_bo);
> > -		goto err_lrc_finish;
> > -	}
> > -	lrc->seqno_bo = seqno_bo;
> > -
> > -	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > -			     hwe->fence_irq, hwe->name);
> > -
> >  	/*
> >  	 * Init Per-Process of HW status Page, LRC / context state to known
> >  	 * values. If there's already a primed default_lrc, just copy it, otherwise
> > @@ -1508,7 +1459,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> >  		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
> >  		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> >  				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
> > -				 lrc_size - LRC_PPHWSP_SIZE);
> > +				 lrc->size - LRC_PPHWSP_SIZE);
> >  		if (replay_state)
> >  			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> >  					 replay_state, lrc->replay_size);
> > @@ -1516,21 +1467,16 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> >  		void *init_data = empty_lrc_data(hwe);
> >  
> >  		if (!init_data) {
> > -			err = -ENOMEM;
> > -			goto err_lrc_finish;
> > +			return -ENOMEM;
> >  		}
> >  
> > -		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
> > +		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
> >  		kfree(init_data);
> >  	}
> >  
> > -	if (vm) {
> > +	if (vm)
> >  		xe_lrc_set_ppgtt(lrc, vm);
> >  
> > -		if (vm->xef)
> > -			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > -	}
> > -
> >  	if (xe_device_has_msix(xe)) {
> >  		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
> >  				     xe_memirq_status_ptr(&tile->memirq, hwe));
> > @@ -1602,12 +1548,93 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> >  
> >  	err = setup_wa_bb(lrc, hwe);
> >  	if (err)
> > -		goto err_lrc_finish;
> > +		return err;
> >  
> >  	err = setup_indirect_ctx(lrc, hwe);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * xe_lrc_reinit() - Re-initialize LRC
> > + * @lrc: Pointer to the LRC
> > + * @hwe: Hardware Engine
> > + * @vm: The VM (address space)
> > + * @replay_state: GPU hang replay state
> > + * @msix_vec: MSI-X interrupt vector (for platforms that support it)
> > + * @init_flags: LRC initialization flags
> > + *
> > + * Returns: 0 on success, negative error code otherwise.
> > + */
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > +		  void *replay_state, u16 msix_vec, u32 init_flags)
> > +{
> 
> I think you likely want to set lrc->ring.tail = 0 here (or in
> xe_lrc_init_ctx), right? Alternatively, you could set both
> INDIRECT_CTX_RING_HEAD and INDIRECT_CTX_RING_TAIL to lrc->ring.tail in
> xe_lrc_init_ctx.
> 
> Consider the case where a bunch of work has run on the migration queue
> and lrc->ring.tail ends up in the middle of the ring, then xe_lrc_reinit
> is called. The next submission on the LRC will execute the instructions
> between 0 (INDIRECT_CTX_RING_HEAD is set to zero in xe_lrc_init_ctx) and
> the lrc->ring.tail value at the time xe_lrc_reinit was invoked, which
> will be stale or invalid if VRAM was clobbered.
> 
> I would have expected this to show up in testing if you ran something
> like:
> 
> xe_exec_basic;
> echo 1 > /sys/bus/pci/devices/<BDF>/reset;
> xe_exec_basic;
> 
> Otherwise this is a good cleanup, adding xe_lrc_init_ctx regardless of Xe
> PCIe FLR, so if we work out the above, feel free to post this as an
> independent patch which we can merge ahead of Xe PCIe FLR.

Makes sense.

Raag

> > +	return xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> > +}
> > +
> > +static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > +		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
> > +{
> > +	struct xe_gt *gt = hwe->gt;
> > +	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > +	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> > +	struct xe_tile *tile = gt_to_tile(gt);
> > +	struct xe_device *xe = gt_to_xe(gt);
> > +	struct xe_bo *bo;
> > +	u32 bo_flags;
> > +	int err;
> > +
> > +	kref_init(&lrc->refcount);
> > +	lrc->gt = gt;
> > +	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > +	lrc->size = lrc_size;
> > +	lrc->flags = 0;
> > +	lrc->ring.size = ring_size;
> > +	lrc->ring.tail = 0;
> > +
> > +	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > +		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > +		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > +	}
> > +
> > +	if (xe_gt_has_indirect_ring_state(gt))
> > +		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > +
> > +	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > +		   XE_BO_FLAG_GGTT_INVALIDATE;
> > +
> > +	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > +		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > +
> > +	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
> > +				       ttm_bo_type_kernel,
> > +				       bo_flags, false);
> > +	if (IS_ERR(lrc->bo))
> > +		return PTR_ERR(lrc->bo);
> > +
> > +	lrc->bo = bo;
> > +
> > +	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > +				       ttm_bo_type_kernel,
> > +				       XE_BO_FLAG_GGTT |
> > +				       XE_BO_FLAG_GGTT_INVALIDATE |
> > +				       XE_BO_FLAG_SYSTEM, false);
> > +	if (IS_ERR(bo)) {
> > +		err = PTR_ERR(bo);
> > +		goto err_lrc_finish;
> > +	}
> > +	lrc->seqno_bo = bo;
> > +
> > +	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > +			     hwe->fence_irq, hwe->name);
> > +
> > +	err = xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> >  	if (err)
> >  		goto err_lrc_finish;
> >  
> > +	if (vm && vm->xef)
> > +		xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > +
> >  	return 0;
> >  
> >  err_lrc_finish:
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> > index 3e500004f1ae..af31de8df408 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.h
> > +++ b/drivers/gpu/drm/xe/xe_lrc.h
> > @@ -52,6 +52,8 @@ struct xe_lrc_snapshot {
> >  
> >  struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
> >  			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > +		  void *replay_state, u16 msix_vec, u32 init_flags);
> >  void xe_lrc_destroy(struct kref *ref);
> >  
> >  /**
> > -- 
> > 2.43.0
> >

next prev parent reply	other threads:[~2026-02-28  5:11 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-27 17:00 [PATCH v2 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-02-27 17:00 ` [PATCH v2 1/9] drm/xe/uc_fw: Allow re-initializing firmware Raag Jadav
2026-02-27 17:00 ` [PATCH v2 2/9] drm/xe/gt: Introduce FLR helpers Raag Jadav
2026-02-27 17:00 ` [PATCH v2 3/9] drm/xe/irq: Introduce xe_irq_disable() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 4/9] drm/xe: Introduce xe_device_assert_lmem_ready() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 5/9] drm/xe/bo_evict: Introduce xe_bo_restore_map() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit() Raag Jadav
2026-02-27 18:06   ` Matthew Brost
2026-02-28  5:11     ` Raag Jadav [this message]
2026-02-27 17:00 ` [PATCH v2 7/9] drm/xe/exec_queue: Introduce xe_exec_queue_reinit() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 8/9] drm/xe/migrate: Introduce xe_migrate_reinit() Raag Jadav
2026-02-27 18:32   ` Matthew Brost
2026-02-28  5:12     ` Raag Jadav
2026-03-03  5:29       ` Raag Jadav
2026-02-27 17:00 ` [PATCH v2 9/9] drm/xe/pci: Introduce PCIe FLR Raag Jadav
2026-02-27 17:49   ` Vivi, Rodrigo
2026-02-28  5:24     ` Raag Jadav
2026-03-02 16:58       ` Rodrigo Vivi
2026-03-02 19:37     ` Laguna, Lukasz
2026-02-27 17:50 ` [PATCH v2 0/9] Introduce Xe " Vivi, Rodrigo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aaJ4-gjgZBj0V7BA@black.igk.intel.com \
    --to=raag.jadav@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=maarten@lankhorst.se \
    --cc=matthew.auld@intel.com \
    --cc=matthew.brost@intel.com \
    --cc=matthew.d.roper@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=michal.winiarski@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox