From: Raag Jadav <raag.jadav@intel.com>
To: Matthew Brost <matthew.brost@intel.com>
Cc: intel-xe@lists.freedesktop.org, rodrigo.vivi@intel.com,
thomas.hellstrom@linux.intel.com, riana.tauro@intel.com,
michal.wajdeczko@intel.com, matthew.d.roper@intel.com,
michal.winiarski@intel.com, matthew.auld@intel.com,
maarten@lankhorst.se
Subject: Re: [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit()
Date: Sat, 28 Feb 2026 06:11:22 +0100 [thread overview]
Message-ID: <aaJ4-gjgZBj0V7BA@black.igk.intel.com> (raw)
In-Reply-To: <aaHdHSyXmlix/rkl@lstrano-desk.jf.intel.com>
On Fri, Feb 27, 2026 at 10:06:21AM -0800, Matthew Brost wrote:
> On Fri, Feb 27, 2026 at 10:30:46PM +0530, Raag Jadav wrote:
> > In preparation for use cases which require re-initializing the LRC after
> > PCIe FLR, introduce the xe_lrc_reinit() helper. The LRC bo already exists,
> > but since its contents are in VRAM, they are lost on PCIe FLR. Recreate
> > the ring context as part of re-initialization.
> >
> > Signed-off-by: Raag Jadav <raag.jadav@intel.com>
> > ---
> > v2: Re-initialize migrate context (Matthew Brost)
> > ---
> > drivers/gpu/drm/xe/xe_lrc.c | 149 +++++++++++++++++++++---------------
> > drivers/gpu/drm/xe/xe_lrc.h | 2 +
> > 2 files changed, 90 insertions(+), 61 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > index 84360fcdf743..9fc8720f62ca 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > @@ -1438,65 +1438,16 @@ void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_pri
> > lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
> > }
> >
> > -static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > - struct xe_vm *vm, void *replay_state, u32 ring_size,
> > - u16 msix_vec,
> > - u32 init_flags)
> > +static int xe_lrc_init_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags)
> > {
> > struct xe_gt *gt = hwe->gt;
> > - const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > - u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> > struct xe_tile *tile = gt_to_tile(gt);
> > struct xe_device *xe = gt_to_xe(gt);
> > - struct xe_bo *seqno_bo;
> > struct iosys_map map;
> > u32 arb_enable;
> > - u32 bo_flags;
> > int err;
> >
> > - kref_init(&lrc->refcount);
> > - lrc->gt = gt;
> > - lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > - lrc->size = lrc_size;
> > - lrc->flags = 0;
> > - lrc->ring.size = ring_size;
> > - lrc->ring.tail = 0;
> > -
> > - if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > - lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > - bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > - }
> > -
> > - if (xe_gt_has_indirect_ring_state(gt))
> > - lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > -
> > - bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > - XE_BO_FLAG_GGTT_INVALIDATE;
> > -
> > - if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > - bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > -
> > - lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
> > - bo_size,
> > - ttm_bo_type_kernel,
> > - bo_flags, false);
> > - if (IS_ERR(lrc->bo))
> > - return PTR_ERR(lrc->bo);
> > -
> > - seqno_bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > - ttm_bo_type_kernel,
> > - XE_BO_FLAG_GGTT |
> > - XE_BO_FLAG_GGTT_INVALIDATE |
> > - XE_BO_FLAG_SYSTEM, false);
> > - if (IS_ERR(seqno_bo)) {
> > - err = PTR_ERR(seqno_bo);
> > - goto err_lrc_finish;
> > - }
> > - lrc->seqno_bo = seqno_bo;
> > -
> > - xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > - hwe->fence_irq, hwe->name);
> > -
> > /*
> > * Init Per-Process of HW status Page, LRC / context state to known
> > * values. If there's already a primed default_lrc, just copy it, otherwise
> > @@ -1508,7 +1459,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
> > xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> > gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
> > - lrc_size - LRC_PPHWSP_SIZE);
> > + lrc->size - LRC_PPHWSP_SIZE);
> > if (replay_state)
> > xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> > replay_state, lrc->replay_size);
> > @@ -1516,21 +1467,16 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > void *init_data = empty_lrc_data(hwe);
> >
> > if (!init_data) {
> > - err = -ENOMEM;
> > - goto err_lrc_finish;
> > + return -ENOMEM;
> > }
> >
> > - xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
> > + xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
> > kfree(init_data);
> > }
> >
> > - if (vm) {
> > + if (vm)
> > xe_lrc_set_ppgtt(lrc, vm);
> >
> > - if (vm->xef)
> > - xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > - }
> > -
> > if (xe_device_has_msix(xe)) {
> > xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
> > xe_memirq_status_ptr(&tile->memirq, hwe));
> > @@ -1602,12 +1548,93 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> >
> > err = setup_wa_bb(lrc, hwe);
> > if (err)
> > - goto err_lrc_finish;
> > + return err;
> >
> > err = setup_indirect_ctx(lrc, hwe);
> > +
> > + return err;
> > +}
> > +
> > +/**
> > + * xe_lrc_reinit() - Re-initialize LRC
> > + * @lrc: Pointer to the LRC
> > + * @hwe: Hardware Engine
> > + * @vm: The VM (address space)
> > + * @replay_state: GPU hang replay state
> > + * @msix_vec: MSI-X interrupt vector (for platforms that support it)
> > + * @init_flags: LRC initialization flags
> > + *
> > + * Returns: 0 on success, negative error code otherwise.
> > + */
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags)
> > +{
>
> I think you likely want to set lrc->ring.tail = 0 here (or in
> xe_lrc_init_ctx), right? Alternatively, you could set both
> INDIRECT_CTX_RING_HEAD and INDIRECT_CTX_RING_TAIL to lrc->ring.tail in
> xe_lrc_init_ctx.
>
> Consider the case where a bunch of work has run on the migration queue
> and lrc->ring.tail ends up in the middle of the ring, then xe_lrc_reinit
> is called. The next submission on the LRC will execute the instructions
> between 0 (INDIRECT_CTX_RING_HEAD is set to zero in xe_lrc_init_ctx) and
> the lrc->ring.tail value at the time xe_lrc_reinit was invoked, which
> will be stale or invalid if VRAM was clobbered.
>
> I would have expected this to show up in testing if you ran something
> like:
>
> xe_exec_basic;
> echo 1 > /sys/bus/pci/devices/<BDF>/reset;
> xe_exec_basic;
>
> Otherwise, adding xe_lrc_init_ctx is a good cleanup regardless of Xe
> PCIe FLR, so if we work out the above, feel free to post this as an
> independent patch which we can merge ahead of Xe PCIe FLR.
Makes sense.
Raag
> > + return xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> > +}
> > +
> > +static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
> > +{
> > + struct xe_gt *gt = hwe->gt;
> > + const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > + u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> > + struct xe_tile *tile = gt_to_tile(gt);
> > + struct xe_device *xe = gt_to_xe(gt);
> > + struct xe_bo *bo;
> > + u32 bo_flags;
> > + int err;
> > +
> > + kref_init(&lrc->refcount);
> > + lrc->gt = gt;
> > + lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > + lrc->size = lrc_size;
> > + lrc->flags = 0;
> > + lrc->ring.size = ring_size;
> > + lrc->ring.tail = 0;
> > +
> > + if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > + lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > + bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > + }
> > +
> > + if (xe_gt_has_indirect_ring_state(gt))
> > + lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > +
> > + bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > + XE_BO_FLAG_GGTT_INVALIDATE;
> > +
> > + if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > + bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > +
> > + bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
> > + ttm_bo_type_kernel,
> > + bo_flags, false);
> > + if (IS_ERR(lrc->bo))
> > + return PTR_ERR(lrc->bo);
> > +
> > + lrc->bo = bo;
> > +
> > + bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > + ttm_bo_type_kernel,
> > + XE_BO_FLAG_GGTT |
> > + XE_BO_FLAG_GGTT_INVALIDATE |
> > + XE_BO_FLAG_SYSTEM, false);
> > + if (IS_ERR(bo)) {
> > + err = PTR_ERR(bo);
> > + goto err_lrc_finish;
> > + }
> > + lrc->seqno_bo = bo;
> > +
> > + xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > + hwe->fence_irq, hwe->name);
> > +
> > + err = xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> > if (err)
> > goto err_lrc_finish;
> >
> > + if (vm && vm->xef)
> > + xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > +
> > return 0;
> >
> > err_lrc_finish:
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> > index 3e500004f1ae..af31de8df408 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.h
> > +++ b/drivers/gpu/drm/xe/xe_lrc.h
> > @@ -52,6 +52,8 @@ struct xe_lrc_snapshot {
> >
> > struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
> > void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags);
> > void xe_lrc_destroy(struct kref *ref);
> >
> > /**
> > --
> > 2.43.0
> >
next prev parent reply other threads:[~2026-02-28 5:11 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-27 17:00 [PATCH v2 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-02-27 17:00 ` [PATCH v2 1/9] drm/xe/uc_fw: Allow re-initializing firmware Raag Jadav
2026-02-27 17:00 ` [PATCH v2 2/9] drm/xe/gt: Introduce FLR helpers Raag Jadav
2026-02-27 17:00 ` [PATCH v2 3/9] drm/xe/irq: Introduce xe_irq_disable() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 4/9] drm/xe: Introduce xe_device_assert_lmem_ready() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 5/9] drm/xe/bo_evict: Introduce xe_bo_restore_map() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit() Raag Jadav
2026-02-27 18:06 ` Matthew Brost
2026-02-28 5:11 ` Raag Jadav [this message]
2026-02-27 17:00 ` [PATCH v2 7/9] drm/xe/exec_queue: Introduce xe_exec_queue_reinit() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 8/9] drm/xe/migrate: Introduce xe_migrate_reinit() Raag Jadav
2026-02-27 18:32 ` Matthew Brost
2026-02-28 5:12 ` Raag Jadav
2026-03-03 5:29 ` Raag Jadav
2026-02-27 17:00 ` [PATCH v2 9/9] drm/xe/pci: Introduce PCIe FLR Raag Jadav
2026-02-27 17:49 ` Vivi, Rodrigo
2026-02-28 5:24 ` Raag Jadav
2026-03-02 16:58 ` Rodrigo Vivi
2026-03-02 19:37 ` Laguna, Lukasz
2026-02-27 17:50 ` [PATCH v2 0/9] Introduce Xe " Vivi, Rodrigo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aaJ4-gjgZBj0V7BA@black.igk.intel.com \
--to=raag.jadav@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=maarten@lankhorst.se \
--cc=matthew.auld@intel.com \
--cc=matthew.brost@intel.com \
--cc=matthew.d.roper@intel.com \
--cc=michal.wajdeczko@intel.com \
--cc=michal.winiarski@intel.com \
--cc=riana.tauro@intel.com \
--cc=rodrigo.vivi@intel.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox