From: Raag Jadav <raag.jadav@intel.com>
To: Matthew Brost <matthew.brost@intel.com>
Cc: intel-xe@lists.freedesktop.org, rodrigo.vivi@intel.com,
thomas.hellstrom@linux.intel.com, riana.tauro@intel.com,
michal.wajdeczko@intel.com, matthew.d.roper@intel.com,
michal.winiarski@intel.com, matthew.auld@intel.com,
maarten@lankhorst.se
Subject: Re: [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit()
Date: Sat, 28 Feb 2026 06:11:22 +0100 [thread overview]
Message-ID: <aaJ4-gjgZBj0V7BA@black.igk.intel.com> (raw)
In-Reply-To: <aaHdHSyXmlix/rkl@lstrano-desk.jf.intel.com>
On Fri, Feb 27, 2026 at 10:06:21AM -0800, Matthew Brost wrote:
> On Fri, Feb 27, 2026 at 10:30:46PM +0530, Raag Jadav wrote:
> > In preparation for use cases which require re-initializing the LRC after
> > PCIe FLR, introduce the xe_lrc_reinit() helper. The LRC bo already exists,
> > but since its contents are in VRAM, they are lost on PCIe FLR. Recreate
> > the ring context as part of re-initialization.
> >
> > Signed-off-by: Raag Jadav <raag.jadav@intel.com>
> > ---
> > v2: Re-initialize migrate context (Matthew Brost)
> > ---
> > drivers/gpu/drm/xe/xe_lrc.c | 149 +++++++++++++++++++++---------------
> > drivers/gpu/drm/xe/xe_lrc.h | 2 +
> > 2 files changed, 90 insertions(+), 61 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > index 84360fcdf743..9fc8720f62ca 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > @@ -1438,65 +1438,16 @@ void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_pri
> > lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
> > }
> >
> > -static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > - struct xe_vm *vm, void *replay_state, u32 ring_size,
> > - u16 msix_vec,
> > - u32 init_flags)
> > +static int xe_lrc_init_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags)
> > {
> > struct xe_gt *gt = hwe->gt;
> > - const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > - u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> > struct xe_tile *tile = gt_to_tile(gt);
> > struct xe_device *xe = gt_to_xe(gt);
> > - struct xe_bo *seqno_bo;
> > struct iosys_map map;
> > u32 arb_enable;
> > - u32 bo_flags;
> > int err;
> >
> > - kref_init(&lrc->refcount);
> > - lrc->gt = gt;
> > - lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > - lrc->size = lrc_size;
> > - lrc->flags = 0;
> > - lrc->ring.size = ring_size;
> > - lrc->ring.tail = 0;
> > -
> > - if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > - lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > - bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > - }
> > -
> > - if (xe_gt_has_indirect_ring_state(gt))
> > - lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > -
> > - bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > - XE_BO_FLAG_GGTT_INVALIDATE;
> > -
> > - if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > - bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > -
> > - lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
> > - bo_size,
> > - ttm_bo_type_kernel,
> > - bo_flags, false);
> > - if (IS_ERR(lrc->bo))
> > - return PTR_ERR(lrc->bo);
> > -
> > - seqno_bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > - ttm_bo_type_kernel,
> > - XE_BO_FLAG_GGTT |
> > - XE_BO_FLAG_GGTT_INVALIDATE |
> > - XE_BO_FLAG_SYSTEM, false);
> > - if (IS_ERR(seqno_bo)) {
> > - err = PTR_ERR(seqno_bo);
> > - goto err_lrc_finish;
> > - }
> > - lrc->seqno_bo = seqno_bo;
> > -
> > - xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > - hwe->fence_irq, hwe->name);
> > -
> > /*
> > * Init Per-Process of HW status Page, LRC / context state to known
> > * values. If there's already a primed default_lrc, just copy it, otherwise
> > @@ -1508,7 +1459,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
> > xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> > gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
> > - lrc_size - LRC_PPHWSP_SIZE);
> > + lrc->size - LRC_PPHWSP_SIZE);
> > if (replay_state)
> > xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
> > replay_state, lrc->replay_size);
> > @@ -1516,21 +1467,16 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > void *init_data = empty_lrc_data(hwe);
> >
> > if (!init_data) {
> > - err = -ENOMEM;
> > - goto err_lrc_finish;
> > + return -ENOMEM;
> > }
> >
> > - xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
> > + xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
> > kfree(init_data);
> > }
> >
> > - if (vm) {
> > + if (vm)
> > xe_lrc_set_ppgtt(lrc, vm);
> >
> > - if (vm->xef)
> > - xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > - }
> > -
> > if (xe_device_has_msix(xe)) {
> > xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
> > xe_memirq_status_ptr(&tile->memirq, hwe));
> > @@ -1602,12 +1548,93 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> >
> > err = setup_wa_bb(lrc, hwe);
> > if (err)
> > - goto err_lrc_finish;
> > + return err;
> >
> > err = setup_indirect_ctx(lrc, hwe);
> > +
> > + return err;
> > +}
> > +
> > +/**
> > + * xe_lrc_reinit() - Re-initialize LRC
> > + * @lrc: Pointer to the LRC
> > + * @hwe: Hardware Engine
> > + * @vm: The VM (address space)
> > + * @replay_state: GPU hang replay state
> > + * @msix_vec: MSI-X interrupt vector (for platforms that support it)
> > + * @init_flags: LRC initialization flags
> > + *
> > + * Returns: 0 on success, negative error code otherwise.
> > + */
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags)
> > +{
>
> I think you likely want to set lrc->ring.tail = 0 here (or in
> xe_lrc_init_ctx), right? Alternatively, you could set both
> INDIRECT_CTX_RING_HEAD and INDIRECT_CTX_RING_TAIL to lrc->ring.tail in
> xe_lrc_init_ctx.
>
> Consider the case where a bunch of work has run on the migration queue
> and lrc->ring.tail ends up in the middle of the ring, then xe_lrc_reinit
> is called. The next submission on the LRC will execute the instructions
> between 0 (INDIRECT_CTX_RING_HEAD is set to zero in xe_lrc_init_ctx) and
> the lrc->ring.tail value at the time xe_lrc_reinit was invoked, which
> will be stale or invalid if VRAM was clobbered.
>
> I would have expected this to show up in testing if you ran something
> like:
>
> xe_exec_basic;
> echo 1 > /sys/bus/pci/devices/<BDF>/reset;
> xe_exec_basic;
>
> Otherwise, adding xe_lrc_init_ctx is a good cleanup regardless of Xe
> PCIe FLR, so if we work out the above, feel free to post this as an
> independent patch which we can merge ahead of Xe PCIe FLR.
Makes sense.
Raag
> > + return xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> > +}
> > +
> > +static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
> > +{
> > + struct xe_gt *gt = hwe->gt;
> > + const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
> > + u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
> > + struct xe_tile *tile = gt_to_tile(gt);
> > + struct xe_device *xe = gt_to_xe(gt);
> > + struct xe_bo *bo;
> > + u32 bo_flags;
> > + int err;
> > +
> > + kref_init(&lrc->refcount);
> > + lrc->gt = gt;
> > + lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
> > + lrc->size = lrc_size;
> > + lrc->flags = 0;
> > + lrc->ring.size = ring_size;
> > + lrc->ring.tail = 0;
> > +
> > + if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
> > + lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > + bo_size += LRC_INDIRECT_CTX_BO_SIZE;
> > + }
> > +
> > + if (xe_gt_has_indirect_ring_state(gt))
> > + lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > +
> > + bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
> > + XE_BO_FLAG_GGTT_INVALIDATE;
> > +
> > + if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
> > + bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
> > +
> > + bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
> > + ttm_bo_type_kernel,
> > + bo_flags, false);
> > + if (IS_ERR(lrc->bo))
> > + return PTR_ERR(lrc->bo);
> > +
> > + lrc->bo = bo;
> > +
> > + bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
> > + ttm_bo_type_kernel,
> > + XE_BO_FLAG_GGTT |
> > + XE_BO_FLAG_GGTT_INVALIDATE |
> > + XE_BO_FLAG_SYSTEM, false);
> > + if (IS_ERR(bo)) {
> > + err = PTR_ERR(bo);
> > + goto err_lrc_finish;
> > + }
> > + lrc->seqno_bo = bo;
> > +
> > + xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
> > + hwe->fence_irq, hwe->name);
> > +
> > + err = xe_lrc_init_ctx(lrc, hwe, vm, replay_state, msix_vec, init_flags);
> > if (err)
> > goto err_lrc_finish;
> >
> > + if (vm && vm->xef)
> > + xe_drm_client_add_bo(vm->xef->client, lrc->bo);
> > +
> > return 0;
> >
> > err_lrc_finish:
> > diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> > index 3e500004f1ae..af31de8df408 100644
> > --- a/drivers/gpu/drm/xe/xe_lrc.h
> > +++ b/drivers/gpu/drm/xe/xe_lrc.h
> > @@ -52,6 +52,8 @@ struct xe_lrc_snapshot {
> >
> > struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
> > void *replay_state, u32 ring_size, u16 msix_vec, u32 flags);
> > +int xe_lrc_reinit(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
> > + void *replay_state, u16 msix_vec, u32 init_flags);
> > void xe_lrc_destroy(struct kref *ref);
> >
> > /**
> > --
> > 2.43.0
> >
next prev parent reply other threads:[~2026-02-28 5:11 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-27 17:00 [PATCH v2 0/9] Introduce Xe PCIe FLR Raag Jadav
2026-02-27 17:00 ` [PATCH v2 1/9] drm/xe/uc_fw: Allow re-initializing firmware Raag Jadav
2026-02-27 17:00 ` [PATCH v2 2/9] drm/xe/gt: Introduce FLR helpers Raag Jadav
2026-02-27 17:00 ` [PATCH v2 3/9] drm/xe/irq: Introduce xe_irq_disable() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 4/9] drm/xe: Introduce xe_device_assert_lmem_ready() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 5/9] drm/xe/bo_evict: Introduce xe_bo_restore_map() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 6/9] drm/xe/lrc: Introduce xe_lrc_reinit() Raag Jadav
2026-02-27 18:06 ` Matthew Brost
2026-02-28 5:11 ` Raag Jadav [this message]
2026-02-27 17:00 ` [PATCH v2 7/9] drm/xe/exec_queue: Introduce xe_exec_queue_reinit() Raag Jadav
2026-02-27 17:00 ` [PATCH v2 8/9] drm/xe/migrate: Introduce xe_migrate_reinit() Raag Jadav
2026-02-27 18:32 ` Matthew Brost
2026-02-28 5:12 ` Raag Jadav
2026-03-03 5:29 ` Raag Jadav
2026-02-27 17:00 ` [PATCH v2 9/9] drm/xe/pci: Introduce PCIe FLR Raag Jadav
2026-02-27 17:49 ` Vivi, Rodrigo
2026-02-28 5:24 ` Raag Jadav
2026-03-02 16:58 ` Rodrigo Vivi
2026-03-02 19:37 ` Laguna, Lukasz
2026-02-27 17:50 ` [PATCH v2 0/9] Introduce Xe " Vivi, Rodrigo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aaJ4-gjgZBj0V7BA@black.igk.intel.com \
--to=raag.jadav@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=maarten@lankhorst.se \
--cc=matthew.auld@intel.com \
--cc=matthew.brost@intel.com \
--cc=matthew.d.roper@intel.com \
--cc=michal.wajdeczko@intel.com \
--cc=michal.winiarski@intel.com \
--cc=riana.tauro@intel.com \
--cc=rodrigo.vivi@intel.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.