Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Brost <matthew.brost@intel.com>
To: Nitin Gote <nitin.r.gote@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <stuart.summers@intel.com>
Subject: Re: [PATCH] drm/xe: share USM BCS engine via root-tile helper
Date: Fri, 24 Oct 2025 08:00:07 -0700	[thread overview]
Message-ID: <aPuUd/NwtfvIVo1g@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <aPJ+JszqVdAOSvtf@lstrano-desk.jf.intel.com>

On Fri, Oct 17, 2025 at 10:34:30AM -0700, Matthew Brost wrote:
> On Fri, Oct 17, 2025 at 05:52:53PM +0530, Nitin Gote wrote:
> > Introduce optional root-tile USM BCS engine sharing controlled
> > by the device descriptor flag (info.use_root_usm_bcs) and a
> > debug module parameter (force_use_root_usm_bcs). Each GT records
> > its USM BCS instance and hw engine during hw_engine_init.
> > 
> > Add helper xe_usm_bcs_reserved_hwe() which, when sharing is
> > enabled, returns the root tile’s USM BCS engine; otherwise
> > it returns the local GT’s engine. Exec queue and migrate
> > initialization now use this helper, avoiding failed
> > instance lookups on tiles lacking lower-numbered BCS engines.
> > 
> > Signed-off-by: Nitin Gote <nitin.r.gote@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_device_types.h |  2 ++
> >  drivers/gpu/drm/xe/xe_exec_queue.c   | 12 ++++++++----
> >  drivers/gpu/drm/xe/xe_gt.c           | 27 +++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_gt.h           |  1 +
> >  drivers/gpu/drm/xe/xe_gt_types.h     |  2 ++
> >  drivers/gpu/drm/xe/xe_hw_engine.c    |  7 +++++--
> >  drivers/gpu/drm/xe/xe_migrate.c      | 13 +++++++++----
> >  drivers/gpu/drm/xe/xe_module.c       |  4 ++++
> >  drivers/gpu/drm/xe/xe_module.h       |  1 +
> >  drivers/gpu/drm/xe/xe_pci.c          |  1 +
> >  drivers/gpu/drm/xe/xe_pci_types.h    |  1 +
> >  11 files changed, 61 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > index 02c04ad7296e..8f3ea6c637b3 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -336,6 +336,8 @@ struct xe_device {
> >  		u8 skip_pcode:1;
> >  		/** @info.needs_shared_vf_gt_wq: needs shared GT WQ on VF */
> >  		u8 needs_shared_vf_gt_wq:1;
> > +		/** @info.use_root_usm_bcs: share single USM BCS from root tile */
> > +		u8 use_root_usm_bcs:1;
> >  	} info;
> >  
> >  	/** @wa_active: keep track of active workarounds */
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 90cbc95f8e2e..06c5585b01fc 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -348,10 +348,14 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
> >  
> >  	migrate_vm = xe_migrate_get_vm(tile->migrate);
> >  	if (xe->info.has_usm) {
> > -		struct xe_hw_engine *hwe = xe_gt_hw_engine(gt,
> > -							   XE_ENGINE_CLASS_COPY,
> > -							   gt->usm.reserved_bcs_instance,
> > -							   false);
> > +		struct xe_hw_engine *hwe = xe_usm_bcs_reserved_hwe(gt);
> 
> Can we just call single function here which figures out the HWE?
> 
> > +
> > +		if (!hwe) {
> > +			hwe = xe_gt_hw_engine(gt,
> > +					      XE_ENGINE_CLASS_COPY,
> > +					      gt->usm.reserved_bcs_instance,
> > +					      false);
> > +		}
> >  
> >  		if (!hwe) {
> >  			xe_vm_put(migrate_vm);
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index d8e94fb8b9bd..b84a2c38c4aa 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -49,6 +49,7 @@
> >  #include "xe_map.h"
> >  #include "xe_migrate.h"
> >  #include "xe_mmio.h"
> > +#include "xe_module.h"
> >  #include "xe_pat.h"
> >  #include "xe_pm.h"
> >  #include "xe_mocs.h"
> > @@ -519,6 +520,32 @@ static int gt_init_with_gt_forcewake(struct xe_gt *gt)
> >  	return err;
> >  }
> >  
> > +/**
> > + * xe_usm_bcs_reserved_hwe - select USM BCS engine for a GT
> > + * @gt: GT whose USM BCS engine is requested
> > + *
> > + * If root-tile sharing (info.use_root_usm_bcs or force_use_root_usm_bcs)
> > + * is enabled, returns the root tile's reserved USM BCS COPY engine pointer.
> > + * Otherwise returns this GT's own reserved USM BCS engine.
> > + *
> > + * Returns:
> > + * Pointer to xe_hw_engine or NULL if USM unsupported or engine not ready.
> > + */
> > +struct xe_hw_engine *xe_usm_bcs_reserved_hwe(struct xe_gt *gt)
> > +{
> > +	struct xe_device *xe = gt_to_xe(gt);
> > +
> > +	if (xe->info.use_root_usm_bcs ||
> > +	    xe_modparam.force_use_root_usm_bcs) {
> 
> Don't check xe_modparam inline, rather on device probe override
> xe->info.use_root_usm_bcs with the modparam.
> 
> > +		struct xe_tile *root = xe_device_get_root_tile(xe);
> > +
> > +		if (root && root->primary_gt && root->primary_gt->usm.reserved_bcs_hwe)
> > +			return root->primary_gt->usm.reserved_bcs_hwe;
> > +	}
> > +
> > +	return gt->usm.reserved_bcs_hwe;
> > +}
> > +
> >  static int gt_init_with_all_forcewake(struct xe_gt *gt)
> >  {
> >  	unsigned int fw_ref;
> > diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> > index 5df2ffe3ff83..2a896ef28fa0 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.h
> > +++ b/drivers/gpu/drm/xe/xe_gt.h
> > @@ -54,6 +54,7 @@ int xe_gt_resume(struct xe_gt *gt);
> >  void xe_gt_reset_async(struct xe_gt *gt);
> >  void xe_gt_sanitize(struct xe_gt *gt);
> >  int xe_gt_sanitize_freq(struct xe_gt *gt);
> > +struct xe_hw_engine *xe_usm_bcs_reserved_hwe(struct xe_gt *gt);
> >  
> >  /**
> >   * xe_gt_wait_for_reset - wait for gt's async reset to finalize.
> > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> > index 8b5f604d7883..3aca91686e0c 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > @@ -212,6 +212,8 @@ struct xe_gt {
> >  		 * operations (e.g. migrations, fixing page tables)
> >  		 */
> >  		u16 reserved_bcs_instance;
> > +		/** @usm.reserved_bcs_hwe: reserved BCS hardware engine used for USM */
> > +		struct xe_hw_engine *reserved_bcs_hwe;
> 
> I don't think you need this storage. On GTs which don't reserve BCS
> instance reserved_bcs_instance would just be clear.
> 
> >  		/** @usm.pf_wq: page fault work queue, unbound, high priority */
> >  		struct workqueue_struct *pf_wq;
> >  		/** @usm.acc_wq: access counter work queue, unbound, high priority */
> > diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> > index cba4375525c7..175407ae3ad8 100644
> > --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> > +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> > @@ -35,6 +35,7 @@
> >  #include "xe_lrc.h"
> >  #include "xe_macros.h"
> >  #include "xe_mmio.h"
> > +#include "xe_module.h"
> >  #include "xe_reg_sr.h"
> >  #include "xe_reg_whitelist.h"
> >  #include "xe_rtp.h"
> > @@ -633,9 +634,11 @@ static int hw_engine_init(struct xe_gt *gt, struct xe_hw_engine *hwe,
> >  			xe_hw_engine_enable_ring(hwe);
> >  	}
> >  
> > -	/* We reserve the highest BCS instance for USM */
> > -	if (xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY)
> > +	/* Record BCS instance for USM; keep highest instance seen */
> > +	if (xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY) {
> >  		gt->usm.reserved_bcs_instance = hwe->instance;
> > +		gt->usm.reserved_bcs_hwe = hwe;
> > +	}
> >  
> >  	/* Ensure IDLEDLY is lower than MAXCNT */
> >  	adjust_idledly(hwe);
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > index 3112c966c67d..e96fcfb13e24 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -450,10 +450,15 @@ int xe_migrate_init(struct xe_migrate *m)
> >  		goto err_out;
> >  
> >  	if (xe->info.has_usm) {
> > -		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
> > -							   XE_ENGINE_CLASS_COPY,
> > -							   primary_gt->usm.reserved_bcs_instance,
> > -							   false);
> > +		struct xe_hw_engine *hwe = xe_usm_bcs_reserved_hwe(primary_gt);
> > +
> > +		if (!hwe) {
> > +			hwe = xe_gt_hw_engine(primary_gt,
> > +					      XE_ENGINE_CLASS_COPY,
> > +					      primary_gt->usm.reserved_bcs_instance,
> > +					      false);
> > +		}
> > +
> 
> Here, we'd still create a xe_migrate instance per tile but each migrate
> instance would point to the same HWE but be different queues. Different
> queues means we will be context switching on the HWE and context
> switches are quite expensive. I think what we want to do is if
> use_root_usm_bcs is set, point all tiles the same xe_migrate instance.
> 

This is more complex than I initially thought—we’ll need a broader
structure and a rework of the bind pipeline. Currently, each bind queue
is tied to a single tile, with two TLB invalidation job queues (one per
GT on the tile) attached. If a single bind queue targets multiple tiles,
it must now issue invalidations on remote tiles as well.

We also need to decide whether we still want multiple instances of PPGTT
mirrored across tiles. A broader architectural discussion may be
necessary to clarify what we’re aiming to support and determine the best
path forward.

Matt

> Matt
> 
> >  		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);
> >  
> >  		if (!hwe || !logical_mask) {
> > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> > index d08338fc3bc1..9c3104c36897 100644
> > --- a/drivers/gpu/drm/xe/xe_module.c
> > +++ b/drivers/gpu/drm/xe/xe_module.c
> > @@ -80,6 +80,10 @@ MODULE_PARM_DESC(force_probe,
> >  		 "Force probe options for specified devices. See CONFIG_DRM_XE_FORCE_PROBE for details "
> >  		 "[default=" DEFAULT_FORCE_PROBE "])");
> >  
> > +module_param_named(force_use_root_usm_bcs, xe_modparam.force_use_root_usm_bcs, bool, 0400);
> > +MODULE_PARM_DESC(force_use_root_usm_bcs,
> > +		 "Force all tiles to share USM BCS from root tile (default: false, debug only)");
> > +
> >  #ifdef CONFIG_PCI_IOV
> >  module_param_named(max_vfs, xe_modparam.max_vfs, uint, 0400);
> >  MODULE_PARM_DESC(max_vfs,
> > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> > index 5a3bfea8b7b4..61332b0ecc18 100644
> > --- a/drivers/gpu/drm/xe/xe_module.h
> > +++ b/drivers/gpu/drm/xe/xe_module.h
> > @@ -23,6 +23,7 @@ struct xe_modparam {
> >  #endif
> >  	int wedged_mode;
> >  	u32 svm_notifier_size;
> > +	bool force_use_root_usm_bcs;
> >  };
> >  
> >  extern struct xe_modparam xe_modparam;
> > diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> > index 24a38904bb50..c7dffb21b18d 100644
> > --- a/drivers/gpu/drm/xe/xe_pci.c
> > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > @@ -638,6 +638,7 @@ static int xe_info_init_early(struct xe_device *xe,
> >  	xe->info.skip_pcode = desc->skip_pcode;
> >  	xe->info.needs_scratch = desc->needs_scratch;
> >  	xe->info.needs_shared_vf_gt_wq = desc->needs_shared_vf_gt_wq;
> > +	xe->info.use_root_usm_bcs = desc->use_root_usm_bcs;
> >  
> >  	xe->info.probe_display = IS_ENABLED(CONFIG_DRM_XE_DISPLAY) &&
> >  				 xe_modparam.probe_display &&
> > diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
> > index a4451bdc79fb..cbbb338ee580 100644
> > --- a/drivers/gpu/drm/xe/xe_pci_types.h
> > +++ b/drivers/gpu/drm/xe/xe_pci_types.h
> > @@ -53,6 +53,7 @@ struct xe_device_desc {
> >  	u8 skip_mtcfg:1;
> >  	u8 skip_pcode:1;
> >  	u8 needs_shared_vf_gt_wq:1;
> > +	u8 use_root_usm_bcs:1;
> >  };
> >  
> >  struct xe_graphics_desc {
> > -- 
> > 2.25.1
> > 

  reply	other threads:[~2025-10-24 15:00 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-17 12:22 [PATCH] drm/xe: share USM BCS engine via root-tile helper Nitin Gote
2025-10-17 12:44 ` ✓ CI.KUnit: success for " Patchwork
2025-10-17 13:24 ` ✓ Xe.CI.BAT: " Patchwork
2025-10-17 17:34 ` [PATCH] " Matthew Brost
2025-10-24 15:00   ` Matthew Brost [this message]
2025-10-18  9:50 ` ✗ Xe.CI.Full: failure for " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2025-10-31  5:14 [PATCH] " Nitin Gote

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aPuUd/NwtfvIVo1g@lstrano-desk.jf.intel.com \
    --to=matthew.brost@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=nitin.r.gote@intel.com \
    --cc=stuart.summers@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox