Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Adam Miszczak <adam.miszczak@linux.intel.com>
To: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>,
	intel-xe@lists.freedesktop.org
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>,
	Matthew Brost <matthew.brost@intel.com>,
	Tomasz Lis <tomasz.lis@intel.com>
Subject: Re: [PATCH v7 4/4] drm/xe/vf: Add debugfs entries to test VF double migration
Date: Mon, 1 Dec 2025 07:04:57 +0100	[thread overview]
Message-ID: <5fe9573b-3de7-4605-b4f5-6ab9dc3ed0bb@linux.intel.com> (raw)
In-Reply-To: <20251128133052.17120-10-satyanarayana.k.v.p@intel.com>

On 11/28/2025 2:30 PM, Satyanarayana K V P wrote:
> VF migration sends a marker to the GUC before resource fixups begin,
> and repeats the marker with the RESFIX_DONE notification. This prevents
> the GUC from submitting jobs during double migration events.
>
> To reliably test double migration, a second migration must be triggered
> while fixups from the first migration are still in progress. Since fixups
> complete quickly, reproducing this scenario is difficult. Introduce
> debugfs controls to add delays in the post-fixup phase, creating a
> deterministic window for subsequent migrations.
>
> New debugfs entries:
> 	/sys/kernel/debug/dri/BDF/
> 	├── tile0
> 	│   ├─gt0
> 	│   │ ├──vf
> 	│   │ │  ├── resfix_stoppers
>
> resfix_stoppers: Predefined checkpoints that allow the migration process
> to pause at specific stages. The stages are given below.
>
> VF_MIGRATION_WAIT_RESFIX_START		- BIT(0)
> VF_MIGRATION_WAIT_FIXUPS		- BIT(1)
> VF_MIGRATION_WAIT_RESTART_JOBS		- BIT(2)
> VF_MIGRATION_WAIT_RESFIX_DONE		- BIT(3)
>
> Each state will pause with a 1-second delay per iteration, continuing until
> its corresponding bit is cleared.
>
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Tomasz Lis <tomasz.lis@intel.com>
>
> ---
> V6 -> V7:
> - Fixed review comments (Michal W).
> - Updated commit message.
>
> V5 -> V6:
> - Fixed review comments (Michal W).
> - Removed timeout; VF KMD now waits indefinitely while resfix_stoppers bits
> are set.
> - Created helper macro for WAIT positions.
>
> V4 -> V5:
> - Updated debugfs entries (Michal W).
>
> V3 -> V4:
> - New commit
>
> V2 -> V3:
> - None.
>
> V1 -> V2:
> - None.
> ---
>   drivers/gpu/drm/xe/xe_gt_sriov_vf.c         | 40 +++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c | 12 +++++++
>   drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h   |  8 +++++
>   3 files changed, 60 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index 937554657440..75c5c6ad0b75 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -5,6 +5,7 @@
>   
>   #include <linux/bitfield.h>
>   #include <linux/bsearch.h>
> +#include <linux/delay.h>
>   
>   #include <drm/drm_managed.h>
>   #include <drm/drm_print.h>
> @@ -41,6 +42,37 @@
>   
>   #define make_u64_from_u32(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))
>   
> +#ifdef CONFIG_DRM_XE_DEBUG
> +enum VF_MIGRATION_WAIT_POINTS {
> +	VF_MIGRATION_WAIT_RESFIX_START	= BIT(0),
> +	VF_MIGRATION_WAIT_FIXUPS	= BIT(1),
> +	VF_MIGRATION_WAIT_RESTART_JOBS	= BIT(2),
> +	VF_MIGRATION_WAIT_RESFIX_DONE	= BIT(3),
> +};
> +
> +#define VF_MIGRATION_WAIT_DELAY_IN_MS	1000
> +static void vf_post_migration_inject_wait(struct xe_gt *gt,
> +					  enum VF_MIGRATION_WAIT_POINTS wait)
> +{
> +	while (gt->sriov.vf.migration.debug.resfix_stoppers & wait) {
> +		xe_gt_dbg(gt,
> +			  "*TESTING* injecting %u ms delay due to resfix_stoppers=%#x, to continue clear %#x\n",
> +			  VF_MIGRATION_WAIT_DELAY_IN_MS,
> +			  gt->sriov.vf.migration.debug.resfix_stoppers, wait);
> +
> +		msleep(VF_MIGRATION_WAIT_DELAY_IN_MS);
> +	}
> +}
> +
> +#define VF_MIGRATION_INJECT_WAIT(gt, _POS) ({					\
> +	struct xe_gt *__gt = (gt);						\
> +	vf_post_migration_inject_wait(__gt, VF_MIGRATION_WAIT_##_POS);		\
> +	})
> +
> +#else
> +#define VF_MIGRATION_INJECT_WAIT(_gt, _POS)	typecheck(struct xe_gt *, (_gt))
> +#endif
> +
>   static int guc_action_vf_reset(struct xe_guc *guc)
>   {
>   	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
> @@ -320,6 +352,8 @@ static int vf_resfix_start(struct xe_gt *gt, u16 marker)
>   
>   	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
>   
> +	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);
> +
>   	xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);
>   
>   	return guc_action_vf_resfix_start(guc, marker);
> @@ -1158,6 +1192,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
>   	void *buf = gt->sriov.vf.migration.scratch;
>   	int err;
>   
> +	VF_MIGRATION_INJECT_WAIT(gt, FIXUPS);
> +
>   	/* xe_gt_sriov_vf_query_config will fixup the GGTT addresses */
>   	err = xe_gt_sriov_vf_query_config(gt);
>   	if (err)
> @@ -1176,6 +1212,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
>   
>   static void vf_post_migration_rearm(struct xe_gt *gt)
>   {
> +	VF_MIGRATION_INJECT_WAIT(gt, RESTART_JOBS);
> +
>   	xe_guc_ct_restart(&gt->uc.guc.ct);
>   	xe_guc_submit_unpause_prepare_vf(&gt->uc.guc);
>   }
> @@ -1199,6 +1237,8 @@ static void vf_post_migration_abort(struct xe_gt *gt)
>   
>   static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
>   {
> +	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_DONE);
> +
>   	spin_lock_irq(&gt->sriov.vf.migration.lock);
>   	if (gt->sriov.vf.migration.recovery_queued)
>   		xe_gt_sriov_dbg(gt, "another recovery imminent\n");
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> index 2ed5b6780d30..507718326e1f 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> @@ -69,4 +69,16 @@ void xe_gt_sriov_vf_debugfs_register(struct xe_gt *gt, struct dentry *root)
>   	vfdentry->d_inode->i_private = gt;
>   
>   	drm_debugfs_create_files(vf_info, ARRAY_SIZE(vf_info), vfdentry, minor);
> +
> +	/*
> +	 *      /sys/kernel/debug/dri/BDF/
> +	 *      ├── tile0
> +	 *          ├── gt0
> +	 *              ├── vf
> +	 *                  ├── resfix_stoppers
> +	 */
> +	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
> +		debugfs_create_x8("resfix_stoppers", 0600, vfdentry,
> +				  &gt->sriov.vf.migration.debug.resfix_stoppers);
> +	}
>   }
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> index db2f8b3ed3e9..510c33116fbd 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> @@ -52,6 +52,14 @@ struct xe_gt_sriov_vf_migration {
>   	wait_queue_head_t wq;
>   	/** @scratch: Scratch memory for VF recovery */
>   	void *scratch;
> +	/** @debug: Debug hooks for delaying migration */
> +	struct {
> +		/**
> +		 * @debug.resfix_stoppers: Stop and wait at different stages
> +		 * during post migration recovery
> +		 */
> +		u8 resfix_stoppers;
> +	} debug;
>   	/**
>   	 * @resfix_marker: Marker sent on start and on end of post-migration
>   	 * steps.
Solution preliminarily tested; this approach to debug hooks works for me.
Acked-by: Adam Miszczak <adam.miszczak@linux.intel.com>

  parent reply	other threads:[~2025-12-01  6:05 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-28 13:30 [PATCH v7 0/4] VF double migration Satyanarayana K V P
2025-11-28 13:30 ` [PATCH v7 1/4] drm/xe/vf: Enable VF migration only on supported GuC versions Satyanarayana K V P
2025-11-28 14:29   ` Michal Wajdeczko
2025-11-28 13:30 ` [PATCH v7 2/4] drm/xe/vf: Introduce RESFIX start marker support Satyanarayana K V P
2025-11-29 20:01   ` Michal Wajdeczko
2025-12-01  9:26     ` K V P, Satyanarayana
2025-11-28 13:30 ` [PATCH v7 3/4] drm/xe/vf: Requeue recovery on GuC MIGRATION error during VF post-migration Satyanarayana K V P
2025-11-29 20:27   ` Michal Wajdeczko
2025-11-28 13:30 ` [PATCH v7 4/4] drm/xe/vf: Add debugfs entries to test VF double migration Satyanarayana K V P
2025-11-29 21:07   ` Michal Wajdeczko
2025-12-01  6:04   ` Adam Miszczak [this message]
2025-11-28 14:21 ` ✗ CI.checkpatch: warning for VF double migration (rev7) Patchwork
2025-11-28 14:22 ` ✓ CI.KUnit: success " Patchwork
2025-11-28 15:33 ` ✓ Xe.CI.BAT: " Patchwork
2025-11-28 16:50 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5fe9573b-3de7-4605-b4f5-6ab9dc3ed0bb@linux.intel.com \
    --to=adam.miszczak@linux.intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=satyanarayana.k.v.p@intel.com \
    --cc=tomasz.lis@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox