intel-xe.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
To: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>,
	<intel-xe@lists.freedesktop.org>
Cc: Matthew Brost <matthew.brost@intel.com>,
	Tomasz Lis <tomasz.lis@intel.com>
Subject: Re: [PATCH v5 3/3] drm/xe/vf: Add debugfs entries to test VF double migration
Date: Thu, 20 Nov 2025 22:53:02 +0100	[thread overview]
Message-ID: <49e4f27e-a921-458b-ac2e-d8e38d715de3@intel.com> (raw)
In-Reply-To: <20251120145157.6995-8-satyanarayana.k.v.p@intel.com>



On 11/20/2025 3:52 PM, Satyanarayana K V P wrote:
> VF migration sends a marker to the GUC before resource fixups begin,
> and repeats the marker with the RESFIX_DONE notification. This prevents
> the GUC from submitting jobs during double migration events.
> 
> To reliably test double migration, a second migration must be triggered
> while fixups from the first migration are still in progress. Since fixups
> complete quickly, reproducing this scenario is difficult. Introduce
> debugfs controls to add delays in the post-fixup phase, creating a
> deterministic window for subsequent migrations.
> 
> New debugfs entries:
> 	/sys/kernel/debug/dri/<card>/
> 	├── gt0
> 	│   ├── vf
> 	│   │   ├── resfix_stoppers
> 
> - resfix_stoppers: Predefined checkpoints that allow the migration process
> to pause at specific stages. The stages are given below.
> 
> VF_MIGRATION_WAIT_BEFORE_RESFIX_START	- BIT(0)
> VF_MIGRATION_WAIT_BEFORE_FIXUPS		- BIT(1)
> VF_MIGRATION_WAIT_BEFORE_RESTART_JOBS	- BIT(2)
> VF_MIGRATION_WAIT_BEFORE_RESFIX_DONE	- BIT(3)
> 
> Each state will pause with a 1-second delay per iteration, continuing until
> its corresponding bit is cleared or a maximum of 10 iterations is reached.

a fixed 10s limit might still not always be sufficient for the test

if you're afraid of locking up the VF driver due to a test bug that forgets to clear the stop-bit,
then maybe we should look for some other solution ...

> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Tomasz Lis <tomasz.lis@intel.com>
> 
> ---
> V4 -> V5:
> - Updated debugfs entries (Michal W).
> 
> V3 -> V4:
> - New commit
> 
> V2 -> V3:
> - None.
> 
> V1 -> V2:
> - None.
> ---
>  drivers/gpu/drm/xe/xe_gt_sriov_vf.c         | 37 +++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c |  5 +++
>  drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h   |  8 +++++
>  3 files changed, 50 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index 1cd3d8204c15..0f25091f4949 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -5,6 +5,7 @@
>  
>  #include <linux/bitfield.h>
>  #include <linux/bsearch.h>
> +#include <linux/delay.h>
>  
>  #include <drm/drm_managed.h>
>  #include <drm/drm_print.h>
> @@ -1234,6 +1235,34 @@ static int vf_post_migration_notify_resfix_done(struct xe_gt *gt, u16 marker)
>  	return vf_notify_resfix_done(gt, marker);
>  }
>  
> +#define VF_MIGRATION_WAIT_NO_WAIT			0
> +#define VF_MIGRATION_WAIT_BEFORE_RESFIX_START		BIT(0)
> +#define VF_MIGRATION_WAIT_BEFORE_FIXUPS		BIT(1)
> +#define VF_MIGRATION_WAIT_BEFORE_RESTART_JOBS		BIT(2)
> +#define VF_MIGRATION_WAIT_BEFORE_RESFIX_DONE		BIT(3)
> +
> +#define VF_MIGRATION_WAIT_DELAY_PER_ITER_IN_MS		1000
> +#define VF_MIGRATION_WAIT_DELAY_MAX_ITERS		10
> +
> +#ifdef CONFIG_DRM_XE_DEBUG
> +static inline void vf_post_migration_inject_wait(struct xe_gt *gt, int wait)
> +{
> +	uint delay_ms = VF_MIGRATION_WAIT_DELAY_PER_ITER_IN_MS;
> +	int max_delay_iter = VF_MIGRATION_WAIT_DELAY_MAX_ITERS;
> +
> +	while ((gt->sriov.vf.migration.debug.resfix_stoppers &  wait) &&
> +	       max_delay_iter--) {
> +		xe_gt_dbg(gt,
> +			  "*TESTING* injecting delay resfix_stoppers = 0x%x, remaining iters = %d\n",
> +			  gt->sriov.vf.migration.debug.resfix_stoppers, max_delay_iter);
> +
> +		msleep(delay_ms);
> +	}
> +}
> +#else
> +static inline void vf_post_migration_inject_wait(struct xe_gt *gt, int wait) { }
> +#endif

we don't use "inline" in .c files

> +
>  /*
>   * Increment the startup marker again if it overflows, since GUC
>   * requires a non-zero marker to be set.
> @@ -1270,18 +1299,26 @@ static void vf_post_migration_recovery(struct xe_gt *gt)
>  		goto fail;
>  	}
>  
> +	vf_post_migration_inject_wait(gt, VF_MIGRATION_WAIT_BEFORE_RESFIX_START);

can we code that in a nicer way:

	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);

#define VF_MIGRATION_INJECT_WAIT(_gt, _HERE) \
	vf_post_migration_inject_wait((_gt), VF_MIGRATION_WAIT_BEFORE_##_HERE)

> +
>  	marker = vf_post_migration_resfix_start_marker(gt);
>  
>  	err = vf_notify_resfix_start(gt, marker);
>  	if (err)
>  		goto fail;
>  
> +	vf_post_migration_inject_wait(gt, VF_MIGRATION_WAIT_BEFORE_FIXUPS);
> +
>  	err = vf_post_migration_fixups(gt);
>  	if (err)
>  		goto fail;
>  
> +	vf_post_migration_inject_wait(gt, VF_MIGRATION_WAIT_BEFORE_RESTART_JOBS);
> +
>  	vf_post_migration_rearm(gt);
>  
> +	vf_post_migration_inject_wait(gt, VF_MIGRATION_WAIT_BEFORE_RESFIX_DONE);
> +
>  	err = vf_post_migration_notify_resfix_done(gt, marker);
>  	if (err && err != -EAGAIN)
>  		goto fail;
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> index 2ed5b6780d30..66d5ffd0e371 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_debugfs.c
> @@ -69,4 +69,9 @@ void xe_gt_sriov_vf_debugfs_register(struct xe_gt *gt, struct dentry *root)
>  	vfdentry->d_inode->i_private = gt;
>  
>  	drm_debugfs_create_files(vf_info, ARRAY_SIZE(vf_info), vfdentry, minor);
> +
> +	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
> +		debugfs_create_x8("resfix_stoppers", 0600, vfdentry,
> +				  &gt->sriov.vf.migration.debug.resfix_stoppers);
> +	}
>  }
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> index 66c0062a42c6..3ece00f74e46 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h
> @@ -52,6 +52,14 @@ struct xe_gt_sriov_vf_migration {
>  	wait_queue_head_t wq;
>  	/** @scratch: Scratch memory for VF recovery */
>  	void *scratch;
> +	/** @debug: Debug hooks for delaying migration */
> +	struct {
> +		/**
> +		 * @debug.resfix_stoppers: Stop and wait at different stages
> +		 * during post migration recovery
> +		 */
> +		u8 resfix_stoppers;
> +	} debug;
>  	/**
>  	 * @resfix_marker: Marker sent on start and on end of post-migration
>  	 * steps.

as this is still quite an invasive change, can you post an IGT patch that uses these stop-points to prove it works as expected?


  reply	other threads:[~2025-11-20 21:53 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-20 14:51 [PATCH v5 0/3] VF double migration Satyanarayana K V P
2025-11-20 14:51 ` [PATCH v5 1/3] drm/xe/vf: Enable VF migration only on supported GuC versions Satyanarayana K V P
2025-11-20 21:18   ` Michal Wajdeczko
2025-11-20 14:52 ` [PATCH v5 2/3] drm/xe/vf: Introduce RESFIX start marker support Satyanarayana K V P
2025-11-20 21:37   ` Michal Wajdeczko
2025-11-20 14:52 ` [PATCH v5 3/3] drm/xe/vf: Add debugfs entries to test VF double migration Satyanarayana K V P
2025-11-20 21:53   ` Michal Wajdeczko [this message]
2025-11-21  7:21     ` Matthew Brost
2025-11-20 16:01 ` ✓ CI.KUnit: success for VF double migration (rev5) Patchwork
2025-11-20 16:40 ` ✓ Xe.CI.BAT: " Patchwork
2025-11-20 21:10 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49e4f27e-a921-458b-ac2e-d8e38d715de3@intel.com \
    --to=michal.wajdeczko@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    --cc=satyanarayana.k.v.p@intel.com \
    --cc=tomasz.lis@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).