From: Michal Wajdeczko <michal.wajdeczko@intel.com>
To: Tomasz Lis <tomasz.lis@intel.com>, intel-xe@lists.freedesktop.org
Cc: "Michał Winiarski" <michal.winiarski@intel.com>,
"Piotr Piórkowski" <piotr.piorkowski@intel.com>,
"Matthew Brost" <matthew.brost@intel.com>,
"Lucas De Marchi" <lucas.demarchi@intel.com>
Subject: Re: [PATCH v7 1/4] drm/xe/vf: Divide GGTT ballooning into allocation and insertion
Date: Tue, 8 Apr 2025 13:59:56 +0200 [thread overview]
Message-ID: <df3d130a-8d4c-443d-99ba-92d79eda14d5@intel.com> (raw)
In-Reply-To: <20250403184055.2317409-2-tomasz.lis@intel.com>
On 03.04.2025 20:40, Tomasz Lis wrote:
> The balloon nodes, which are used to fill areas of GGTT inaccessible
> for a specific VF, were allocated and inserted into GGTT within one
> function. To be able to re-use that insertion code during VF
> migration recovery, we need to split it.
>
> This patch separates allocation (init/fini functs) from the insertion
> of balloons (balloon/deballoon functs). Locks are also moved to ensure
> calls from post-migration recovery worker will not cause a deadlock.
>
> v2: Moved declarations to proper header
> v3: Rephrased description, introduced "_locked" versions of some
> functs, more lockdep checks, some functions renamed, altered error
> handling, added missing kerneldocs.
>
> Signed-off-by: Tomasz Lis <tomasz.lis@intel.com>
> ---
> drivers/gpu/drm/xe/xe_ggtt.c | 11 +--
> drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 102 +++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_gt_sriov_vf.h | 2 +
> 3 files changed, 82 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 5fcb2b4c2c13..769a8dc9be6e 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -447,14 +447,13 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> xe_tile_assert(ggtt->tile, IS_ALIGNED(start, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, IS_ALIGNED(end, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, !drm_mm_node_allocated(&node->base));
> + lockdep_assert_held(&ggtt->lock);
Since holding the lock is now a prerequisite, this function should be renamed to:
xe_ggtt_node_insert_balloon_locked()
likely with an update to the kerneldoc like:
"To be used in cases where ggtt->lock is already taken."
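For clarity, the split being suggested here (a `_locked` worker plus an unlocked wrapper) can be sketched in plain C; `pthread_mutex` stands in for `ggtt->lock`, and all names below are illustrative, not the actual xe API:

```c
/* Sketch of the locked/unlocked split suggested above.
 * pthread_mutex stands in for ggtt->lock; all names are illustrative. */
#include <pthread.h>

static pthread_mutex_t ggtt_lock = PTHREAD_MUTEX_INITIALIZER;
static int balloon_count;

/* "_locked" variant: caller must already hold ggtt_lock.
 * Kernel code would document this and add lockdep_assert_held(). */
static int insert_balloon_locked(void)
{
	balloon_count++;
	return 0;
}

/* Unlocked wrapper: takes the lock, delegates to the _locked variant. */
static int insert_balloon(void)
{
	int err;

	pthread_mutex_lock(&ggtt_lock);
	err = insert_balloon_locked();
	pthread_mutex_unlock(&ggtt_lock);

	return err;
}
```

The `_locked` suffix then signals the caller's locking obligation at every call site, which is what the rename asks for.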
>
> node->base.color = 0;
> node->base.start = start;
> node->base.size = end - start;
>
> - mutex_lock(&ggtt->lock);
> err = drm_mm_reserve_node(&ggtt->mm, &node->base);
> - mutex_unlock(&ggtt->lock);
>
> if (xe_gt_WARN(ggtt->tile->primary_gt, err,
> "Failed to balloon GGTT %#llx-%#llx (%pe)\n",
> @@ -477,16 +476,12 @@ void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node)
Same as in the earlier comment, rename this function to:
xe_ggtt_node_remove_balloon_locked()
> return;
>
> if (!drm_mm_node_allocated(&node->base))
> - goto free_node;
> + return;
>
> + lockdep_assert_held(&node->ggtt->lock);
This assert should be placed earlier in the function, since per the API contract the lock must always be held, not only when the node is allocated.
> xe_ggtt_dump_node(node->ggtt, &node->base, "remove-balloon");
>
> - mutex_lock(&node->ggtt->lock);
> drm_mm_remove_node(&node->base);
> - mutex_unlock(&node->ggtt->lock);
> -
> -free_node:
> - xe_ggtt_node_fini(node);
> }
>
> /**
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index a439261bf4d7..c3ca33725161 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -560,35 +560,38 @@ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt)
> return gt->sriov.vf.self_config.lmem_size;
> }
>
> -static struct xe_ggtt_node *
> -vf_balloon_ggtt_node(struct xe_ggtt *ggtt, u64 start, u64 end)
> +static int vf_init_ggtt_balloons(struct xe_gt *gt)
> {
> - struct xe_ggtt_node *node;
> - int err;
> + struct xe_tile *tile = gt_to_tile(gt);
> + struct xe_ggtt *ggtt = tile->mem.ggtt;
>
> - node = xe_ggtt_node_init(ggtt);
> - if (IS_ERR(node))
> - return node;
> + tile->sriov.vf.ggtt_balloon[0] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
>
> - err = xe_ggtt_node_insert_balloon(node, start, end);
> - if (err) {
> - xe_ggtt_node_fini(node);
> - return ERR_PTR(err);
> - }
> + tile->sriov.vf.ggtt_balloon[1] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[1]))
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
What about ggtt_balloon[0]? No need to fini() it on this error path?
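A hedged sketch of the unwind being asked about, with malloc()/free() standing in for xe_ggtt_node_init()/xe_ggtt_node_fini() (names and shapes illustrative only):

```c
#include <stdlib.h>

/* Two-slot init with unwind: if the second allocation fails, the first
 * must be released before the error is returned, otherwise it leaks. */
static int init_balloons(void *slots[2])
{
	slots[0] = malloc(16);
	if (!slots[0])
		return -1;

	slots[1] = malloc(16);
	if (!slots[1]) {
		free(slots[0]);   /* the missing fini() the comment points at */
		slots[0] = NULL;
		return -1;
	}

	return 0;
}
```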
>
> - return node;
> + return 0;
> }
>
> -static int vf_balloon_ggtt(struct xe_gt *gt)
> +/**
> + * xe_gt_sriov_vf_balloon_ggtt_locked - Insert balloon nodes to limit used GGTT address range.
> + * @gt: the &xe_gt struct instance
> + * Return: 0 on success or a negative error code on failure.
> + */
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt)
> {
> 	struct xe_gt_sriov_vf_selfconfig *config = &gt->sriov.vf.self_config;
> struct xe_tile *tile = gt_to_tile(gt);
> - struct xe_ggtt *ggtt = tile->mem.ggtt;
> struct xe_device *xe = gt_to_xe(gt);
> u64 start, end;
> + int err;
>
> xe_gt_assert(gt, IS_SRIOV_VF(xe));
> xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> + lockdep_assert_held(&tile->mem.ggtt->lock);
>
> if (!config->ggtt_size)
> return -ENODATA;
> @@ -611,33 +614,76 @@ static int vf_balloon_ggtt(struct xe_gt *gt)
> start = xe_wopcm_size(xe);
> end = config->ggtt_base;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[0] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
> + err = xe_ggtt_node_insert_balloon(tile->sriov.vf.ggtt_balloon[0], start, end);
> + if (err)
> + return err;
> }
>
> start = config->ggtt_base + config->ggtt_size;
> end = GUC_GGTT_TOP;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[1] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> + err = xe_ggtt_node_insert_balloon(tile->sriov.vf.ggtt_balloon[1], start, end);
> + if (err) {
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
> + return err;
> }
> }
>
> return 0;
> }
>
> -static void deballoon_ggtt(struct drm_device *drm, void *arg)
> +static int vf_balloon_ggtt(struct xe_gt *gt)
> {
> - struct xe_tile *tile = arg;
> + struct xe_ggtt *ggtt = gt_to_tile(gt)->mem.ggtt;
> + int err;
> +
> + mutex_lock(&ggtt->lock);
> + err = xe_gt_sriov_vf_balloon_ggtt_locked(gt);
> + mutex_unlock(&ggtt->lock);
> +
> + return err;
> +}
> +
> +/**
> + * xe_gt_sriov_vf_deballoon_ggtt_locked - Remove balloon nodes which limited used address range.
> + * @gt: the &xe_gt struct instance
> + */
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
>
> xe_tile_assert(tile, IS_SRIOV_VF(tile_to_xe(tile)));
> + lockdep_assert_held(&tile->mem.ggtt->lock);
nit: IMO this is redundant, as the lock is already asserted in
xe_ggtt_node_remove_balloon_locked()
> +
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[1]);
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> }
>
> +static void vf_deballoon_ggtt(struct xe_gt *gt)
Hmm, in this patch you don't really need to split the (de)balloon logic into locked/unlocked parts, so maybe keep it as it was and introduce such a split when really needed.
Also, it's quite unusual that the unlocked part is named in a completely different fashion than the locked one; can't it be (later) defined as the pairs:
xe_gt_sriov_vf_deballoon_ggtt_locked()
xe_gt_sriov_vf_deballoon_ggtt()
and
xe_gt_sriov_vf_balloon_ggtt_locked()
xe_gt_sriov_vf_balloon_ggtt()
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
> +
> + mutex_lock(&tile->mem.ggtt->lock);
> + xe_gt_sriov_vf_deballoon_ggtt_locked(gt);
> + mutex_unlock(&tile->mem.ggtt->lock);
> +}
> +
> +static void vf_balloon_fini(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
missing asserts:
xe_gt_assert(gt, IS_SRIOV_VF(xe));
xe_gt_assert(gt, !xe_gt_is_media_type(gt));
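A minimal sketch of front-loading such precondition checks, with plain assert() standing in for xe_gt_assert() (everything here is illustrative, not the xe API):

```c
#include <assert.h>
#include <stddef.h>

struct node { int allocated; };

/* Preconditions are asserted unconditionally at entry, before any
 * work or early return, so every caller is validated on every path. */
static void balloon_fini(struct node *nodes[2], int is_vf)
{
	assert(is_vf);         /* stand-in for xe_gt_assert(gt, IS_SRIOV_VF(xe)) */
	assert(nodes != NULL);

	if (nodes[1])
		nodes[1]->allocated = 0;
	if (nodes[0])
		nodes[0]->allocated = 0;
}
```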
> +
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[1]);
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void deballoon_and_fini_ggtt(struct drm_device *drm, void *arg)
> +{
> + struct xe_tile *tile = arg;
> +
> + vf_deballoon_ggtt(tile->primary_gt);
> + vf_balloon_fini(tile->primary_gt);
> +}
> +
> /**
> * xe_gt_sriov_vf_prepare_ggtt - Prepare a VF's GGTT configuration.
> * @gt: the &xe_gt
> @@ -655,11 +701,17 @@ int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt)
> if (xe_gt_is_media_type(gt))
> return 0;
>
> - err = vf_balloon_ggtt(gt);
> + err = vf_init_ggtt_balloons(gt);
> if (err)
> return err;
>
> - return drmm_add_action_or_reset(&xe->drm, deballoon_ggtt, tile);
> + err = vf_balloon_ggtt(gt);
> + if (err) {
> + vf_balloon_fini(gt);
> + return err;
> + }
> +
> + return drmm_add_action_or_reset(&xe->drm, deballoon_and_fini_ggtt, tile);
> }
>
> static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> index ba6c5d74e326..d717deb8af91 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> @@ -18,6 +18,8 @@ int xe_gt_sriov_vf_query_config(struct xe_gt *gt);
> int xe_gt_sriov_vf_connect(struct xe_gt *gt);
> int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt);
> int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt);
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt);
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt);
> int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt);
> void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt);
>