Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Teres Alexis, Alan Previn" <alan.previn.teres.alexis@intel.com>
To: "intel-xe@lists.freedesktop.org" <intel-xe@lists.freedesktop.org>
Cc: "dri-devel@lists.freedesktop.org"
	<dri-devel@lists.freedesktop.org>,
	"Harrison, John C" <john.c.harrison@intel.com>,
	"Brost, Matthew" <matthew.brost@intel.com>,
	"Dong, Zhanjun" <zhanjun.dong@intel.com>,
	"Ceraolo Spurio, Daniele" <daniele.ceraolospurio@intel.com>,
	"Vivi, Rodrigo" <rodrigo.vivi@intel.com>
Subject: Re: [PATCH v6 2/6] drm/xe/guc: Don't store capture nodes in xe_devcoredump_snapshot
Date: Thu, 30 Jan 2025 17:57:56 +0000	[thread overview]
Message-ID: <18a003982875f1d613344a5982ce99037ef2c450.camel@intel.com> (raw)
In-Reply-To: <20250128183653.4027915-3-alan.previn.teres.alexis@intel.com>

On Tue, 2025-01-28 at 10:36 -0800, Teres Alexis, Alan Previn wrote:
> GuC-Err-Capture should not be storing register snapshot
> nodes directly inside of the top level xe_devcoredump_snapshot
> structure that it doesn't control. Furthermore, that is
> not right from a driver subsystem layering perspective.
> 
> 
alan:snip

> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index a99e3160724b..26006d72904f 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -25,6 +25,7 @@
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
>  #include "xe_guc_capture.h"
> +#include "xe_guc_capture_snapshot_types.h"
>  #include "xe_hw_engine_group.h"
>  #include "xe_hw_fence.h"
>  #include "xe_irq.h"
> @@ -867,22 +868,20 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_exec_queue *q)
>                 return snapshot;
>  
>         if (q) {
> -               /* If got guc capture, set source to GuC */
> -               node = xe_guc_capture_get_matching_and_lock(q);
> -               if (node) {
> -                       struct xe_device *xe = gt_to_xe(hwe->gt);
> -                       struct xe_devcoredump *coredump = &xe->devcoredump;
> -
> -                       coredump->snapshot.matched_node = node;
> -                       xe_gt_dbg(hwe->gt, "Found and locked GuC-err-capture node");
> -                       return snapshot;
> +               /* First, retrieve the manual GuC-Error-Capture node if it exists */
> +               node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_MANUAL);
> +               /* Find preferred node type sourced from firmware if available */
> +               snapshot->matched_node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_GUC);
> +               if (!snapshot->matched_node) {
> +                       xe_gt_dbg(hwe->gt, "No fw sourced GuC-Err-Capture for queue %s", q->name);
> +                       snapshot->matched_node = node;
> +               } else if (node) {
> +                       xe_guc_capture_put_matched_nodes(&hwe->gt->uc.guc, node);
>                 }
> +               if (!snapshot->matched_node)
> +                       xe_gt_warn(hwe->gt, "Can't retrieve any GuC-Err-Capture node");
alan: a couple of the CI full-test failures were caused by this. It turns out that
we have other code paths that can attempt to generate an xe_devcoredump without being triggered
by a timed-out-job event. John Harrison gave feedback that such cases are still valid, so this should
be an xe_gt_dbg, not an xe_gt_warn. Additionally, we agreed that there is value in reporting
such cases in the dump file. So, as opposed to just "GuC source" vs "Manual source" engine dumps,
we could add additional differentiation: GuC-src vs Manual-early vs Manual-late.

Will add that in next rev.


>         }
>  
> -       /* otherwise, do manual capture */
> -       xe_engine_manual_capture(hwe, snapshot);
> -       xe_gt_dbg(hwe->gt, "Proceeding with manual engine snapshot");
> -
>         return snapshot;
>  }
>  
> @@ -900,12 +899,7 @@ void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot)
>                 return;
>  
>         gt = snapshot->hwe->gt;
> -       /*
> -        * xe_guc_capture_put_matched_nodes is called here and from
> -        * xe_devcoredump_snapshot_free, to cover the 2 calling paths
> -        * of hw_engines - debugfs and devcoredump free.
> -        */
> -       xe_guc_capture_put_matched_nodes(&gt->uc.guc);
> +       xe_guc_capture_put_matched_nodes(&gt->uc.guc, snapshot->matched_node);
>  
>         kfree(snapshot->name);
>         kfree(snapshot);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index de69e2628f2f..de1f82c11bcf 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -152,6 +152,7 @@ struct xe_hw_engine {
>         struct xe_hw_engine_group *hw_engine_group;
>  };
>  
> +struct xe_guc_capture_snapshot;
>  /**
>   * struct xe_hw_engine_snapshot - Hardware engine snapshot
>   *
> @@ -175,6 +176,13 @@ struct xe_hw_engine_snapshot {
>         u32 mmio_base;
>         /** @kernel_reserved: Engine reserved, can't be used by userspace */
>         bool kernel_reserved;
> +       /**
> +        * @matched_node: GuC Capture snapshot:
> +        * The matched capture node for the timedout job
> +        * this single-node tracker works because devcoredump will always only
> +        * produce one hw-engine capture per devcoredump event
> +        */
> +       struct xe_guc_capture_snapshot *matched_node;
>  };
>  
>  #endif


  reply	other threads:[~2025-01-30 17:58 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-28 18:36 [PATCH v6 0/6] Maintenence of devcoredump <-> GuC-Err-Capture plumbing Alan Previn
2025-01-28 18:36 ` [PATCH v6 1/6] drm/xe/guc: Rename __guc_capture_parsed_output Alan Previn
2025-01-30 22:37   ` Rodrigo Vivi
2025-01-31 18:44     ` Teres Alexis, Alan Previn
2025-02-10 19:01   ` Dong, Zhanjun
2025-01-28 18:36 ` [PATCH v6 2/6] drm/xe/guc: Don't store capture nodes in xe_devcoredump_snapshot Alan Previn
2025-01-30 17:57   ` Teres Alexis, Alan Previn [this message]
2025-02-10 23:41   ` Dong, Zhanjun
2025-02-12 19:25     ` Teres Alexis, Alan Previn
2025-01-28 18:36 ` [PATCH v6 3/6] drm/xe/guc: Split engine state print between xe_hw_engine vs xe_guc_capture Alan Previn
2025-01-30 22:42   ` Rodrigo Vivi
2025-01-31 18:55     ` Teres Alexis, Alan Previn
2025-02-10 18:45       ` Teres Alexis, Alan Previn
2025-01-28 18:36 ` [PATCH v6 4/6] drm/xe/guc: Move xe_hw_engine_snapshot creation back to xe_hw_engine.c Alan Previn
2025-01-30 22:43   ` Rodrigo Vivi
2025-01-31 18:56     ` Teres Alexis, Alan Previn
2025-01-28 18:36 ` [PATCH v6 5/6] drm/xe/xe_hw_engine: Update hw_engine_snapshot_capture for debugfs Alan Previn
2025-01-28 20:45   ` kernel test robot
2025-01-28 18:36 ` [PATCH v6 6/6] drm/xe/guc: Update comments on GuC-Err-Capture flows Alan Previn
2025-01-28 21:19 ` ✓ CI.Patch_applied: success for Maintenence of devcoredump <-> GuC-Err-Capture plumbing Patchwork
2025-01-28 21:21 ` ✗ CI.checkpatch: warning " Patchwork
2025-01-28 21:22 ` ✓ CI.KUnit: success " Patchwork
2025-01-28 21:38 ` ✓ CI.Build: " Patchwork
2025-01-28 21:40 ` ✗ CI.Hooks: failure " Patchwork
2025-01-28 21:41 ` ✓ CI.checksparse: success " Patchwork
2025-01-28 22:01 ` ✓ Xe.CI.BAT: " Patchwork
2025-01-29 12:54 ` ✗ Xe.CI.Full: failure " Patchwork
2025-01-30 17:13   ` Teres Alexis, Alan Previn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=18a003982875f1d613344a5982ce99037ef2c450.camel@intel.com \
    --to=alan.previn.teres.alexis@intel.com \
    --cc=daniele.ceraolospurio@intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=john.c.harrison@intel.com \
    --cc=matthew.brost@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=zhanjun.dong@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox