Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
To: John.C.Harrison@Intel.com, Intel-Xe@Lists.FreeDesktop.Org
Subject: Re: [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info
Date: Wed, 12 Jun 2024 00:49:47 +0200	[thread overview]
Message-ID: <5fc7f334-0f3d-4180-93fa-d40a1c98064c@intel.com> (raw)
In-Reply-To: <20240611012028.2305024-4-John.C.Harrison@Intel.com>



On 11.06.2024 03:20, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> Split the GuC log dump into a two stage snapshot and print mechanism.
> This allows the log to be captured at the point of an error (which may
> be in a resticted context) and then dump it out later (from a regular

typo

> context such as a worker function or a sysfs file handler).
> 
> Also add a bunch of other useful pieces of information that can help
> (or are fundamentally required!) to decode and parse the log.
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_guc_regs.h |   1 +
>  drivers/gpu/drm/xe/xe_guc_log.c       | 150 ++++++++++++++++++++------
>  drivers/gpu/drm/xe/xe_guc_log.h       |   6 ++
>  drivers/gpu/drm/xe/xe_guc_log_types.h |  29 +++++
>  4 files changed, 155 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> index a5fd14307f94..b27b73680c12 100644
> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> @@ -84,6 +84,7 @@
>  #define   HUC_LOADING_AGENT_GUC			REG_BIT(1)
>  #define   GUC_WOPCM_OFFSET_VALID		REG_BIT(0)
>  #define GUC_MAX_IDLE_COUNT			XE_REG(0xc3e4)
> +#define GUC_PMTIMESTAMP				XE_REG(0xc3e8)
>  
>  #define GUC_SEND_INTERRUPT			XE_REG(0xc4c8)
>  #define   GUC_SEND_TRIGGER			REG_BIT(0)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a35309926271..a35d3cc0c369 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -7,12 +7,20 @@
>  
>  #include <drm/drm_managed.h>
>  
> +#include "regs/xe_guc_regs.h"
>  #include "xe_bo.h"
>  #include "xe_gt.h"
>  #include "xe_gt_printk.h"
>  #include "xe_map.h"
> +#include "xe_mmio.h"
>  #include "xe_module.h"
>  
> +static struct xe_guc *
> +log_to_guc(struct xe_guc_log *log)

any reason why to split signature into two lines?

> +{
> +	return container_of(log, struct xe_guc, log);
> +}
> +
>  static struct xe_gt *
>  log_to_gt(struct xe_guc_log *log)
>  {
> @@ -108,62 +116,142 @@ static void xe_hexdump_blob(struct xe_device *xe, const void *blob, size_t size,
>  
>  #define GUC_LOG_CHUNK_SIZE	SZ_2M
>  
> -void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)

no kernel-doc

> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic)
>  {
> -	struct xe_device *xe = log_to_xe(log);
> -	size_t size, remain;
> -	void **copy;
> -	int num_chunks, i;
> +	struct xe_guc_log_snapshot *snapshot;
> +	size_t remain;
> +	int i;
>  
> -	xe_assert(xe, log->bo);
> +	snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
> +	if (!snapshot)
> +		return NULL;
>  
>  	/*
>  	 * NB: kmalloc has a hard limit well below the maximum GuC log buffer size.
>  	 * Also, can't use vmalloc as might be called from atomic context. So need
>  	 * to break the buffer up into smaller chunks that can be allocated.
>  	 */
> -	size = log->bo->size;
> -	num_chunks = (size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
> +	snapshot->size = log->bo->size;
> +	snapshot->num_chunks = (snapshot->size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
>  
> -	copy = kcalloc(num_chunks, sizeof(*copy), atomic ? GFP_ATOMIC : GFP_KERNEL);
> -	if (!copy) {
> -		drm_printf(p, "Failed to allocate array x%d", num_chunks);
> -		return;
> -	}
> +	snapshot->copy = kcalloc(snapshot->num_chunks, sizeof(*snapshot->copy),
> +				 atomic ? GFP_ATOMIC : GFP_KERNEL);
> +	if (!snapshot->copy)
> +		goto fail_snap;
>  
> -	remain = size;
> -	for (i = 0; i < num_chunks; i++) {
> +	remain = snapshot->size;
> +	for (i = 0; i < snapshot->num_chunks; i++) {
>  		size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>  
> -		copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> -		if (!copy[i]) {
> -			drm_printf(p, "Failed to allocate %ld at chunk %d of %d",
> -				   size, i, num_chunks);
> -			goto out;
> -		}
> +		snapshot->copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> +		if (!snapshot->copy[i])
> +			goto fail_copy;
>  		remain -= size;
>  	}
>  
> -	remain = size;
> -	for (i = 0; i < num_chunks; i++) {
> +	return snapshot;
> +
> +fail_copy:
> +	for (i = 0; i < snapshot->num_chunks; i++)
> +		kfree(snapshot->copy[i]);
> +	kfree(snapshot->copy);
> +fail_snap:
> +	kfree(snapshot);
> +	return NULL;
> +}
> +
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot)
> +{
> +	int i;
> +
> +	if (!snapshot)
> +		return;
> +
> +	if (!snapshot->copy) {
> +		for (i = 0; i < snapshot->num_chunks; i++)
> +			kfree(snapshot->copy[i]);
> +		kfree(snapshot->copy);
> +	}
> +
> +	kfree(snapshot);
> +}
> +
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic)
> +{
> +	struct xe_guc_log_snapshot *snapshot;
> +	struct xe_device *xe = log_to_xe(log);
> +	struct xe_guc *guc = log_to_guc(log);
> +	struct xe_gt *gt = log_to_gt(log);
> +	size_t remain;
> +	int i;
> +
> +	if (!log->bo) {
> +		xe_gt_err(gt, "GuC log not allocated!\n");
> +		return NULL;
> +	}
> +
> +	snapshot = xe_guc_log_snapshot_alloc(log, atomic);
> +	if (!snapshot) {
> +		xe_gt_err(gt, "GuC log snapshot not allocated!\n");
> +		return NULL;
> +	}
> +
> +	remain = snapshot->size;
> +	for (i = 0; i < snapshot->num_chunks; i++) {
>  		size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>  
> -		xe_map_memcpy_from(xe, copy[i], &log->bo->vmap, i * GUC_LOG_CHUNK_SIZE, size);
> +		xe_map_memcpy_from(xe, snapshot->copy[i], &log->bo->vmap,
> +				   i * GUC_LOG_CHUNK_SIZE, size);
>  		remain -= size;
>  	}
>  
> -	remain = size;
> -	for (i = 0; i < num_chunks; i++) {
> +	snapshot->ktime = ktime_get_boottime_ns();
> +	snapshot->stamp = xe_mmio_read32(gt, GUC_PMTIMESTAMP);
> +	snapshot->ref_clk = gt->info.reference_clock;
> +	snapshot->level = log->level;
> +	snapshot->ver_found = guc->fw.versions.found[XE_UC_FW_VER_RELEASE];
> +	snapshot->ver_want = guc->fw.versions.wanted;
> +	snapshot->path = guc->fw.path;

shouldn't we use kstrdup() instead ?

> +
> +	return snapshot;
> +}
> +
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> +			       struct drm_printer *p, bool atomic)
> +{
> +	size_t remain;
> +	int i;
> +
> +	if (!snapshot) {
> +		drm_printf(p, "GuC log snapshot not allocated!\n");
> +		return;
> +	}
> +
> +	drm_printf(p, "GuC version %u.%u.%u (wanted %u.%u.%u)\n",
> +		   snapshot->ver_found.major, snapshot->ver_found.minor, snapshot->ver_found.patch,
> +		   snapshot->ver_want.major, snapshot->ver_want.minor, snapshot->ver_want.patch);
> +	drm_printf(p, "GuC firmware: %s\n", snapshot->path);
> +	drm_printf(p, "Kernel timestamp: 0x%08llX [%llu]\n", snapshot->ktime, snapshot->ktime);
> +	drm_printf(p, "GuC timestamp: 0x%08X [%u]\n", snapshot->stamp, snapshot->stamp);
> +	drm_printf(p, "CS timestamp frequency: %u Hz\n", snapshot->ref_clk);
> +	drm_printf(p, "Log level: %u\n", snapshot->level);
> +
> +	remain = snapshot->size;
> +	for (i = 0; i < snapshot->num_chunks; i++) {
>  		size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>  
> -		xe_hexdump_blob(xe, copy[i], size, p, atomic);
> +		xe_hexdump_blob(xe, snapshot->copy[i], size, p, atomic);

we really should get rid of xe in xe_hexdump_blob

>  		remain -= size;
>  	}
> +}
> +
> +void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)
> +{
> +	struct xe_guc_log_snapshot *snapshot;
>  
> -out:
> -	for (i = 0; i < num_chunks; i++)
> -		kfree(copy[i]);
> -	kfree(copy);
> +	snapshot = xe_guc_log_snapshot_capture(log, atomic);
> +	xe_guc_log_snapshot_print(log_to_xe(log), snapshot, p, atomic);
> +	xe_guc_log_snapshot_free(snapshot);
>  }
>  
>  int xe_guc_log_init(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
> index 5149b492c3b8..6e4d0b369c19 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
> @@ -9,6 +9,7 @@
>  #include "xe_guc_log_types.h"
>  
>  struct drm_printer;
> +struct xe_device;
>  
>  #if IS_ENABLED(CONFIG_DRM_XE_LARGE_GUC_BUFFER)
>  #define CRASH_BUFFER_SIZE       SZ_1M
> @@ -38,6 +39,11 @@ struct drm_printer;
>  
>  int xe_guc_log_init(struct xe_guc_log *log);
>  void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic);
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> +			       struct drm_printer *p, bool atomic);
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot);
>  
>  static inline u32
>  xe_guc_log_get_level(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
> index 125080d138a7..e3a6a44c2f18 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
> @@ -8,8 +8,37 @@
>  
>  #include <linux/types.h>
>  
> +#include "xe_uc_fw_types.h"
> +
>  struct xe_bo;
>  
> +/**
> + * struct xe_guc_log_snapshot:
> + * Capture of the GuC log plus various state useful for decoding the log
> + */
> +struct xe_guc_log_snapshot {
> +	/** @size: Size in bytes of the @copy allocation */
> +	size_t size;
> +	/** @copy: Host memory copy of the log buffer for later dumping, split into chunks */
> +	void **copy;
> +	/** @num_chunks: Number of chunks withint @copy */

typo

> +	int num_chunks;
> +	/** @ktime: Kernel time the snapshot was taken */
> +	u64 ktime;
> +	/** @stamp: GuC timestamp at which the snapshot was taken */
> +	u32 stamp;
> +	/** @ref_clk: GuC timestamp frequency */
> +	u32 ref_clk;
> +	/** @level: GuC log verbosity level */
> +	u32 level;
> +	/** @ver_found: GuC firmware version */
> +	struct xe_uc_fw_version ver_found;

didn't you mention that platform and/or revision is also mandatory to
correctly decode GuC log ?

> +	/** @ver_want: GuC firmware version that driver expected */
> +	struct xe_uc_fw_version ver_want;

I'm still not convinced that it's relevant to the actual GuC log what
driver wanted to use - this should be part of the GuC snapshot instead

> +	/** @path: Path of GuC firmware blob */
> +	const char *path;

ditto

> +};
> +
>  /**
>   * struct xe_guc_log - GuC log
>   */

  reply	other threads:[~2024-06-11 22:49 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-11  1:20 [PATCH v4 0/7] drm/xe/guc: Improve quality and robustness of GuC log dumping John.C.Harrison
2024-06-11  1:20 ` [PATCH v4 1/7] drm/xe/guc: Remove spurious line feed in debug print John.C.Harrison
2024-06-11  1:20 ` [PATCH v4 2/7] drm/xe/guc: Copy GuC log prior to dumping John.C.Harrison
2024-06-11 22:30   ` Michal Wajdeczko
2024-06-12 23:36     ` John Harrison
2024-06-11  1:20 ` [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info John.C.Harrison
2024-06-11 22:49   ` Michal Wajdeczko [this message]
2024-06-12 23:52     ` John Harrison
2024-06-11  1:20 ` [PATCH v4 4/7] drm/print: Introduce drm_line_printer John.C.Harrison
2024-06-11  1:20 ` [PATCH v4 5/7] drm/xe/guc: Add a helper function for dumping GuC log to dmesg John.C.Harrison
2024-06-11  1:20 ` [PATCH v4 6/7] drm/xe/guc: Dead CT helper John.C.Harrison
2024-06-11 23:20   ` Michal Wajdeczko
2024-06-13  0:43     ` John Harrison
2024-06-11  1:20 ` [PATCH v4 7/7] drm/xe/guc: Dump entire CTB on errors John.C.Harrison
2024-06-11  1:25 ` ✓ CI.Patch_applied: success for drm/xe/guc: Improve quality and robustness of GuC log dumping (rev2) Patchwork
2024-06-11  1:25 ` ✗ CI.checkpatch: warning " Patchwork
2024-06-11  1:26 ` ✓ CI.KUnit: success " Patchwork
2024-06-11  1:38 ` ✓ CI.Build: " Patchwork
2024-06-11  1:40 ` ✗ CI.Hooks: failure " Patchwork
2024-06-11  1:41 ` ✗ CI.checksparse: warning " Patchwork
2024-06-11  2:31 ` ✗ CI.BAT: failure " Patchwork
2024-06-11  3:53 ` ✗ CI.FULL: " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2024-06-10 22:56 [PATCH v4 0/7] drm/xe/guc: Improve quality and robustness of GuC log dumping John.C.Harrison
2024-06-10 22:56 ` [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info John.C.Harrison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5fc7f334-0f3d-4180-93fa-d40a1c98064c@intel.com \
    --to=michal.wajdeczko@intel.com \
    --cc=Intel-Xe@Lists.FreeDesktop.Org \
    --cc=John.C.Harrison@Intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox