From: Michal Wajdeczko <michal.wajdeczko@intel.com>
To: John.C.Harrison@Intel.com, Intel-Xe@Lists.FreeDesktop.Org
Subject: Re: [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info
Date: Wed, 12 Jun 2024 00:49:47 +0200 [thread overview]
Message-ID: <5fc7f334-0f3d-4180-93fa-d40a1c98064c@intel.com> (raw)
In-Reply-To: <20240611012028.2305024-4-John.C.Harrison@Intel.com>
On 11.06.2024 03:20, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> Split the GuC log dump into a two stage snapshot and print mechanism.
> This allows the log to be captured at the point of an error (which may
> be in a resticted context) and then dump it out later (from a regular
typo
> context such as a worker function or a sysfs file handler).
>
> Also add a bunch of other useful pieces of information that can help
> (or are fundamentally required!) to decode and parse the log.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
> drivers/gpu/drm/xe/regs/xe_guc_regs.h | 1 +
> drivers/gpu/drm/xe/xe_guc_log.c | 150 ++++++++++++++++++++------
> drivers/gpu/drm/xe/xe_guc_log.h | 6 ++
> drivers/gpu/drm/xe/xe_guc_log_types.h | 29 +++++
> 4 files changed, 155 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> index a5fd14307f94..b27b73680c12 100644
> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> @@ -84,6 +84,7 @@
> #define HUC_LOADING_AGENT_GUC REG_BIT(1)
> #define GUC_WOPCM_OFFSET_VALID REG_BIT(0)
> #define GUC_MAX_IDLE_COUNT XE_REG(0xc3e4)
> +#define GUC_PMTIMESTAMP XE_REG(0xc3e8)
>
> #define GUC_SEND_INTERRUPT XE_REG(0xc4c8)
> #define GUC_SEND_TRIGGER REG_BIT(0)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a35309926271..a35d3cc0c369 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -7,12 +7,20 @@
>
> #include <drm/drm_managed.h>
>
> +#include "regs/xe_guc_regs.h"
> #include "xe_bo.h"
> #include "xe_gt.h"
> #include "xe_gt_printk.h"
> #include "xe_map.h"
> +#include "xe_mmio.h"
> #include "xe_module.h"
>
> +static struct xe_guc *
> +log_to_guc(struct xe_guc_log *log)
any reason why to split signature into two lines?
> +{
> + return container_of(log, struct xe_guc, log);
> +}
> +
> static struct xe_gt *
> log_to_gt(struct xe_guc_log *log)
> {
> @@ -108,62 +116,142 @@ static void xe_hexdump_blob(struct xe_device *xe, const void *blob, size_t size,
>
> #define GUC_LOG_CHUNK_SIZE SZ_2M
>
> -void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)
no kernel-doc
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic)
> {
> - struct xe_device *xe = log_to_xe(log);
> - size_t size, remain;
> - void **copy;
> - int num_chunks, i;
> + struct xe_guc_log_snapshot *snapshot;
> + size_t remain;
> + int i;
>
> - xe_assert(xe, log->bo);
> + snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot)
> + return NULL;
>
> /*
> * NB: kmalloc has a hard limit well below the maximum GuC log buffer size.
> * Also, can't use vmalloc as might be called from atomic context. So need
> * to break the buffer up into smaller chunks that can be allocated.
> */
> - size = log->bo->size;
> - num_chunks = (size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
> + snapshot->size = log->bo->size;
> + snapshot->num_chunks = (snapshot->size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
>
> - copy = kcalloc(num_chunks, sizeof(*copy), atomic ? GFP_ATOMIC : GFP_KERNEL);
> - if (!copy) {
> - drm_printf(p, "Failed to allocate array x%d", num_chunks);
> - return;
> - }
> + snapshot->copy = kcalloc(snapshot->num_chunks, sizeof(*snapshot->copy),
> + atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot->copy)
> + goto fail_snap;
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> - if (!copy[i]) {
> - drm_printf(p, "Failed to allocate %ld at chunk %d of %d",
> - size, i, num_chunks);
> - goto out;
> - }
> + snapshot->copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot->copy[i])
> + goto fail_copy;
> remain -= size;
> }
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + return snapshot;
> +
> +fail_copy:
> + for (i = 0; i < snapshot->num_chunks; i++)
> + kfree(snapshot->copy[i]);
> + kfree(snapshot->copy);
> +fail_snap:
> + kfree(snapshot);
> + return NULL;
> +}
> +
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot)
> +{
> + int i;
> +
> + if (!snapshot)
> + return;
> +
> + if (!snapshot->copy) {
> + for (i = 0; i < snapshot->num_chunks; i++)
> + kfree(snapshot->copy[i]);
> + kfree(snapshot->copy);
> + }
> +
> + kfree(snapshot);
> +}
> +
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic)
> +{
> + struct xe_guc_log_snapshot *snapshot;
> + struct xe_device *xe = log_to_xe(log);
> + struct xe_guc *guc = log_to_guc(log);
> + struct xe_gt *gt = log_to_gt(log);
> + size_t remain;
> + int i;
> +
> + if (!log->bo) {
> + xe_gt_err(gt, "GuC log not allocated!\n");
> + return NULL;
> + }
> +
> + snapshot = xe_guc_log_snapshot_alloc(log, atomic);
> + if (!snapshot) {
> + xe_gt_err(gt, "GuC log snapshot not allocated!\n");
> + return NULL;
> + }
> +
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - xe_map_memcpy_from(xe, copy[i], &log->bo->vmap, i * GUC_LOG_CHUNK_SIZE, size);
> + xe_map_memcpy_from(xe, snapshot->copy[i], &log->bo->vmap,
> + i * GUC_LOG_CHUNK_SIZE, size);
> remain -= size;
> }
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + snapshot->ktime = ktime_get_boottime_ns();
> + snapshot->stamp = xe_mmio_read32(gt, GUC_PMTIMESTAMP);
> + snapshot->ref_clk = gt->info.reference_clock;
> + snapshot->level = log->level;
> + snapshot->ver_found = guc->fw.versions.found[XE_UC_FW_VER_RELEASE];
> + snapshot->ver_want = guc->fw.versions.wanted;
> + snapshot->path = guc->fw.path;
shouldn't we use kstrdup() instead ?
> +
> + return snapshot;
> +}
> +
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> + struct drm_printer *p, bool atomic)
> +{
> + size_t remain;
> + int i;
> +
> + if (!snapshot) {
> + drm_printf(p, "GuC log snapshot not allocated!\n");
> + return;
> + }
> +
> + drm_printf(p, "GuC version %u.%u.%u (wanted %u.%u.%u)\n",
> + snapshot->ver_found.major, snapshot->ver_found.minor, snapshot->ver_found.patch,
> + snapshot->ver_want.major, snapshot->ver_want.minor, snapshot->ver_want.patch);
> + drm_printf(p, "GuC firmware: %s\n", snapshot->path);
> + drm_printf(p, "Kernel timestamp: 0x%08llX [%llu]\n", snapshot->ktime, snapshot->ktime);
> + drm_printf(p, "GuC timestamp: 0x%08X [%u]\n", snapshot->stamp, snapshot->stamp);
> + drm_printf(p, "CS timestamp frequency: %u Hz\n", snapshot->ref_clk);
> + drm_printf(p, "Log level: %u\n", snapshot->level);
> +
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - xe_hexdump_blob(xe, copy[i], size, p, atomic);
> + xe_hexdump_blob(xe, snapshot->copy[i], size, p, atomic);
we really should get rid of xe in xe_hexdump_blob
> remain -= size;
> }
> +}
> +
> +void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)
> +{
> + struct xe_guc_log_snapshot *snapshot;
>
> -out:
> - for (i = 0; i < num_chunks; i++)
> - kfree(copy[i]);
> - kfree(copy);
> + snapshot = xe_guc_log_snapshot_capture(log, atomic);
> + xe_guc_log_snapshot_print(log_to_xe(log), snapshot, p, atomic);
> + xe_guc_log_snapshot_free(snapshot);
> }
>
> int xe_guc_log_init(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
> index 5149b492c3b8..6e4d0b369c19 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
> @@ -9,6 +9,7 @@
> #include "xe_guc_log_types.h"
>
> struct drm_printer;
> +struct xe_device;
>
> #if IS_ENABLED(CONFIG_DRM_XE_LARGE_GUC_BUFFER)
> #define CRASH_BUFFER_SIZE SZ_1M
> @@ -38,6 +39,11 @@ struct drm_printer;
>
> int xe_guc_log_init(struct xe_guc_log *log);
> void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic);
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> + struct drm_printer *p, bool atomic);
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot);
>
> static inline u32
> xe_guc_log_get_level(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
> index 125080d138a7..e3a6a44c2f18 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
> @@ -8,8 +8,37 @@
>
> #include <linux/types.h>
>
> +#include "xe_uc_fw_types.h"
> +
> struct xe_bo;
>
> +/**
> + * struct xe_guc_log_snapshot:
> + * Capture of the GuC log plus various state useful for decoding the log
> + */
> +struct xe_guc_log_snapshot {
> + /** @size: Size in bytes of the @copy allocation */
> + size_t size;
> + /** @copy: Host memory copy of the log buffer for later dumping, split into chunks */
> + void **copy;
> + /** @num_chunks: Number of chunks withint @copy */
typo
> + int num_chunks;
> + /** @ktime: Kernel time the snapshot was taken */
> + u64 ktime;
> + /** @stamp: GuC timestamp at which the snapshot was taken */
> + u32 stamp;
> + /** @ref_clk: GuC timestamp frequency */
> + u32 ref_clk;
> + /** @level: GuC log verbosity level */
> + u32 level;
> + /** @ver_found: GuC firmware version */
> + struct xe_uc_fw_version ver_found;
didn't you mention that platform and/or revision is also mandatory to
correctly decode GuC log ?
> + /** @ver_want: GuC firmware version that driver expected */
> + struct xe_uc_fw_version ver_want;
I'm still not convinced that it's relevant to the actual GuC log what
driver wanted to use - this should be part of the GuC snapshot instead
> + /** @path: Path of GuC firmware blob */
> + const char *path;
ditto
> +};
> +
> /**
> * struct xe_guc_log - GuC log
> */
next prev parent reply other threads:[~2024-06-11 22:49 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-11 1:20 [PATCH v4 0/7] drm/xe/guc: Improve quality and robustness of GuC log dumping John.C.Harrison
2024-06-11 1:20 ` [PATCH v4 1/7] drm/xe/guc: Remove spurious line feed in debug print John.C.Harrison
2024-06-11 1:20 ` [PATCH v4 2/7] drm/xe/guc: Copy GuC log prior to dumping John.C.Harrison
2024-06-11 22:30 ` Michal Wajdeczko
2024-06-12 23:36 ` John Harrison
2024-06-11 1:20 ` [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info John.C.Harrison
2024-06-11 22:49 ` Michal Wajdeczko [this message]
2024-06-12 23:52 ` John Harrison
2024-06-11 1:20 ` [PATCH v4 4/7] drm/print: Introduce drm_line_printer John.C.Harrison
2024-06-11 1:20 ` [PATCH v4 5/7] drm/xe/guc: Add a helper function for dumping GuC log to dmesg John.C.Harrison
2024-06-11 1:20 ` [PATCH v4 6/7] drm/xe/guc: Dead CT helper John.C.Harrison
2024-06-11 23:20 ` Michal Wajdeczko
2024-06-13 0:43 ` John Harrison
2024-06-11 1:20 ` [PATCH v4 7/7] drm/xe/guc: Dump entire CTB on errors John.C.Harrison
2024-06-11 1:25 ` ✓ CI.Patch_applied: success for drm/xe/guc: Improve quality and robustness of GuC log dumping (rev2) Patchwork
2024-06-11 1:25 ` ✗ CI.checkpatch: warning " Patchwork
2024-06-11 1:26 ` ✓ CI.KUnit: success " Patchwork
2024-06-11 1:38 ` ✓ CI.Build: " Patchwork
2024-06-11 1:40 ` ✗ CI.Hooks: failure " Patchwork
2024-06-11 1:41 ` ✗ CI.checksparse: warning " Patchwork
2024-06-11 2:31 ` ✗ CI.BAT: failure " Patchwork
2024-06-11 3:53 ` ✗ CI.FULL: " Patchwork
-- strict thread matches above, loose matches on Subject: below --
2024-06-10 22:56 [PATCH v4 0/7] drm/xe/guc: Improve quality and robustness of GuC log dumping John.C.Harrison
2024-06-10 22:56 ` [PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info John.C.Harrison
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5fc7f334-0f3d-4180-93fa-d40a1c98064c@intel.com \
--to=michal.wajdeczko@intel.com \
--cc=Intel-Xe@Lists.FreeDesktop.Org \
--cc=John.C.Harrison@Intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox