public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2] accel/amdxdna: Support getting last hardware error
@ 2025-10-14 23:41 Lizhi Hou
  2025-10-15 13:42 ` Mario Limonciello
  0 siblings, 1 reply; 3+ messages in thread
From: Lizhi Hou @ 2025-10-14 23:41 UTC (permalink / raw)
  To: ogabbay, quic_jhugo, maciej.falkowski, dri-devel
  Cc: Lizhi Hou, linux-kernel, max.zhen, sonal.santan,
	mario.limonciello

Add a new parameter, DRM_AMDXDNA_HW_LAST_ASYNC_ERR, to the get array IOCTL.
When hardware reports an error, the driver saves the error information and
timestamp. This new get array parameter retrieves the last error.

Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/aie2_error.c      | 95 ++++++++++++++++++++-----
 drivers/accel/amdxdna/aie2_pci.c        |  3 +
 drivers/accel/amdxdna/aie2_pci.h        |  5 +-
 drivers/accel/amdxdna/amdxdna_error.h   | 59 +++++++++++++++
 drivers/accel/amdxdna/amdxdna_pci_drv.c |  3 +-
 include/uapi/drm/amdxdna_accel.h        | 13 ++++
 6 files changed, 159 insertions(+), 19 deletions(-)
 create mode 100644 drivers/accel/amdxdna/amdxdna_error.h

diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
index 5ee905632a39..d452008ec4f4 100644
--- a/drivers/accel/amdxdna/aie2_error.c
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -13,6 +13,7 @@
 
 #include "aie2_msg_priv.h"
 #include "aie2_pci.h"
+#include "amdxdna_error.h"
 #include "amdxdna_mailbox.h"
 #include "amdxdna_pci_drv.h"
 
@@ -46,6 +47,7 @@ enum aie_module_type {
 	AIE_MEM_MOD = 0,
 	AIE_CORE_MOD,
 	AIE_PL_MOD,
+	AIE_UNKNOWN_MOD,
 };
 
 enum aie_error_category {
@@ -143,6 +145,31 @@ static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
 	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
 };
 
+static const enum amdxdna_error_num aie_cat_err_num_map[] = {
+	[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
+	[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
+	[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
+	[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
+	[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
+	[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
+	[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
+	[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
+	[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
+	[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
+	[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
+};
+
+static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);
+
+static const enum amdxdna_error_module aie_err_mod_map[] = {
+	[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
+	[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
+	[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
+	[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
+};
+
+static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
+
 static enum aie_error_category
 aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
 {
@@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
 		if (event_id != lut[i].event_id)
 			continue;
 
+		if (lut[i].category > AIE_ERROR_UNKNOWN)
+			return AIE_ERROR_UNKNOWN;
+
 		return lut[i].category;
 	}
 
 	return AIE_ERROR_UNKNOWN;
 }
 
+static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
+{
+	struct aie_error *errs = err_info;
+	enum amdxdna_error_module err_mod;
+	enum aie_error_category aie_err;
+	enum amdxdna_error_num err_num;
+	struct aie_error *last_err;
+
+	last_err = &errs[num_err - 1];
+	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
+		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
+		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
+	} else {
+		aie_err = aie_get_error_category(last_err->row,
+						 last_err->event_id,
+						 last_err->mod_type);
+		err_num = aie_cat_err_num_map[aie_err];
+		err_mod = aie_err_mod_map[last_err->mod_type];
+	}
+
+	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
+	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
+	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
+}
+
 static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
 {
 	struct aie_error *errs = err_info;
@@ -264,29 +319,14 @@ static void aie2_error_worker(struct work_struct *err_work)
 	}
 
 	mutex_lock(&xdna->dev_lock);
+	aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);
+
 	/* Re-sent this event to firmware */
 	if (aie2_error_event_send(e))
 		XDNA_WARN(xdna, "Unable to register async event");
 	mutex_unlock(&xdna->dev_lock);
 }
 
-int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
-{
-	struct amdxdna_dev *xdna = ndev->xdna;
-	struct async_event *e;
-	int i, ret;
-
-	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
-	for (i = 0; i < ndev->async_events->event_cnt; i++) {
-		e = &ndev->async_events->event[i];
-		ret = aie2_error_event_send(e);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
 {
 	struct amdxdna_dev *xdna = ndev->xdna;
@@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
 		e->size = ASYNC_BUF_SIZE;
 		e->resp.status = MAX_AIE2_STATUS_CODE;
 		INIT_WORK(&e->work, aie2_error_worker);
+
+		ret = aie2_error_event_send(e);
+		if (ret)
+			goto free_wq;
 	}
 
 	ndev->async_events = events;
@@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
 		 events->event_cnt, events->size);
 	return 0;
 
+free_wq:
+	destroy_workqueue(events->wq);
 free_buf:
 	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
 			     events->addr, DMA_FROM_DEVICE);
@@ -356,3 +402,18 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
 	kfree(events);
 	return ret;
 }
+
+int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+
+	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+
+	args->num_element = 1;
+	args->element_size = sizeof(ndev->last_async_err);
+	if (copy_to_user(u64_to_user_ptr(args->buffer),
+			 &ndev->last_async_err, args->element_size))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 8a66f276100e..cfca4e456b61 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -924,6 +924,9 @@ static int aie2_get_array(struct amdxdna_client *client,
 	case DRM_AMDXDNA_HW_CONTEXT_ALL:
 		ret = aie2_query_ctx_status_array(client, args);
 		break;
+	case DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
+		ret = aie2_get_array_async_error(xdna->dev_handle, args);
+		break;
 	default:
 		XDNA_ERR(xdna, "Not supported request parameter %u", args->param);
 		ret = -EOPNOTSUPP;
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 289a23ecd5f1..34bc35479f42 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -190,6 +190,8 @@ struct amdxdna_dev_hdl {
 
 	enum aie2_dev_status		dev_status;
 	u32				hwctx_num;
+
+	struct amdxdna_async_error	last_async_err;
 };
 
 #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
@@ -253,8 +255,9 @@ void aie2_psp_stop(struct psp_device *psp);
 /* aie2_error.c */
 int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
-int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
 int aie2_error_async_msg_thread(void *data);
+int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
+			       struct amdxdna_drm_get_array *args);
 
 /* aie2_message.c */
 int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
diff --git a/drivers/accel/amdxdna/amdxdna_error.h b/drivers/accel/amdxdna/amdxdna_error.h
new file mode 100644
index 000000000000..c51de86ec12b
--- /dev/null
+++ b/drivers/accel/amdxdna/amdxdna_error.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _AMDXDNA_ERROR_H_
+#define _AMDXDNA_ERROR_H_
+
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+
+#define AMDXDNA_ERR_DRV_AIE		4
+#define AMDXDNA_ERR_SEV_CRITICAL	3
+#define AMDXDNA_ERR_CLASS_AIE		2
+
+#define AMDXDNA_ERR_NUM_MASK		GENMASK_U64(15, 0)
+#define AMDXDNA_ERR_DRV_MASK		GENMASK_U64(23, 16)
+#define AMDXDNA_ERR_SEV_MASK		GENMASK_U64(31, 24)
+#define AMDXDNA_ERR_MOD_MASK		GENMASK_U64(39, 32)
+#define AMDXDNA_ERR_CLASS_MASK		GENMASK_U64(47, 40)
+
+enum amdxdna_error_num {
+	AMDXDNA_ERROR_NUM_AIE_SATURATION = 3,
+	AMDXDNA_ERROR_NUM_AIE_FP,
+	AMDXDNA_ERROR_NUM_AIE_STREAM,
+	AMDXDNA_ERROR_NUM_AIE_ACCESS,
+	AMDXDNA_ERROR_NUM_AIE_BUS,
+	AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
+	AMDXDNA_ERROR_NUM_AIE_ECC,
+	AMDXDNA_ERROR_NUM_AIE_LOCK,
+	AMDXDNA_ERROR_NUM_AIE_DMA,
+	AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
+	AMDXDNA_ERROR_NUM_UNKNOWN = 15,
+};
+
+enum amdxdna_error_module {
+	AMDXDNA_ERROR_MODULE_AIE_CORE = 3,
+	AMDXDNA_ERROR_MODULE_AIE_MEMORY,
+	AMDXDNA_ERROR_MODULE_AIE_SHIM,
+	AMDXDNA_ERROR_MODULE_AIE_NOC,
+	AMDXDNA_ERROR_MODULE_AIE_PL,
+	AMDXDNA_ERROR_MODULE_UNKNOWN = 8,
+};
+
+#define AMDXDNA_ERROR_ENCODE(err_num, err_mod)				\
+	(FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) |			\
+	 FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) |	\
+	 FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \
+	 FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) |			\
+	 FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE))
+
+#define AMDXDNA_EXTRA_ERR_COL_MASK	GENMASK_U64(7, 0)
+#define AMDXDNA_EXTRA_ERR_ROW_MASK	GENMASK_U64(15, 8)
+
+#define AMDXDNA_EXTRA_ERR_ENCODE(row, col)				\
+	(FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) |			\
+	 FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row))
+
+#endif /* _AMDXDNA_ERROR_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
index aa04452310e5..696fdac8ad3c 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -27,9 +27,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
 /*
  * 0.0: Initial version
  * 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY
+ * 0.2: Support getting last hardware error
  */
 #define AMDXDNA_DRIVER_MAJOR		0
-#define AMDXDNA_DRIVER_MINOR		1
+#define AMDXDNA_DRIVER_MINOR		2
 
 /*
  * Bind the driver base on (vendor_id, device_id) pair and later use the
diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
index a1fb9785db77..c7eec9ceb2ae 100644
--- a/include/uapi/drm/amdxdna_accel.h
+++ b/include/uapi/drm/amdxdna_accel.h
@@ -523,7 +523,20 @@ struct amdxdna_drm_hwctx_entry {
 	__u32 pad;
 };
 
+/**
+ * struct amdxdna_async_error - XDNA async error structure
+ */
+struct amdxdna_async_error {
+	/** @err_code: Error code. */
+	__u64 err_code;
+	/** @ts_us: Timestamp. */
+	__u64 ts_us;
+	/** @ex_err_code: Extra error code */
+	__u64 ex_err_code;
+};
+
 #define DRM_AMDXDNA_HW_CONTEXT_ALL	0
+#define DRM_AMDXDNA_HW_LAST_ASYNC_ERR	2
 
 /**
  * struct amdxdna_drm_get_array - Get information array.
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH V2] accel/amdxdna: Support getting last hardware error
  2025-10-14 23:41 [PATCH V2] accel/amdxdna: Support getting last hardware error Lizhi Hou
@ 2025-10-15 13:42 ` Mario Limonciello
  2025-10-16 16:37   ` Lizhi Hou
  0 siblings, 1 reply; 3+ messages in thread
From: Mario Limonciello @ 2025-10-15 13:42 UTC (permalink / raw)
  To: Lizhi Hou, ogabbay, quic_jhugo, maciej.falkowski, dri-devel
  Cc: linux-kernel, max.zhen, sonal.santan

On 10/14/25 6:41 PM, Lizhi Hou wrote:
> Add new parameter DRM_AMDXDNA_HW_LAST_ASYNC_ERR to get array IOCTL. When
> hardware reports an error, the driver save the error information and
> timestamp. This new get array parameter retrieves the last error.
> 
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>

> ---
>   drivers/accel/amdxdna/aie2_error.c      | 95 ++++++++++++++++++++-----
>   drivers/accel/amdxdna/aie2_pci.c        |  3 +
>   drivers/accel/amdxdna/aie2_pci.h        |  5 +-
>   drivers/accel/amdxdna/amdxdna_error.h   | 59 +++++++++++++++
>   drivers/accel/amdxdna/amdxdna_pci_drv.c |  3 +-
>   include/uapi/drm/amdxdna_accel.h        | 13 ++++
>   6 files changed, 159 insertions(+), 19 deletions(-)
>   create mode 100644 drivers/accel/amdxdna/amdxdna_error.h
> 
> diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
> index 5ee905632a39..d452008ec4f4 100644
> --- a/drivers/accel/amdxdna/aie2_error.c
> +++ b/drivers/accel/amdxdna/aie2_error.c
> @@ -13,6 +13,7 @@
>   
>   #include "aie2_msg_priv.h"
>   #include "aie2_pci.h"
> +#include "amdxdna_error.h"
>   #include "amdxdna_mailbox.h"
>   #include "amdxdna_pci_drv.h"
>   
> @@ -46,6 +47,7 @@ enum aie_module_type {
>   	AIE_MEM_MOD = 0,
>   	AIE_CORE_MOD,
>   	AIE_PL_MOD,
> +	AIE_UNKNOWN_MOD,
>   };
>   
>   enum aie_error_category {
> @@ -143,6 +145,31 @@ static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
>   	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
>   };
>   
> +static const enum amdxdna_error_num aie_cat_err_num_map[] = {
> +	[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
> +	[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
> +	[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
> +	[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
> +	[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
> +	[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
> +	[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
> +	[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
> +	[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
> +	[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
> +	[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
> +};
> +
> +static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);
> +
> +static const enum amdxdna_error_module aie_err_mod_map[] = {
> +	[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
> +	[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
> +	[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
> +	[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
> +};
> +
> +static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
> +
>   static enum aie_error_category
>   aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
>   {
> @@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
>   		if (event_id != lut[i].event_id)
>   			continue;
>   
> +		if (lut[i].category > AIE_ERROR_UNKNOWN)
> +			return AIE_ERROR_UNKNOWN;
> +
>   		return lut[i].category;
>   	}
>   
>   	return AIE_ERROR_UNKNOWN;
>   }
>   
> +static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
> +{
> +	struct aie_error *errs = err_info;
> +	enum amdxdna_error_module err_mod;
> +	enum aie_error_category aie_err;
> +	enum amdxdna_error_num err_num;
> +	struct aie_error *last_err;
> +
> +	last_err = &errs[num_err - 1];
> +	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
> +		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
> +		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
> +	} else {
> +		aie_err = aie_get_error_category(last_err->row,
> +						 last_err->event_id,
> +						 last_err->mod_type);
> +		err_num = aie_cat_err_num_map[aie_err];
> +		err_mod = aie_err_mod_map[last_err->mod_type];
> +	}
> +
> +	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
> +	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
> +	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
> +}
> +
>   static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
>   {
>   	struct aie_error *errs = err_info;
> @@ -264,29 +319,14 @@ static void aie2_error_worker(struct work_struct *err_work)
>   	}
>   
>   	mutex_lock(&xdna->dev_lock);
> +	aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);
> +
>   	/* Re-sent this event to firmware */
>   	if (aie2_error_event_send(e))
>   		XDNA_WARN(xdna, "Unable to register async event");
>   	mutex_unlock(&xdna->dev_lock);
>   }
>   
> -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
> -{
> -	struct amdxdna_dev *xdna = ndev->xdna;
> -	struct async_event *e;
> -	int i, ret;
> -
> -	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> -	for (i = 0; i < ndev->async_events->event_cnt; i++) {
> -		e = &ndev->async_events->event[i];
> -		ret = aie2_error_event_send(e);
> -		if (ret)
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
>   void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
>   {
>   	struct amdxdna_dev *xdna = ndev->xdna;
> @@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
>   		e->size = ASYNC_BUF_SIZE;
>   		e->resp.status = MAX_AIE2_STATUS_CODE;
>   		INIT_WORK(&e->work, aie2_error_worker);
> +
> +		ret = aie2_error_event_send(e);
> +		if (ret)
> +			goto free_wq;
>   	}
>   
>   	ndev->async_events = events;
> @@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
>   		 events->event_cnt, events->size);
>   	return 0;
>   
> +free_wq:
> +	destroy_workqueue(events->wq);
>   free_buf:
>   	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
>   			     events->addr, DMA_FROM_DEVICE);
> @@ -356,3 +402,18 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
>   	kfree(events);
>   	return ret;
>   }
> +
> +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
> +{
> +	struct amdxdna_dev *xdna = ndev->xdna;
> +
> +	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> +
> +	args->num_element = 1;
> +	args->element_size = sizeof(ndev->last_async_err);
> +	if (copy_to_user(u64_to_user_ptr(args->buffer),
> +			 &ndev->last_async_err, args->element_size))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index 8a66f276100e..cfca4e456b61 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -924,6 +924,9 @@ static int aie2_get_array(struct amdxdna_client *client,
>   	case DRM_AMDXDNA_HW_CONTEXT_ALL:
>   		ret = aie2_query_ctx_status_array(client, args);
>   		break;
> +	case DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
> +		ret = aie2_get_array_async_error(xdna->dev_handle, args);
> +		break;
>   	default:
>   		XDNA_ERR(xdna, "Not supported request parameter %u", args->param);
>   		ret = -EOPNOTSUPP;
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 289a23ecd5f1..34bc35479f42 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -190,6 +190,8 @@ struct amdxdna_dev_hdl {
>   
>   	enum aie2_dev_status		dev_status;
>   	u32				hwctx_num;
> +
> +	struct amdxdna_async_error	last_async_err;
>   };
>   
>   #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
> @@ -253,8 +255,9 @@ void aie2_psp_stop(struct psp_device *psp);
>   /* aie2_error.c */
>   int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
>   void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
> -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
>   int aie2_error_async_msg_thread(void *data);
> +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
> +			       struct amdxdna_drm_get_array *args);
>   
>   /* aie2_message.c */
>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
> diff --git a/drivers/accel/amdxdna/amdxdna_error.h b/drivers/accel/amdxdna/amdxdna_error.h
> new file mode 100644
> index 000000000000..c51de86ec12b
> --- /dev/null
> +++ b/drivers/accel/amdxdna/amdxdna_error.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2025, Advanced Micro Devices, Inc.
> + */
> +
> +#ifndef _AMDXDNA_ERROR_H_
> +#define _AMDXDNA_ERROR_H_
> +
> +#include <linux/bitfield.h>
> +#include <linux/bits.h>
> +
> +#define AMDXDNA_ERR_DRV_AIE		4
> +#define AMDXDNA_ERR_SEV_CRITICAL	3
> +#define AMDXDNA_ERR_CLASS_AIE		2
> +
> +#define AMDXDNA_ERR_NUM_MASK		GENMASK_U64(15, 0)
> +#define AMDXDNA_ERR_DRV_MASK		GENMASK_U64(23, 16)
> +#define AMDXDNA_ERR_SEV_MASK		GENMASK_U64(31, 24)
> +#define AMDXDNA_ERR_MOD_MASK		GENMASK_U64(39, 32)
> +#define AMDXDNA_ERR_CLASS_MASK		GENMASK_U64(47, 40)
> +
> +enum amdxdna_error_num {
> +	AMDXDNA_ERROR_NUM_AIE_SATURATION = 3,
> +	AMDXDNA_ERROR_NUM_AIE_FP,
> +	AMDXDNA_ERROR_NUM_AIE_STREAM,
> +	AMDXDNA_ERROR_NUM_AIE_ACCESS,
> +	AMDXDNA_ERROR_NUM_AIE_BUS,
> +	AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
> +	AMDXDNA_ERROR_NUM_AIE_ECC,
> +	AMDXDNA_ERROR_NUM_AIE_LOCK,
> +	AMDXDNA_ERROR_NUM_AIE_DMA,
> +	AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
> +	AMDXDNA_ERROR_NUM_UNKNOWN = 15,
> +};
> +
> +enum amdxdna_error_module {
> +	AMDXDNA_ERROR_MODULE_AIE_CORE = 3,
> +	AMDXDNA_ERROR_MODULE_AIE_MEMORY,
> +	AMDXDNA_ERROR_MODULE_AIE_SHIM,
> +	AMDXDNA_ERROR_MODULE_AIE_NOC,
> +	AMDXDNA_ERROR_MODULE_AIE_PL,
> +	AMDXDNA_ERROR_MODULE_UNKNOWN = 8,
> +};
> +
> +#define AMDXDNA_ERROR_ENCODE(err_num, err_mod)				\
> +	(FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) |			\
> +	 FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) |	\
> +	 FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \
> +	 FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) |			\
> +	 FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE))
> +
> +#define AMDXDNA_EXTRA_ERR_COL_MASK	GENMASK_U64(7, 0)
> +#define AMDXDNA_EXTRA_ERR_ROW_MASK	GENMASK_U64(15, 8)
> +
> +#define AMDXDNA_EXTRA_ERR_ENCODE(row, col)				\
> +	(FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) |			\
> +	 FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row))
> +
> +#endif /* _AMDXDNA_ERROR_H_ */
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> index aa04452310e5..696fdac8ad3c 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> @@ -27,9 +27,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
>   /*
>    * 0.0: Initial version
>    * 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY
> + * 0.2: Support getting last error hardware error
>    */
>   #define AMDXDNA_DRIVER_MAJOR		0
> -#define AMDXDNA_DRIVER_MINOR		1
> +#define AMDXDNA_DRIVER_MINOR		2
>   
>   /*
>    * Bind the driver base on (vendor_id, device_id) pair and later use the
> diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
> index a1fb9785db77..c7eec9ceb2ae 100644
> --- a/include/uapi/drm/amdxdna_accel.h
> +++ b/include/uapi/drm/amdxdna_accel.h
> @@ -523,7 +523,20 @@ struct amdxdna_drm_hwctx_entry {
>   	__u32 pad;
>   };
>   
> +/**
> + * struct amdxdna_async_error - XDNA async error structure
> + */
> +struct amdxdna_async_error {
> +	/** @err_code: Error code. */
> +	__u64 err_code;
> +	/** @ts_us: Timestamp. */
> +	__u64 ts_us;
> +	/** @ex_err_code: Extra error code */
> +	__u64 ex_err_code;
> +};
> +
>   #define DRM_AMDXDNA_HW_CONTEXT_ALL	0
> +#define DRM_AMDXDNA_HW_LAST_ASYNC_ERR	2
>   
>   /**
>    * struct amdxdna_drm_get_array - Get information array.


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH V2] accel/amdxdna: Support getting last hardware error
  2025-10-15 13:42 ` Mario Limonciello
@ 2025-10-16 16:37   ` Lizhi Hou
  0 siblings, 0 replies; 3+ messages in thread
From: Lizhi Hou @ 2025-10-16 16:37 UTC (permalink / raw)
  To: Mario Limonciello, ogabbay, quic_jhugo, maciej.falkowski,
	dri-devel
  Cc: linux-kernel, max.zhen, sonal.santan

Applied to drm-misc-next

On 10/15/25 06:42, Mario Limonciello wrote:
> On 10/14/25 6:41 PM, Lizhi Hou wrote:
>> Add new parameter DRM_AMDXDNA_HW_LAST_ASYNC_ERR to get array IOCTL. When
>> hardware reports an error, the driver save the error information and
>> timestamp. This new get array parameter retrieves the last error.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>
> Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
>
>> ---
>>   drivers/accel/amdxdna/aie2_error.c      | 95 ++++++++++++++++++++-----
>>   drivers/accel/amdxdna/aie2_pci.c        |  3 +
>>   drivers/accel/amdxdna/aie2_pci.h        |  5 +-
>>   drivers/accel/amdxdna/amdxdna_error.h   | 59 +++++++++++++++
>>   drivers/accel/amdxdna/amdxdna_pci_drv.c |  3 +-
>>   include/uapi/drm/amdxdna_accel.h        | 13 ++++
>>   6 files changed, 159 insertions(+), 19 deletions(-)
>>   create mode 100644 drivers/accel/amdxdna/amdxdna_error.h
>>
>> diff --git a/drivers/accel/amdxdna/aie2_error.c 
>> b/drivers/accel/amdxdna/aie2_error.c
>> index 5ee905632a39..d452008ec4f4 100644
>> --- a/drivers/accel/amdxdna/aie2_error.c
>> +++ b/drivers/accel/amdxdna/aie2_error.c
>> @@ -13,6 +13,7 @@
>>     #include "aie2_msg_priv.h"
>>   #include "aie2_pci.h"
>> +#include "amdxdna_error.h"
>>   #include "amdxdna_mailbox.h"
>>   #include "amdxdna_pci_drv.h"
>>   @@ -46,6 +47,7 @@ enum aie_module_type {
>>       AIE_MEM_MOD = 0,
>>       AIE_CORE_MOD,
>>       AIE_PL_MOD,
>> +    AIE_UNKNOWN_MOD,
>>   };
>>     enum aie_error_category {
>> @@ -143,6 +145,31 @@ static const struct aie_event_category 
>> aie_ml_shim_tile_event_cat[] = {
>>       EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
>>   };
>>   +static const enum amdxdna_error_num aie_cat_err_num_map[] = {
>> +    [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
>> +    [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
>> +    [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
>> +    [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
>> +    [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
>> +    [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
>> +    [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
>> +    [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
>> +    [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
>> +    [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
>> +    [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
>> +};
>> +
>> +static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 
>> 1);
>> +
>> +static const enum amdxdna_error_module aie_err_mod_map[] = {
>> +    [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
>> +    [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
>> +    [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
>> +    [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
>> +};
>> +
>> +static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
>> +
>>   static enum aie_error_category
>>   aie_get_error_category(u8 row, u8 event_id, enum aie_module_type 
>> mod_type)
>>   {
>> @@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, 
>> enum aie_module_type mod_type)
>>           if (event_id != lut[i].event_id)
>>               continue;
>>   +        if (lut[i].category > AIE_ERROR_UNKNOWN)
>> +            return AIE_ERROR_UNKNOWN;
>> +
>>           return lut[i].category;
>>       }
>>         return AIE_ERROR_UNKNOWN;
>>   }
>>   +static void aie2_update_last_async_error(struct amdxdna_dev_hdl 
>> *ndev, void *err_info, u32 num_err)
>> +{
>> +    struct aie_error *errs = err_info;
>> +    enum amdxdna_error_module err_mod;
>> +    enum aie_error_category aie_err;
>> +    enum amdxdna_error_num err_num;
>> +    struct aie_error *last_err;
>> +
>> +    last_err = &errs[num_err - 1];
>> +    if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
>> +        err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
>> +        err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
>> +    } else {
>> +        aie_err = aie_get_error_category(last_err->row,
>> +                         last_err->event_id,
>> +                         last_err->mod_type);
>> +        err_num = aie_cat_err_num_map[aie_err];
>> +        err_mod = aie_err_mod_map[last_err->mod_type];
>> +    }
>> +
>> +    ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, 
>> err_mod);
>> +    ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
>> +    ndev->last_async_err.ex_err_code = 
>> AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
>> +}
>> +
>>   static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void 
>> *err_info, u32 num_err)
>>   {
>>       struct aie_error *errs = err_info;
>> @@ -264,29 +319,14 @@ static void aie2_error_worker(struct 
>> work_struct *err_work)
>>       }
>>         mutex_lock(&xdna->dev_lock);
>> +    aie2_update_last_async_error(e->ndev, info->payload, 
>> info->err_cnt);
>> +
>>       /* Re-sent this event to firmware */
>>       if (aie2_error_event_send(e))
>>           XDNA_WARN(xdna, "Unable to register async event");
>>       mutex_unlock(&xdna->dev_lock);
>>   }
>>   -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
>> -{
>> -    struct amdxdna_dev *xdna = ndev->xdna;
>> -    struct async_event *e;
>> -    int i, ret;
>> -
>> -    drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>> -    for (i = 0; i < ndev->async_events->event_cnt; i++) {
>> -        e = &ndev->async_events->event[i];
>> -        ret = aie2_error_event_send(e);
>> -        if (ret)
>> -            return ret;
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>>   void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
>>   {
>>       struct amdxdna_dev *xdna = ndev->xdna;
>> @@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct 
>> amdxdna_dev_hdl *ndev)
>>           e->size = ASYNC_BUF_SIZE;
>>           e->resp.status = MAX_AIE2_STATUS_CODE;
>>           INIT_WORK(&e->work, aie2_error_worker);
>> +
>> +        ret = aie2_error_event_send(e);
>> +        if (ret)
>> +            goto free_wq;
>>       }
>>         ndev->async_events = events;
>> @@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct 
>> amdxdna_dev_hdl *ndev)
>>            events->event_cnt, events->size);
>>       return 0;
>>   +free_wq:
>> +    destroy_workqueue(events->wq);
>>   free_buf:
>>       dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
>>                    events->addr, DMA_FROM_DEVICE);
>> @@ -356,3 +402,18 @@ int aie2_error_async_events_alloc(struct 
>> amdxdna_dev_hdl *ndev)
>>       kfree(events);
>>       return ret;
>>   }
>> +
>> +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct 
>> amdxdna_drm_get_array *args)
>> +{
>> +    struct amdxdna_dev *xdna = ndev->xdna;
>> +
>> +    drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>> +
>> +    args->num_element = 1;
>> +    args->element_size = sizeof(ndev->last_async_err);
>> +    if (copy_to_user(u64_to_user_ptr(args->buffer),
>> +             &ndev->last_async_err, args->element_size))
>> +        return -EFAULT;
>> +
>> +    return 0;
>> +}
>> diff --git a/drivers/accel/amdxdna/aie2_pci.c 
>> b/drivers/accel/amdxdna/aie2_pci.c
>> index 8a66f276100e..cfca4e456b61 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.c
>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>> @@ -924,6 +924,9 @@ static int aie2_get_array(struct amdxdna_client 
>> *client,
>>       case DRM_AMDXDNA_HW_CONTEXT_ALL:
>>           ret = aie2_query_ctx_status_array(client, args);
>>           break;
>> +    case DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
>> +        ret = aie2_get_array_async_error(xdna->dev_handle, args);
>> +        break;
>>       default:
>>           XDNA_ERR(xdna, "Not supported request parameter %u", 
>> args->param);
>>           ret = -EOPNOTSUPP;
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h 
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index 289a23ecd5f1..34bc35479f42 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -190,6 +190,8 @@ struct amdxdna_dev_hdl {
>>         enum aie2_dev_status        dev_status;
>>       u32                hwctx_num;
>> +
>> +    struct amdxdna_async_error    last_async_err;
>>   };
>>     #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
>> @@ -253,8 +255,9 @@ void aie2_psp_stop(struct psp_device *psp);
>>   /* aie2_error.c */
>>   int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
>>   void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
>> -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
>>   int aie2_error_async_msg_thread(void *data);
>> +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
>> +                   struct amdxdna_drm_get_array *args);
>>     /* aie2_message.c */
>>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
>> diff --git a/drivers/accel/amdxdna/amdxdna_error.h 
>> b/drivers/accel/amdxdna/amdxdna_error.h
>> new file mode 100644
>> index 000000000000..c51de86ec12b
>> --- /dev/null
>> +++ b/drivers/accel/amdxdna/amdxdna_error.h
>> @@ -0,0 +1,59 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (C) 2025, Advanced Micro Devices, Inc.
>> + */
>> +
>> +#ifndef _AMDXDNA_ERROR_H_
>> +#define _AMDXDNA_ERROR_H_
>> +
>> +#include <linux/bitfield.h>
>> +#include <linux/bits.h>
>> +
>> +#define AMDXDNA_ERR_DRV_AIE        4
>> +#define AMDXDNA_ERR_SEV_CRITICAL    3
>> +#define AMDXDNA_ERR_CLASS_AIE        2
>> +
>> +#define AMDXDNA_ERR_NUM_MASK        GENMASK_U64(15, 0)
>> +#define AMDXDNA_ERR_DRV_MASK        GENMASK_U64(23, 16)
>> +#define AMDXDNA_ERR_SEV_MASK        GENMASK_U64(31, 24)
>> +#define AMDXDNA_ERR_MOD_MASK        GENMASK_U64(39, 32)
>> +#define AMDXDNA_ERR_CLASS_MASK        GENMASK_U64(47, 40)
>> +
>> +enum amdxdna_error_num {
>> +    AMDXDNA_ERROR_NUM_AIE_SATURATION = 3,
>> +    AMDXDNA_ERROR_NUM_AIE_FP,
>> +    AMDXDNA_ERROR_NUM_AIE_STREAM,
>> +    AMDXDNA_ERROR_NUM_AIE_ACCESS,
>> +    AMDXDNA_ERROR_NUM_AIE_BUS,
>> +    AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
>> +    AMDXDNA_ERROR_NUM_AIE_ECC,
>> +    AMDXDNA_ERROR_NUM_AIE_LOCK,
>> +    AMDXDNA_ERROR_NUM_AIE_DMA,
>> +    AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
>> +    AMDXDNA_ERROR_NUM_UNKNOWN = 15,
>> +};
>> +
>> +enum amdxdna_error_module {
>> +    AMDXDNA_ERROR_MODULE_AIE_CORE = 3,
>> +    AMDXDNA_ERROR_MODULE_AIE_MEMORY,
>> +    AMDXDNA_ERROR_MODULE_AIE_SHIM,
>> +    AMDXDNA_ERROR_MODULE_AIE_NOC,
>> +    AMDXDNA_ERROR_MODULE_AIE_PL,
>> +    AMDXDNA_ERROR_MODULE_UNKNOWN = 8,
>> +};
>> +
>> +#define AMDXDNA_ERROR_ENCODE(err_num, err_mod)                \
>> +    (FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) |            \
>> +     FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) |    \
>> +     FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, 
>> AMDXDNA_ERR_SEV_CRITICAL) | \
>> +     FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) |            \
>> +     FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE))
>> +
>> +#define AMDXDNA_EXTRA_ERR_COL_MASK    GENMASK_U64(7, 0)
>> +#define AMDXDNA_EXTRA_ERR_ROW_MASK    GENMASK_U64(15, 8)
>> +
>> +#define AMDXDNA_EXTRA_ERR_ENCODE(row, col)                \
>> +    (FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) |            \
>> +     FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row))
>> +
>> +#endif /* _AMDXDNA_ERROR_H_ */
>> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c 
>> b/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> index aa04452310e5..696fdac8ad3c 100644
>> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
>> @@ -27,9 +27,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
>>   /*
>>    * 0.0: Initial version
>>    * 0.1: Support getting all hardware contexts by 
>> DRM_IOCTL_AMDXDNA_GET_ARRAY
>> + * 0.2: Support getting last hardware error
>>    */
>>   #define AMDXDNA_DRIVER_MAJOR        0
>> -#define AMDXDNA_DRIVER_MINOR        1
>> +#define AMDXDNA_DRIVER_MINOR        2
>>     /*
>>    * Bind the driver base on (vendor_id, device_id) pair and later 
>> use the
>> diff --git a/include/uapi/drm/amdxdna_accel.h 
>> b/include/uapi/drm/amdxdna_accel.h
>> index a1fb9785db77..c7eec9ceb2ae 100644
>> --- a/include/uapi/drm/amdxdna_accel.h
>> +++ b/include/uapi/drm/amdxdna_accel.h
>> @@ -523,7 +523,20 @@ struct amdxdna_drm_hwctx_entry {
>>       __u32 pad;
>>   };
>>   +/**
>> + * struct amdxdna_async_error - XDNA async error structure
>> + */
>> +struct amdxdna_async_error {
>> +    /** @err_code: Error code. */
>> +    __u64 err_code;
>> +    /** @ts_us: Timestamp. */
>> +    __u64 ts_us;
>> +    /** @ex_err_code: Extra error code */
>> +    __u64 ex_err_code;
>> +};
>> +
>>   #define DRM_AMDXDNA_HW_CONTEXT_ALL    0
>> +#define DRM_AMDXDNA_HW_LAST_ASYNC_ERR    2
>>     /**
>>    * struct amdxdna_drm_get_array - Get information array.
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-10-16 16:38 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-14 23:41 [PATCH V2] accel/amdxdna: Support getting last hardware error Lizhi Hou
2025-10-15 13:42 ` Mario Limonciello
2025-10-16 16:37   ` Lizhi Hou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox