* [PATCH v2 01/12] drm/amd/include: Add amd cper header
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 02/12] drm/amdgpu: Introduce funcs for populating CPER Xiang Liu
` (11 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Xiang Liu
From: Hawking Zhang <Hawking.Zhang@amd.com>
AMD is using Common Platform Error Record (CPER) format
to report all gpu hardware errors.
v2: add program attribute
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/include/amd_cper.h | 269 +++++++++++++++++++++++++
1 file changed, 269 insertions(+)
create mode 100644 drivers/gpu/drm/amd/include/amd_cper.h
diff --git a/drivers/gpu/drm/amd/include/amd_cper.h b/drivers/gpu/drm/amd/include/amd_cper.h
new file mode 100644
index 000000000000..086869264425
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/amd_cper.h
@@ -0,0 +1,269 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __AMD_CPER_H__
+#define __AMD_CPER_H__
+
+#include <linux/uuid.h>
+
+#define CPER_HDR_REV_1 (0x100)
+#define CPER_SEC_MINOR_REV_1 (0x01)
+#define CPER_SEC_MAJOR_REV_22 (0x22)
+#define CPER_MAX_OAM_COUNT (8)
+
+#define CPER_CTX_TYPE_CRASH (1)
+#define CPER_CTX_TYPE_BOOT (9)
+
+#define CPER_CREATOR_ID_AMDGPU "amdgpu"
+
+#define CPER_NOTIFY_MCE \
+ GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \
+ 0xE1, 0x49, 0x13, 0xBB)
+#define CPER_NOTIFY_CMC \
+ GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \
+ 0xEB, 0xD4, 0xF8, 0x90)
+#define BOOT_TYPE \
+ GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \
+ 0xD4, 0x64, 0xB3, 0x8F)
+
+#define AMD_CRASHDUMP \
+ GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \
+ 0x72, 0x5F, 0xD6, 0xAE)
+#define AMD_GPU_NONSTANDARD_ERROR \
+ GUID_INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \
+ 0x17, 0x80, 0x55, 0x1D)
+#define PROC_ERR_SECTION_TYPE \
+ GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \
+ 0x24, 0x2B, 0x6E, 0x1D)
+
+enum cper_error_severity {
+ CPER_SEV_NON_FATAL_UNCORRECTED = 0,
+ CPER_SEV_FATAL = 1,
+ CPER_SEV_NON_FATAL_CORRECTED = 2,
+ CPER_SEV_NUM = 3,
+
+ CPER_SEV_UNUSED = 10,
+};
+
+enum cper_aca_reg { /* indices into cper_sec_nonstd_err_ctx.reg_dump[] */
+ CPER_ACA_REG_CTL_LO = 0,
+ CPER_ACA_REG_CTL_HI = 1,
+ CPER_ACA_REG_STATUS_LO = 2,
+ CPER_ACA_REG_STATUS_HI = 3,
+ CPER_ACA_REG_ADDR_LO = 4,
+ CPER_ACA_REG_ADDR_HI = 5,
+ CPER_ACA_REG_MISC0_LO = 6,
+ CPER_ACA_REG_MISC0_HI = 7,
+ CPER_ACA_REG_CONFIG_LO = 8,
+ CPER_ACA_REG_CONFIG_HI = 9,
+ CPER_ACA_REG_IPID_LO = 10,
+ CPER_ACA_REG_IPID_HI = 11,
+ CPER_ACA_REG_SYND_LO = 12,
+ CPER_ACA_REG_SYND_HI = 13,
+
+ CPER_ACA_REG_COUNT = 32, /* length of reg_dump[]; exceeds the 14 named slots above */
+};
+
+#pragma pack(push, 1)
+
+struct cper_timestamp {
+ uint8_t seconds;
+ uint8_t minutes;
+ uint8_t hours;
+ uint8_t flag;
+ uint8_t day;
+ uint8_t month;
+ uint8_t year;
+ uint8_t century;
+};
+
+struct cper_hdr {
+ char signature[4]; /* "CPER" */
+ uint16_t revision;
+ uint32_t signature_end; /* 0xFFFFFFFF */
+ uint16_t sec_cnt;
+ enum cper_error_severity error_severity;
+ union {
+ struct {
+ uint32_t platform_id : 1;
+ uint32_t timestamp : 1;
+ uint32_t partition_id : 1;
+ uint32_t reserved : 29;
+ } valid_bits;
+ uint32_t valid_mask;
+ };
+ uint32_t record_length; /* Total size of CPER Entry */
+ struct cper_timestamp timestamp;
+ char platform_id[16];
+ guid_t partition_id; /* Reserved */
+ char creator_id[16];
+ guid_t notify_type; /* CMC, MCE */
+ char record_id[8]; /* Unique CPER Entry ID */
+ uint32_t flags; /* Reserved */
+ uint64_t persistence_info; /* Reserved */
+ uint8_t reserved[12]; /* Reserved */
+};
+
+struct cper_sec_desc {
+ uint32_t sec_offset; /* Offset from the start of CPER entry */
+ uint32_t sec_length;
+ uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */
+ uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */
+ union {
+ struct {
+ uint8_t fru_id : 1;
+ uint8_t fru_text : 1;
+ uint8_t reserved : 6;
+ } valid_bits;
+ uint8_t valid_mask;
+ };
+ uint8_t reserved;
+ union {
+ struct {
+ uint32_t primary : 1;
+ uint32_t reserved1 : 2;
+ uint32_t exceed_err_threshold : 1;
+ uint32_t latent_err : 1;
+ uint32_t reserved2 : 27;
+ } flag_bits;
+ uint32_t flag_mask;
+ };
+ guid_t sec_type;
+ char fru_id[16];
+ enum cper_error_severity severity;
+ char fru_text[20];
+};
+
+struct cper_sec_nonstd_err_hdr {
+ union {
+ struct {
+ uint64_t apic_id : 1;
+ uint64_t fw_id : 1;
+ uint64_t err_info_cnt : 6;
+ uint64_t err_context_cnt : 6;
+ } valid_bits;
+ uint64_t valid_mask;
+ };
+ uint64_t apic_id;
+ char fw_id[48];
+};
+
+struct cper_sec_nonstd_err_info {
+ guid_t error_type;
+ union {
+ struct {
+ uint64_t ms_chk : 1;
+ uint64_t target_addr_id : 1;
+ uint64_t req_id : 1;
+ uint64_t resp_id : 1;
+ uint64_t instr_ptr : 1;
+ uint64_t reserved : 59;
+ } valid_bits;
+ uint64_t valid_mask;
+ };
+ union {
+ struct {
+ uint64_t err_type_valid : 1;
+ uint64_t pcc_valid : 1;
+ uint64_t uncorr_valid : 1;
+ uint64_t precise_ip_valid : 1;
+ uint64_t restartable_ip_valid : 1;
+ uint64_t overflow_valid : 1;
+ uint64_t reserved1 : 10;
+ uint64_t err_type : 2;
+ uint64_t pcc : 1;
+ uint64_t uncorr : 1;
+ uint64_t precised_ip : 1;
+ uint64_t restartable_ip : 1;
+ uint64_t overflow : 1;
+ uint64_t reserved2 : 41;
+ } ms_chk_bits;
+ uint64_t ms_chk_mask;
+ };
+ uint64_t target_addr_id;
+ uint64_t req_id;
+ uint64_t resp_id;
+ uint64_t instr_ptr;
+};
+
+struct cper_sec_nonstd_err_ctx {
+ uint16_t reg_ctx_type; /* amdgpu_cper.c writes CPER_CTX_TYPE_CRASH here */
+ uint16_t reg_arr_size; /* set to sizeof(reg_dump), i.e. a byte count */
+ uint32_t msr_addr;
+ uint64_t mm_reg_addr;
+ uint32_t reg_dump[CPER_ACA_REG_COUNT]; /* indexed by enum cper_aca_reg */
+};
+
+struct cper_sec_nonstd_err {
+ struct cper_sec_nonstd_err_hdr hdr;
+ struct cper_sec_nonstd_err_info info;
+ struct cper_sec_nonstd_err_ctx ctx;
+};
+
+struct cper_sec_crashdump_hdr {
+ uint64_t reserved1;
+ uint64_t reserved2;
+ char fw_id[48];
+ uint64_t reserved3[8];
+};
+
+struct cper_sec_crashdump_reg_data {
+ uint32_t status_lo;
+ uint32_t status_hi;
+ uint32_t addr_lo;
+ uint32_t addr_hi;
+ uint32_t ipid_lo;
+ uint32_t ipid_hi;
+ uint32_t synd_lo;
+ uint32_t synd_hi;
+};
+
+struct cper_sec_crashdump_body_fatal {
+ uint16_t reg_ctx_type;
+ uint16_t reg_arr_size;
+ uint32_t reserved1;
+ uint64_t reserved2;
+ struct cper_sec_crashdump_reg_data data;
+};
+
+struct cper_sec_crashdump_body_boot {
+ uint16_t reg_ctx_type;
+ uint16_t reg_arr_size;
+ uint32_t reserved1;
+ uint64_t reserved2;
+ uint64_t msg[CPER_MAX_OAM_COUNT];
+};
+
+struct cper_sec_crashdump_fatal {
+ struct cper_sec_crashdump_hdr hdr;
+ struct cper_sec_crashdump_body_fatal body;
+};
+
+struct cper_sec_crashdump_boot {
+ struct cper_sec_crashdump_hdr hdr;
+ struct cper_sec_crashdump_body_boot body;
+};
+
+#pragma pack(pop)
+
+#endif
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 02/12] drm/amdgpu: Introduce funcs for populating CPER
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
2025-02-14 8:07 ` [PATCH v2 01/12] drm/amd/include: Add amd cper header Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 03/12] drm/amdgpu: Include ACA error type in aca bank Xiang Liu
` (10 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1
From: Hawking Zhang <Hawking.Zhang@amd.com>
Introduce utility functions designed to assist
in populating CPER records.
v2: call cper_init/fini in device_ip_init/fini.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 281 +++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 91 +++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +
5 files changed, 381 insertions(+), 1 deletion(-)
create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 04247303b3cf..84bb3dfa39a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -66,7 +66,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \
- amdgpu_userq_fence.o amdgpu_eviction_fence.o
+ amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_cper.o
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index dc1f8d6fd0c4..db0a26800927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -110,6 +110,7 @@
#include "amdgpu_mca.h"
#include "amdgpu_aca.h"
#include "amdgpu_ras.h"
+#include "amdgpu_cper.h"
#include "amdgpu_xcp.h"
#include "amdgpu_seq64.h"
#include "amdgpu_reg_state.h"
@@ -1128,6 +1129,9 @@ struct amdgpu_device {
/* ACA */
struct amdgpu_aca aca;
+ /* CPER */
+ struct amdgpu_cper cper;
+
struct amdgpu_ip_block ip_blocks[AMDGPU_MAX_IP_NUM];
uint32_t harvest_ip_mask;
int num_ip_blocks;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
new file mode 100644
index 000000000000..8ce5dc6efcf9
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+
+static const guid_t MCE = CPER_NOTIFY_MCE;
+static const guid_t CMC = CPER_NOTIFY_CMC;
+static const guid_t BOOT = BOOT_TYPE;
+
+static const guid_t CRASHDUMP = AMD_CRASHDUMP;
+static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;
+
+static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
+{
+	hdr->record_length = hdr->record_length + size; /* running total of the record, in bytes */
+}
+
+/* Populate the fixed fields of a CPER record header and add HDR_LEN to record_length. */
+void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, struct cper_hdr *hdr,
+				enum amdgpu_cper_type type,
+				enum cper_error_severity sev)
+{
+	/* The signature is exactly four bytes with no NUL terminator. */
+	memcpy(hdr->signature, "CPER", sizeof(hdr->signature));
+	hdr->revision = CPER_HDR_REV_1;
+	hdr->signature_end = 0xFFFFFFFF;
+	hdr->error_severity = sev;
+
+	hdr->valid_bits.platform_id = 1;
+	hdr->valid_bits.partition_id = 1;
+	hdr->valid_bits.timestamp = 1;
+	/* TODO: hdr->timestamp is flagged valid above but is not filled in yet */
+
+	/* Size each write from its destination so it follows the struct layout. */
+	snprintf(hdr->record_id, sizeof(hdr->record_id), "%d",
+		 atomic_inc_return(&adev->cper.unique_id));
+	snprintf(hdr->platform_id, sizeof(hdr->platform_id), "0x%04X:0x%04X",
+		 adev->pdev->vendor, adev->pdev->device);
+	/* pmfw version should be part of creator_id according to CPER spec */
+	snprintf(hdr->creator_id, sizeof(hdr->creator_id), "%s", CPER_CREATOR_ID_AMDGPU);
+
+	switch (type) {
+	case AMDGPU_CPER_TYPE_BOOT:
+		hdr->notify_type = BOOT;
+		break;
+	case AMDGPU_CPER_TYPE_FATAL:
+	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
+		hdr->notify_type = MCE;
+		break;
+	case AMDGPU_CPER_TYPE_RUNTIME:
+		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
+			hdr->notify_type = CMC;
+		else
+			hdr->notify_type = MCE;
+		break;
+	default:
+		dev_err(adev->dev, "Unknown CPER Type\n");
+		break;
+	}
+
+	__inc_entry_length(hdr, HDR_LEN);
+}
+
+/*
+ * Fill one section descriptor. bp_threshold and poison set the
+ * exceed_err_threshold and latent_err flag bits. Always returns 0.
+ */
+static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
+					       struct cper_sec_desc *section_desc,
+					       bool bp_threshold, bool poison,
+					       enum cper_error_severity sev, guid_t sec_type,
+					       uint32_t section_length, uint32_t section_offset)
+{
+	section_desc->revision_minor = CPER_SEC_MINOR_REV_1;
+	section_desc->revision_major = CPER_SEC_MAJOR_REV_22;
+	section_desc->sec_offset = section_offset;
+	section_desc->sec_length = section_length;
+	section_desc->valid_bits.fru_id = 1;
+	section_desc->valid_bits.fru_text = 1;
+	section_desc->flag_bits.primary = 1;
+	section_desc->severity = sev;
+	section_desc->sec_type = sec_type;
+
+	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+		snprintf(section_desc->fru_text, sizeof(section_desc->fru_text), "OAM%d",
+			 adev->smuio.funcs->get_socket_id(adev));
+	/* fru_id holds at most 15 characters plus NUL; a 16-digit hex id loses its last digit */
+	snprintf(section_desc->fru_id, sizeof(section_desc->fru_id), "%llx", adev->unique_id);
+
+	if (bp_threshold)
+		section_desc->flag_bits.exceed_err_threshold = 1;
+	if (poison)
+		section_desc->flag_bits.latent_err = 1;
+
+	return 0;
+}
+
+/* Fill descriptor and crashdump body of fatal section idx; adds both lengths to hdr. */
+int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev, struct cper_hdr *hdr,
+					 uint32_t idx,
+					 struct cper_sec_crashdump_reg_data reg_data)
+{
+	uint8_t *base = (uint8_t *)hdr;
+	size_t body_off = FATAL_SEC_OFFSET(hdr->sec_cnt, idx);
+	struct cper_sec_desc *desc;
+	struct cper_sec_crashdump_fatal *fatal;
+
+	desc = (struct cper_sec_desc *)(base + SEC_DESC_OFFSET(idx));
+	fatal = (struct cper_sec_crashdump_fatal *)(base + body_off);
+
+	/* Fatal sections are always CPER_SEV_FATAL with neither flag bit requested. */
+	amdgpu_cper_entry_fill_section_desc(adev, desc, false, false, CPER_SEV_FATAL,
+					    CRASHDUMP, FATAL_SEC_LEN, body_off);
+
+	fatal->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
+	fatal->body.reg_arr_size = sizeof(reg_data);
+	fatal->body.data = reg_data;
+
+	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);
+	return 0;
+}
+
+/* Fill descriptor and non-standard error body of runtime section idx from reg_dump. */
+int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev, struct cper_hdr *hdr,
+					   uint32_t idx, enum cper_error_severity sev,
+					   uint32_t *reg_dump, uint32_t reg_count)
+{
+	uint8_t *base = (uint8_t *)hdr;
+	size_t body_off = NONSTD_SEC_OFFSET(hdr->sec_cnt, idx);
+	struct cper_sec_desc *desc;
+	struct cper_sec_nonstd_err *err;
+	bool poison;
+
+	/* Every severity except corrected is reported with the latent_err flag. */
+	poison = sev != CPER_SEV_NON_FATAL_CORRECTED;
+
+	desc = (struct cper_sec_desc *)(base + SEC_DESC_OFFSET(idx));
+	err = (struct cper_sec_nonstd_err *)(base + body_off);
+
+	amdgpu_cper_entry_fill_section_desc(adev, desc, false, poison, sev, RUNTIME,
+					    NONSTD_SEC_LEN, body_off);
+
+	err->hdr.valid_bits.err_info_cnt = 1;
+	err->hdr.valid_bits.err_context_cnt = 1;
+
+	err->info.error_type = RUNTIME;
+	err->info.ms_chk_bits.err_type_valid = 1;
+	err->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
+	err->ctx.reg_arr_size = sizeof(err->ctx.reg_dump);
+
+	/* Copy no more registers than the fixed-size dump array holds. */
+	reg_count = min(reg_count, CPER_ACA_REG_COUNT);
+	memcpy(err->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));
+
+	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);
+
+	return 0;
+}
+
+/* Fill descriptor and body of bad page threshold section idx with a fixed register dump. */
+int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
+						      struct cper_hdr *hdr,
+						      uint32_t idx)
+{
+	/* Hardcoded Reg dump for bad page threshold CPER */
+	static const uint32_t bp_regs[] = {
+		[CPER_ACA_REG_CTL_LO]    = 0x1,
+		[CPER_ACA_REG_CTL_HI]    = 0x0,
+		[CPER_ACA_REG_STATUS_LO] = 0x137,
+		[CPER_ACA_REG_STATUS_HI] = 0xB0000000,
+		[CPER_ACA_REG_ADDR_LO]   = 0x0,
+		[CPER_ACA_REG_ADDR_HI]   = 0x0,
+		[CPER_ACA_REG_MISC0_LO]  = 0x0,
+		[CPER_ACA_REG_MISC0_HI]  = 0x0,
+		[CPER_ACA_REG_CONFIG_LO] = 0x2,
+		[CPER_ACA_REG_CONFIG_HI] = 0x1ff,
+		[CPER_ACA_REG_IPID_LO]   = 0x0,
+		[CPER_ACA_REG_IPID_HI]   = 0x96,
+		[CPER_ACA_REG_SYND_LO]   = 0x0,
+		[CPER_ACA_REG_SYND_HI]   = 0x0,
+	};
+	uint8_t *base = (uint8_t *)hdr;
+	size_t body_off = NONSTD_SEC_OFFSET(hdr->sec_cnt, idx);
+	struct cper_sec_desc *desc;
+	struct cper_sec_nonstd_err *err;
+
+	desc = (struct cper_sec_desc *)(base + SEC_DESC_OFFSET(idx));
+	err = (struct cper_sec_nonstd_err *)(base + body_off);
+
+	amdgpu_cper_entry_fill_section_desc(adev, desc, true, false, CPER_SEV_FATAL,
+					    RUNTIME, NONSTD_SEC_LEN, body_off);
+
+	err->hdr.valid_bits.err_info_cnt = 1;
+	err->hdr.valid_bits.err_context_cnt = 1;
+	err->info.error_type = RUNTIME;
+	err->info.ms_chk_bits.err_type_valid = 1;
+	err->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
+	err->ctx.reg_arr_size = sizeof(err->ctx.reg_dump);
+	memcpy(err->ctx.reg_dump, bp_regs, sizeof(bp_regs));
+	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);
+	return 0;
+}
+
+/*
+ * Allocate a zeroed buffer for a record with section_count sections of the given
+ * type. Returns NULL for an unknown type or when the allocation fails.
+ */
+struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
+					 enum amdgpu_cper_type type,
+					 uint16_t section_count)
+{
+	struct cper_hdr *entry;
+	size_t body_len;
+	uint32_t total;
+
+	switch (type) {
+	case AMDGPU_CPER_TYPE_RUNTIME:
+	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
+		body_len = NONSTD_SEC_LEN;
+		break;
+	case AMDGPU_CPER_TYPE_FATAL:
+		body_len = FATAL_SEC_LEN;
+		break;
+	case AMDGPU_CPER_TYPE_BOOT:
+		body_len = BOOT_SEC_LEN;
+		break;
+	default:
+		dev_err(adev->dev, "Unknown CPER Type!\n");
+		return NULL;
+	}
+
+	/* One header, then section_count descriptors, then section_count bodies. */
+	total = HDR_LEN + (SEC_DESC_LEN + body_len) * section_count;
+	entry = kzalloc(total, GFP_KERNEL);
+	if (entry)
+		entry->sec_cnt = section_count; /* Save this early */
+	return entry;
+}
+
+/* Initialize the CPER lock, enable flag and record limit; always returns 0. */
+int amdgpu_cper_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_cper *cper = &adev->cper;
+
+	mutex_init(&adev->cper.cper_lock);
+	cper->enabled = true;
+	cper->max_count = CPER_MAX_ALLOWED_COUNT;
+	/* TODO: initialize cper ring */
+	return 0;
+}
+
+/* Clear the enable flag and reset count and wptr; always returns 0. */
+int amdgpu_cper_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_cper *cper = &adev->cper;
+	cper->enabled = false;
+	/* TODO: free cper ring */
+	cper->count = 0;
+	cper->wptr = 0;
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
new file mode 100644
index 000000000000..0ae845420983
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __AMDGPU_CPER_H__
+#define __AMDGPU_CPER_H__
+
+#include "amd_cper.h"
+
+#define CPER_MAX_ALLOWED_COUNT 0x1000
+#define HDR_LEN (sizeof(struct cper_hdr))
+#define SEC_DESC_LEN (sizeof(struct cper_sec_desc))
+
+#define BOOT_SEC_LEN (sizeof(struct cper_sec_crashdump_boot))
+#define FATAL_SEC_LEN (sizeof(struct cper_sec_crashdump_fatal))
+#define NONSTD_SEC_LEN (sizeof(struct cper_sec_nonstd_err))
+/* Record layout: header, then all section descriptors, then all section bodies. */
+#define SEC_DESC_OFFSET(idx) (HDR_LEN + (SEC_DESC_LEN * (idx)))
+/* Arguments are parenthesized so that expression arguments expand correctly. */
+#define BOOT_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * (count)) + (BOOT_SEC_LEN * (idx)))
+#define FATAL_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * (count)) + (FATAL_SEC_LEN * (idx)))
+#define NONSTD_SEC_OFFSET(count, idx) (HDR_LEN + (SEC_DESC_LEN * (count)) + (NONSTD_SEC_LEN * (idx)))
+
+enum amdgpu_cper_type {
+ AMDGPU_CPER_TYPE_RUNTIME,
+ AMDGPU_CPER_TYPE_FATAL,
+ AMDGPU_CPER_TYPE_BOOT,
+ AMDGPU_CPER_TYPE_BP_THRESHOLD,
+};
+
+struct amdgpu_cper {
+ bool enabled; /* set by amdgpu_cper_init(), cleared by amdgpu_cper_fini() */
+
+ atomic_t unique_id; /* incremented per header to produce cper_hdr.record_id */
+ struct mutex cper_lock;
+
+ /* Lifetime CPERs generated */
+ uint32_t count;
+ uint32_t max_count; /* initialized to CPER_MAX_ALLOWED_COUNT */
+
+ uint32_t wptr; /* reset to 0 in amdgpu_cper_fini() */
+
+ void *ring[CPER_MAX_ALLOWED_COUNT];
+};
+
+void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
+ struct cper_hdr *hdr,
+ enum amdgpu_cper_type type,
+ enum cper_error_severity sev);
+int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
+ struct cper_hdr *hdr,
+ uint32_t idx,
+ struct cper_sec_crashdump_reg_data reg_data);
+int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
+ struct cper_hdr *hdr,
+ uint32_t idx,
+ enum cper_error_severity sev,
+ uint32_t *reg_dump,
+ uint32_t reg_count);
+int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
+ struct cper_hdr *hdr,
+ uint32_t section_idx);
+
+struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
+ enum amdgpu_cper_type type,
+ uint16_t section_count);
+
+int amdgpu_cper_init(struct amdgpu_device *adev);
+int amdgpu_cper_fini(struct amdgpu_device *adev);
+
+#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f9a7bd741a41..f5c65e89b23b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3075,6 +3075,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
amdgpu_fru_get_product_info(adev);
+ r = amdgpu_cper_init(adev);
+
init_failed:
return r;
@@ -3435,6 +3437,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
int i, r;
+ amdgpu_cper_fini(adev);
+
if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
amdgpu_virt_release_ras_err_handler_data(adev);
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 03/12] drm/amdgpu: Include ACA error type in aca bank
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
2025-02-14 8:07 ` [PATCH v2 01/12] drm/amd/include: Add amd cper header Xiang Liu
2025-02-14 8:07 ` [PATCH v2 02/12] drm/amdgpu: Introduce funcs for populating CPER Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 04/12] drm/amdgpu: Introduce funcs for generating cper record Xiang Liu
` (9 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Yang Wang
From: Hawking Zhang <Hawking.Zhang@amd.com>
ACA error types managed by the driver do not have a direct 1:1
correspondence with those managed by firmware.
To address this, for each ACA bank, include
both the ACA error type and the ACA SMU type.
This addition is useful for creating CPER records.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 4 +++-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 ++
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 ++
drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 ++
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 ++
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 ++
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 1 +
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 ++
9 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 9d6345146495..1a26b8ad14cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -168,7 +168,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
if (ret)
return ret;
- bank.type = type;
+ bank.smu_err_type = type;
aca_smu_bank_dump(adev, i, count, &bank, qctx);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index f3289d289913..3cd0115b0244 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -108,13 +108,15 @@ enum aca_error_type {
};
enum aca_smu_type {
+ ACA_SMU_TYPE_INVALID = -1,
ACA_SMU_TYPE_UE = 0,
ACA_SMU_TYPE_CE,
ACA_SMU_TYPE_COUNT,
};
struct aca_bank {
- enum aca_smu_type type;
+ enum aca_error_type aca_err_type;
+ enum aca_smu_type smu_err_type;
u64 regs[ACA_MAX_REGS_COUNT];
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index aecbe52a4f5c..94f306c0b706 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1129,10 +1129,12 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_ban
if (ext_error_code != 0 && ext_error_code != 9)
count = 0ULL;
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
break;
case ACA_SMU_TYPE_CE:
count = ext_error_code == 6 ? count : 0ULL;
+ bank->aca_err_type = ACA_ERROR_TYPE_CE; /* driver error type, as passed to the log call below */
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count);
break;
default:
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index aed05f3daeeb..d54b2261305b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -893,10 +893,12 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
switch (type) {
case ACA_SMU_TYPE_UE:
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info,
ACA_ERROR_TYPE_UE, 1ULL);
break;
case ACA_SMU_TYPE_CE:
+ bank->aca_err_type = ACA_ERROR_TYPE_CE; /* driver error type, as passed to the log call below */
ret = aca_error_cache_log_bank_error(handle, &info,
ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 9459e8cc7413..99bd68f705b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -1249,10 +1249,12 @@ static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_ban
misc0 = bank->regs[ACA_REG_IDX_MISC0];
switch (type) {
case ACA_SMU_TYPE_UE:
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
1ULL);
break;
case ACA_SMU_TYPE_CE:
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index e646e5cef0a2..17d27b12ccce 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -719,10 +719,12 @@ static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank
misc0 = bank->regs[ACA_REG_IDX_MISC0];
switch (type) {
case ACA_SMU_TYPE_UE:
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
1ULL);
break;
case ACA_SMU_TYPE_CE:
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 5e0066cd6c51..3dc0ffa81484 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2392,10 +2392,12 @@ static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_ban
misc0 = bank->regs[ACA_REG_IDX_MISC0];
switch (type) {
case ACA_SMU_TYPE_UE:
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
1ULL);
break;
case ACA_SMU_TYPE_CE:
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
ACA_REG__MISC0__ERRCNT(misc0));
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index a7b9c358a2d4..74f57b2d30a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -415,6 +415,7 @@ static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank
err_type = ACA_ERROR_TYPE_CE;
else
return 0;
+ bank->aca_err_type = err_type;
ret = aca_bank_info_decode(bank, &info);
if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index f0716c10f23e..980e610c2451 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1935,10 +1935,12 @@ static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank
misc0 = bank->regs[ACA_REG_IDX_MISC0];
switch (type) {
case ACA_SMU_TYPE_UE:
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
1ULL);
break;
case ACA_SMU_TYPE_CE:
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
ACA_REG__MISC0__ERRCNT(misc0));
break;
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 04/12] drm/amdgpu: Introduce funcs for generating cper record
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (2 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 03/12] drm/amdgpu: Include ACA error type in aca bank Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 05/12] drm/amdgpu: Generate cper records Xiang Liu
` (8 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Xiang Liu, Yang Wang
From: Hawking Zhang <Hawking.Zhang@amd.com>
Introduce new functions that are used to generate
cper ue or ce records.
v2: return -ENOMEM instead of false
v2: check return value of fill section function
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 12 +--
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 12 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 108 +++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 9 +-
4 files changed, 128 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 1a26b8ad14cb..ed1c20bd8114 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -30,16 +30,6 @@
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
-struct aca_banks {
- int nr_banks;
- struct list_head list;
-};
-
-struct aca_hwip {
- int hwid;
- int mcatype;
-};
-
static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
ACA_BANK_HWID(SMU, 0x01, 0x01),
ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
@@ -111,7 +101,7 @@ static struct aca_regs_dump {
{"STATUS", ACA_REG_IDX_STATUS},
{"ADDR", ACA_REG_IDX_ADDR},
{"MISC", ACA_REG_IDX_MISC0},
- {"CONFIG", ACA_REG_IDX_CONFG},
+ {"CONFIG", ACA_REG_IDX_CONFIG},
{"IPID", ACA_REG_IDX_IPID},
{"SYND", ACA_REG_IDX_SYND},
{"DESTAT", ACA_REG_IDX_DESTAT},
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index 3cd0115b0244..b84a3489b116 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -81,7 +81,7 @@ enum aca_reg_idx {
ACA_REG_IDX_STATUS = 1,
ACA_REG_IDX_ADDR = 2,
ACA_REG_IDX_MISC0 = 3,
- ACA_REG_IDX_CONFG = 4,
+ ACA_REG_IDX_CONFIG = 4,
ACA_REG_IDX_IPID = 5,
ACA_REG_IDX_SYND = 6,
ACA_REG_IDX_DESTAT = 8,
@@ -114,6 +114,11 @@ enum aca_smu_type {
ACA_SMU_TYPE_COUNT,
};
+struct aca_hwip {
+ int hwid;
+ int mcatype;
+};
+
struct aca_bank {
enum aca_error_type aca_err_type;
enum aca_smu_type smu_err_type;
@@ -125,6 +130,11 @@ struct aca_bank_node {
struct list_head node;
};
+struct aca_banks {
+ int nr_banks;
+ struct list_head list;
+};
+
struct aca_bank_info {
int die_id;
int socket_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 8ce5dc6efcf9..f82aa12a88f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -21,6 +21,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
+#include <linux/list.h>
#include "amdgpu.h"
static const guid_t MCE = CPER_NOTIFY_MCE;
@@ -257,6 +258,113 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
return hdr;
}
+/*
+ * Build a fatal (UE) CPER entry from a single ACA bank.
+ *
+ * A one-section AMDGPU_CPER_TYPE_FATAL entry is allocated, the bank's
+ * STATUS, ADDR, IPID and SYND registers are split into lo/hi 32-bit halves
+ * of the crashdump register layout, and the header plus fatal section 0 are
+ * filled in.
+ *
+ * Returns 0 on success, -ENOMEM when no entry could be allocated, or the
+ * error reported while filling the fatal section.
+ *
+ * NOTE(review): the entry is not released on any path here; confirm who owns
+ * the memory returned by amdgpu_cper_alloc_entry().
+ */
+int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
+				   struct aca_bank *bank)
+{
+	struct cper_sec_crashdump_reg_data reg_data = { 0 };
+	struct cper_hdr *fatal;
+
+	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
+	if (fatal == NULL) {
+		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
+		return -ENOMEM;
+	}
+
+	/* low halves of each register */
+	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+	reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+	reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+	reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+
+	/* high halves of each register */
+	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+	reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+	reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+	reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+
+	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
+
+	/*TODO: commit the cper entry to cper ring */
+
+	/* the fill result is the function result: 0 on success, error otherwise */
+	return amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
+}
+
+/*
+ * Translate an ACA error type into the CPER severity reported for it.
+ * CE maps to corrected, DEFERRED to non-fatal uncorrected and UE to fatal.
+ * Any other value is logged as unknown and also treated as fatal.
+ */
+static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
+								 enum aca_error_type aca_err_type)
+{
+	if (aca_err_type == ACA_ERROR_TYPE_CE)
+		return CPER_SEV_NON_FATAL_CORRECTED;
+
+	if (aca_err_type == ACA_ERROR_TYPE_DEFERRED)
+		return CPER_SEV_NON_FATAL_UNCORRECTED;
+
+	/* everything that is left is fatal; only UE is an expected value */
+	if (aca_err_type != ACA_ERROR_TYPE_UE)
+		dev_err(adev->dev, "Unknown ACA error type!\n");
+
+	return CPER_SEV_FATAL;
+}
+
+/*
+ * amdgpu_cper_generate_ce_records - build one runtime CPER entry for a bank list
+ * @adev:       device the banks belong to
+ * @banks:      list of ACA banks (CEs and/or DEs) to encode
+ * @bank_count: number of sections to allocate in the entry
+ *
+ * All banks in @banks are combined into a single AMDGPU_CPER_TYPE_RUNTIME
+ * entry, one section per bank. The header severity is raised from corrected
+ * to non-fatal uncorrected if any bank in the list is a deferred error.
+ *
+ * Returns 0 on success, -ENOMEM when no entry could be allocated, or the
+ * error reported while filling a runtime section.
+ *
+ * NOTE(review): the entry is not released on any path here; confirm who owns
+ * the memory returned by amdgpu_cper_alloc_entry().
+ */
+int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
+				    struct aca_banks *banks,
+				    uint16_t bank_count)
+{
+	struct cper_hdr *corrected = NULL;
+	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
+	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
+	struct aca_bank_node *node;
+	struct aca_bank *bank;
+	/* section index of the bank being encoded; must start at 0 */
+	uint32_t i = 0;
+	int ret;
+
+	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
+	if (!corrected) {
+		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
+		return -ENOMEM;
+	}
+
+	/* Raise severity if any DE is detected in the ACA bank list */
+	list_for_each_entry(node, &banks->list, node) {
+		bank = &node->bank;
+		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
+			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
+			break;
+		}
+	}
+
+	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);
+
+	/* Combine CEs and DEs in one cper record, one section per bank */
+	list_for_each_entry(node, &banks->list, node) {
+		bank = &node->bank;
+		reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
+		reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
+		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
+		reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+		reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
+		reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
+		reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
+		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
+		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
+		reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+		reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
+		reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+		reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
+
+		/* each bank gets its own section, so advance the index per bank */
+		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
+				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
+				reg_data, CPER_ACA_REG_COUNT);
+		if (ret)
+			return ret;
+	}
+
+	/*TODO: commit the cper entry to cper ring */
+
+	return 0;
+}
+
int amdgpu_cper_init(struct amdgpu_device *adev)
{
mutex_init(&adev->cper.cper_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 0ae845420983..6860a809f2f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -26,6 +26,7 @@
#define __AMDGPU_CPER_H__
#include "amd_cper.h"
+#include "amdgpu_aca.h"
#define CPER_MAX_ALLOWED_COUNT 0x1000
#define HDR_LEN (sizeof(struct cper_hdr))
@@ -84,7 +85,13 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
enum amdgpu_cper_type type,
uint16_t section_count);
-
+/* UE must be encoded into separated cper entries, 1 UE 1 cper */
+int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
+ struct aca_bank *bank);
+/* CEs and DEs are combined into 1 cper entry */
+int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
+ struct aca_banks *banks,
+ uint16_t bank_count);
int amdgpu_cper_init(struct amdgpu_device *adev);
int amdgpu_cper_fini(struct amdgpu_device *adev);
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 05/12] drm/amdgpu: Generate cper records
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (3 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 04/12] drm/amdgpu: Introduce funcs for generating cper record Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-18 19:23 ` Kees Bakker
2025-02-14 8:07 ` [PATCH v2 06/12] drm/amdgpu: add RAS CPER ring buffer Xiang Liu
` (7 subsequent siblings)
12 siblings, 1 reply; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Yang Wang
From: Hawking Zhang <Hawking.Zhang@amd.com>
Encode the error information in CPER format and commit
to the cper ring
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <keivnyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 32 +++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index ed1c20bd8114..c0da9096a7fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -384,6 +384,36 @@ static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type
return ret;
}
+/*
+ * aca_banks_generate_cper - turn a list of ACA banks into CPER records
+ * @adev:  device the banks were read from
+ * @type:  SMU error type the banks were queried with
+ * @banks: bank list to encode
+ * @count: number of banks in @banks
+ *
+ * UE banks produce one CPER entry each; all other banks are combined into a
+ * single entry. Failures are logged and otherwise ignored, since this
+ * function has no return value.
+ */
+static void aca_banks_generate_cper(struct amdgpu_device *adev,
+				    enum aca_smu_type type,
+				    struct aca_banks *banks,
+				    int count)
+{
+	struct aca_bank_node *node;
+	struct aca_bank *bank;
+
+	/* without a device there is nothing to log against; do not touch adev */
+	if (!adev)
+		return;
+
+	if (!banks || !count) {
+		dev_warn(adev->dev, "fail to generate cper records\n");
+		return;
+	}
+
+	/* UEs must be encoded into separate CPER entries */
+	if (type == ACA_SMU_TYPE_UE) {
+		list_for_each_entry(node, &banks->list, node) {
+			bank = &node->bank;
+			if (amdgpu_cper_generate_ue_record(adev, bank))
+				dev_warn(adev->dev, "fail to generate ue cper records\n");
+		}
+	} else {
+		/*
+		 * SMU_TYPE_CE banks are combined into a single CPER entry,
+		 * they could be CEs or DEs or both
+		 */
+		if (amdgpu_cper_generate_ce_records(adev, banks, count))
+			dev_warn(adev->dev, "fail to generate ce cper records\n");
+	}
+}
+
static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
bank_handler_t handler, struct ras_query_context *qctx, void *data)
{
@@ -421,6 +451,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
if (ret)
goto err_release_banks;
+ aca_banks_generate_cper(adev, type, &banks, count);
+
err_release_banks:
aca_banks_release(&banks);
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCH v2 05/12] drm/amdgpu: Generate cper records
2025-02-14 8:07 ` [PATCH v2 05/12] drm/amdgpu: Generate cper records Xiang Liu
@ 2025-02-18 19:23 ` Kees Bakker
0 siblings, 0 replies; 17+ messages in thread
From: Kees Bakker @ 2025-02-18 19:23 UTC (permalink / raw)
To: Xiang Liu, amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Yang Wang
Op 14-02-2025 om 09:07 schreef Xiang Liu:
> From: Hawking Zhang <Hawking.Zhang@amd.com>
>
> Encode the error information in CPER format and commit
> to the cper ring
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
> Reviewed-by: Yang Wang <keivnyang.wang@amd.com>
> Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 32 +++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index ed1c20bd8114..c0da9096a7fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -384,6 +384,36 @@ static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type
> return ret;
> }
>
> +static void aca_banks_generate_cper(struct amdgpu_device *adev,
> + enum aca_smu_type type,
> + struct aca_banks *banks,
> + int count)
> +{
> + struct aca_bank_node *node;
> + struct aca_bank *bank;
> +
> + if (!adev || !banks || !count) {
> + dev_warn(adev->dev, "fail to generate cper records\n");
The check for !adev is suspicious in combination with the
dev_warn(adev->dev...
> + return;
> + }
> +
> + /* UEs must be encoded into separate CPER entries */
> + if (type == ACA_SMU_TYPE_UE) {
> + list_for_each_entry(node, &banks->list, node) {
> + bank = &node->bank;
> + if (amdgpu_cper_generate_ue_record(adev, bank))
> + dev_warn(adev->dev, "fail to generate ue cper records\n");
> + }
> + } else {
> + /*
> + * SMU_TYPE_CE banks are combined into 1 CPER entries,
> + * they could be CEs or DEs or both
> + */
> + if (amdgpu_cper_generate_ce_records(adev, banks, count))
> + dev_warn(adev->dev, "fail to generate ce cper records\n");
> + }
> +}
> +
> static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
> bank_handler_t handler, struct ras_query_context *qctx, void *data)
> {
> @@ -421,6 +451,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
> if (ret)
> goto err_release_banks;
>
> + aca_banks_generate_cper(adev, type, &banks, count);
> +
> err_release_banks:
> aca_banks_release(&banks);
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH v2 06/12] drm/amdgpu: add RAS CPER ring buffer
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (4 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 05/12] drm/amdgpu: Generate cper records Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 07/12] drm/amdgpu: read CPER ring via debugfs Xiang Liu
` (6 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1
From: Tao Zhou <tao.zhou1@amd.com>
Add a RAS CPER ring buffer and initialize it. This is a pure software ring used to store RAS CPER data.
v2: update the initialization of count_dw of the cper ring; it is counted
in dwords.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 39 +++++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 29 ++++++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 +-
5 files changed, 57 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index f82aa12a88f4..cef7c1ec0d7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -365,6 +365,39 @@ int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
return 0;
}
+/* Report the CPER ring's read pointer as stored at ring->rptr_cpu_addr. */
+static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
+{
+	u64 rptr = *ring->rptr_cpu_addr;
+
+	return rptr;
+}
+
+/* Report the CPER ring's write pointer as tracked in ring->wptr. */
+static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
+{
+	u64 wptr = ring->wptr;
+
+	return wptr;
+}
+
+/*
+ * Ring callbacks for the CPER ring. Only the pointer getters are set here;
+ * every other callback is left unset.
+ */
+static const struct amdgpu_ring_funcs cper_ring_funcs = {
+	/* pointer accessors */
+	.get_rptr = amdgpu_cper_ring_get_rptr,
+	.get_wptr = amdgpu_cper_ring_get_wptr,
+	/* ring properties */
+	.type = AMDGPU_RING_TYPE_CPER,
+	.support_64bit_ptrs = false,
+	.align_mask = 0xff,
+};
+
+/*
+ * Prepare adev->cper.ring_buf and hand it to amdgpu_ring_init().
+ *
+ * The ring is named "cper", uses cper_ring_funcs, has no doorbell and no
+ * scheduler, and is requested with PAGE_SIZE as its max_dw argument and no
+ * interrupt source. Returns whatever amdgpu_ring_init() returns.
+ */
+static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *cper_ring = &adev->cper.ring_buf;
+
+	cper_ring->funcs = &cper_ring_funcs;
+	cper_ring->no_scheduler = true;
+	cper_ring->use_doorbell = false;
+	cper_ring->ring_obj = NULL;
+	/* NOTE(review): presumably amdgpu_ring_init() assigns adev itself - confirm */
+	cper_ring->adev = NULL;
+
+	sprintf(cper_ring->name, "cper");
+
+	return amdgpu_ring_init(adev, cper_ring, PAGE_SIZE, NULL, 0,
+				AMDGPU_RING_PRIO_DEFAULT, NULL);
+}
+
int amdgpu_cper_init(struct amdgpu_device *adev)
{
mutex_init(&adev->cper.cper_lock);
@@ -372,16 +405,14 @@ int amdgpu_cper_init(struct amdgpu_device *adev)
adev->cper.enabled = true;
adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;
- /*TODO: initialize cper ring*/
-
- return 0;
+ return amdgpu_cper_ring_init(adev);
}
int amdgpu_cper_fini(struct amdgpu_device *adev)
{
adev->cper.enabled = false;
- /*TODO: free cper ring */
+ amdgpu_ring_fini(&(adev->cper.ring_buf));
adev->cper.count = 0;
adev->cper.wptr = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 6860a809f2f5..80c8571cff9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -62,6 +62,7 @@ struct amdgpu_cper {
uint32_t wptr;
void *ring[CPER_MAX_ALLOWED_COUNT];
+ struct amdgpu_ring ring_buf;
};
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index cfbc18c12113..005cdaee9987 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -324,20 +324,27 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
/* always set cond_exec_polling to CONTINUE */
*ring->cond_exe_cpu_addr = 1;
- r = amdgpu_fence_driver_start_ring(ring, irq_src, irq_type);
- if (r) {
- dev_err(adev->dev, "failed initializing fences (%d).\n", r);
- return r;
- }
+ if (ring->funcs->type != AMDGPU_RING_TYPE_CPER) {
+ r = amdgpu_fence_driver_start_ring(ring, irq_src, irq_type);
+ if (r) {
+ dev_err(adev->dev, "failed initializing fences (%d).\n", r);
+ return r;
+ }
- max_ibs_dw = ring->funcs->emit_frame_size +
- amdgpu_ring_max_ibs(ring->funcs->type) * ring->funcs->emit_ib_size;
- max_ibs_dw = (max_ibs_dw + ring->funcs->align_mask) & ~ring->funcs->align_mask;
+ max_ibs_dw = ring->funcs->emit_frame_size +
+ amdgpu_ring_max_ibs(ring->funcs->type) * ring->funcs->emit_ib_size;
+ max_ibs_dw = (max_ibs_dw + ring->funcs->align_mask) & ~ring->funcs->align_mask;
- if (WARN_ON(max_ibs_dw > max_dw))
- max_dw = max_ibs_dw;
+ if (WARN_ON(max_ibs_dw > max_dw))
+ max_dw = max_ibs_dw;
- ring->ring_size = roundup_pow_of_two(max_dw * 4 * sched_hw_submission);
+ ring->ring_size = roundup_pow_of_two(max_dw * 4 * sched_hw_submission);
+ } else {
+ ring->ring_size = roundup_pow_of_two(max_dw * 4);
+ ring->count_dw = (ring->ring_size - 4) >> 2;
+ /* ring buffer is empty now */
+ ring->wptr = *ring->rptr_cpu_addr = 0;
+ }
ring->buf_mask = (ring->ring_size / 4) - 1;
ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 04af26536f97..7372e4aed6b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -82,6 +82,7 @@ enum amdgpu_ring_type {
AMDGPU_RING_TYPE_KIQ,
AMDGPU_RING_TYPE_MES,
AMDGPU_RING_TYPE_UMSCH_MM,
+ AMDGPU_RING_TYPE_CPER,
};
enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 16d924acb788..83a07309a538 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -77,7 +77,8 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,
ring->xcp_id = AMDGPU_XCP_NO_PARTITION;
if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
adev->gfx.enforce_isolation[0].xcp_id = ring->xcp_id;
- if (adev->xcp_mgr->mode == AMDGPU_XCP_MODE_NONE)
+ if ((adev->xcp_mgr->mode == AMDGPU_XCP_MODE_NONE) ||
+ (ring->funcs->type == AMDGPU_RING_TYPE_CPER))
return;
inst_mask = 1 << inst_idx;
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 07/12] drm/amdgpu: read CPER ring via debugfs
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (5 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 06/12] drm/amdgpu: add RAS CPER ring buffer Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 08/12] drm/amdgpu: add data write function for CPER ring Xiang Liu
` (5 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1
From: Tao Zhou <tao.zhou1@amd.com>
We read CPER data from read pointer to write pointer without changing
the pointers.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 47 ++++++++++++++++++------
1 file changed, 36 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 005cdaee9987..510fe1ad0628 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -500,6 +500,7 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
{
struct amdgpu_ring *ring = file_inode(f)->i_private;
uint32_t value, result, early[3];
+ uint64_t p;
loff_t i;
int r;
@@ -523,18 +524,42 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
}
}
- while (size) {
- if (*pos >= (ring->ring_size + 12))
- return result;
+ if (ring->funcs->type != AMDGPU_RING_TYPE_CPER) {
+ while (size) {
+ if (*pos >= (ring->ring_size + 12))
+ return result;
- value = ring->ring[(*pos - 12)/4];
- r = put_user(value, (uint32_t *)buf);
- if (r)
- return r;
- buf += 4;
- result += 4;
- size -= 4;
- *pos += 4;
+ value = ring->ring[(*pos - 12)/4];
+ r = put_user(value, (uint32_t *)buf);
+ if (r)
+ return r;
+ buf += 4;
+ result += 4;
+ size -= 4;
+ *pos += 4;
+ }
+ } else {
+ p = early[0];
+ if (early[0] <= early[1])
+ size = (early[1] - early[0]);
+ else
+ size = ring->ring_size - (early[0] - early[1]);
+
+ while (size) {
+ if (p == early[1])
+ return result;
+
+ value = ring->ring[p];
+ r = put_user(value, (uint32_t *)buf);
+ if (r)
+ return r;
+
+ buf += 4;
+ result += 4;
+ size--;
+ p++;
+ p &= ring->ptr_mask;
+ }
}
return result;
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 08/12] drm/amdgpu: add data write function for CPER ring
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (6 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 07/12] drm/amdgpu: read CPER ring via debugfs Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 09/12] drm/amdgpu: add mutex lock for cper ring Xiang Liu
` (4 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1
From: Tao Zhou <tao.zhou1@amd.com>
Old CPER data is overwritten when the ring buffer is full, and the read
pointer always points to a CPER header.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 93 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 2 +
2 files changed, 95 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index cef7c1ec0d7c..64624b8b0cbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -365,6 +365,99 @@ int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
return 0;
}
+/*
+ * Tell whether the ring dword at @pos starts a CPER record, i.e. whether the
+ * signature field found there compares equal to "CPER".
+ *
+ * NOTE(review): strcmp() needs a NUL right after the four signature bytes;
+ * confirm the header layout guarantees that.
+ */
+static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
+{
+	struct cper_hdr *hdr = (struct cper_hdr *)&ring->ring[pos];
+
+	return strcmp(hdr->signature, "CPER") == 0;
+}
+
+/*
+ * Size in bytes of the ring region starting at dword index @pos.
+ *
+ * - If @pos holds a CPER header, the size is that header's record_length.
+ * - Otherwise, if ring->count_dw is zero, the ring is scanned forward (up to
+ *   buf_mask) for the next CPER header and the distance to it is used.
+ * - If no length was found, the remainder of the ring after @pos is used.
+ *
+ * The result never exceeds the number of bytes between @pos and the end of
+ * the ring buffer.
+ */
+static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
+{
+	struct cper_hdr *hdr = (struct cper_hdr *)&(ring->ring[pos]);
+	u32 room = ring->ring_size - (pos << 2);
+	u32 len = 0;
+	u64 idx;
+
+	if (strcmp(hdr->signature, "CPER") == 0) {
+		len = hdr->record_length;
+	} else if (!ring->count_dw) {
+		/* ring buffer is full, look for the next header after pos */
+		for (idx = pos + 1; idx <= ring->buf_mask; idx++) {
+			hdr = (struct cper_hdr *)&(ring->ring[idx]);
+			if (strcmp(hdr->signature, "CPER") == 0) {
+				len = (idx - pos) << 2;
+				break;
+			}
+		}
+	}
+	/* else: ring buffer is not full, no cper data after ring->wptr */
+
+	return len ? min(len, room) : room;
+}
+
+/*
+ * amdgpu_cper_ring_write - copy one CPER record into the CPER ring
+ * @ring:  CPER ring to write to
+ * @src:   start of the record
+ * @count: record size in bytes
+ *
+ * The record is copied to ring->wptr in chunks bounded by
+ * amdgpu_cper_ring_get_ent_sz(), with wptr wrapped by ring->ptr_mask. If the
+ * write ran over the read pointer, rptr is advanced entry by entry until it
+ * points at a CPER header again. Finally the free-space counter count_dw is
+ * reduced by the record size (clamped at zero).
+ *
+ * Records of ring_size - 4 bytes or more are rejected with an error message.
+ */
+void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
+			    void *src, int count)
+{
+	u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
+	u32 chunk, ent_sz;
+	u8 *s = (u8 *)src;
+	int rec_dw;
+
+	if (count >= ring->ring_size - 4) {
+		dev_err(ring->adev->dev,
+			"CPER data size(%d) is larger than ring size(%d)\n",
+			count, ring->ring_size - 4);
+
+		return;
+	}
+
+	/* the copy loop below consumes count, so keep the record size in dwords */
+	rec_dw = count >> 2;
+	wptr_old = ring->wptr;
+
+	while (count) {
+		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
+		chunk = min(ent_sz, count);
+
+		memcpy(&ring->ring[ring->wptr], s, chunk);
+
+		ring->wptr += (chunk >> 2);
+		ring->wptr &= ring->ptr_mask;
+		count -= chunk;
+		s += chunk;
+	}
+
+	/* the buffer is overflow, adjust rptr */
+	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
+	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
+	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
+		pos = (ring->wptr + 1) & ring->ptr_mask;
+
+		do {
+			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);
+
+			rptr += (ent_sz >> 2);
+			rptr &= ring->ptr_mask;
+			*ring->rptr_cpu_addr = rptr;
+
+			pos = rptr;
+		} while (!amdgpu_cper_is_hdr(ring, rptr));
+	}
+
+	/* account for the space this record took; never go below zero */
+	if (ring->count_dw >= rec_dw)
+		ring->count_dw -= rec_dw;
+	else
+		ring->count_dw = 0;
+}
+
static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
return *(ring->rptr_cpu_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 80c8571cff9d..1fa41858f22e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -93,6 +93,8 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
struct aca_banks *banks,
uint16_t bank_count);
+void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
+ void *src, int count);
int amdgpu_cper_init(struct amdgpu_device *adev);
int amdgpu_cper_fini(struct amdgpu_device *adev);
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 09/12] drm/amdgpu: add mutex lock for cper ring
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (7 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 08/12] drm/amdgpu: add data write function for CPER ring Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 10/12] drm/amdgpu: Get timestamp from system time Xiang Liu
` (3 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1
From: Tao Zhou <tao.zhou1@amd.com>
Avoid conflicts between reads and writes of the ring buffer.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 21 ++++++++++++++++-----
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 64624b8b0cbc..c14742eb4d67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -423,6 +423,7 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
wptr_old = ring->wptr;
+ mutex_lock(&ring->adev->cper.ring_lock);
while (count) {
ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
chunk = min(ent_sz, count);
@@ -451,6 +452,7 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
pos = rptr;
} while (!amdgpu_cper_is_hdr(ring, rptr));
}
+ mutex_unlock(&ring->adev->cper.ring_lock);
if (ring->count_dw >= (count >> 2))
ring->count_dw - (count >> 2);
@@ -480,6 +482,8 @@ static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
struct amdgpu_ring *ring = &(adev->cper.ring_buf);
+ mutex_init(&adev->cper.ring_lock);
+
ring->adev = NULL;
ring->ring_obj = NULL;
ring->use_doorbell = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 1fa41858f22e..527835cbf0d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -63,6 +63,7 @@ struct amdgpu_cper {
void *ring[CPER_MAX_ALLOWED_COUNT];
struct amdgpu_ring ring_buf;
+ struct mutex ring_lock;
};
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 510fe1ad0628..5293eef4f0dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -510,13 +510,18 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
result = 0;
if (*pos < 12) {
+ if (ring->funcs->type == AMDGPU_RING_TYPE_CPER)
+ mutex_lock(&ring->adev->cper.ring_lock);
+
early[0] = amdgpu_ring_get_rptr(ring) & ring->buf_mask;
early[1] = amdgpu_ring_get_wptr(ring) & ring->buf_mask;
early[2] = ring->wptr & ring->buf_mask;
for (i = *pos / 4; i < 3 && size; i++) {
r = put_user(early[i], (uint32_t *)buf);
- if (r)
- return r;
+ if (r) {
+ result = r;
+ goto out;
+ }
buf += 4;
result += 4;
size -= 4;
@@ -547,12 +552,14 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
while (size) {
if (p == early[1])
- return result;
+ goto out;
value = ring->ring[p];
r = put_user(value, (uint32_t *)buf);
- if (r)
- return r;
+ if (r) {
+ result = r;
+ goto out;
+ }
buf += 4;
result += 4;
@@ -562,6 +569,10 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf,
}
}
+out:
+ if (ring->funcs->type == AMDGPU_RING_TYPE_CPER)
+ mutex_unlock(&ring->adev->cper.ring_lock);
+
return result;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 10/12] drm/amdgpu: Get timestamp from system time
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (8 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 09/12] drm/amdgpu: add mutex lock for cper ring Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 11/12] drm/amdgpu: Commit CPER entry Xiang Liu
` (2 subsequent siblings)
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Xiang Liu
Get the system real-time clock (no time zone offset applied) and encode it as the timestamp for CPER.
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index c14742eb4d67..0bdc08fba3b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -36,6 +36,22 @@ static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
hdr->record_length += size;
}
+/*
+ * Fill @timestamp from the current wall-clock time.
+ *
+ * The seconds value from ktime_get_real_seconds() is broken down with a zero
+ * offset. The month is stored 1-based, and the full year (1900 + tm_year) is
+ * split into century and two-digit year. The flag field is cleared.
+ */
+static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
+{
+	struct tm now_tm;
+	long full_year;
+
+	time64_to_tm(ktime_get_real_seconds(), 0, &now_tm);
+	full_year = 1900 + now_tm.tm_year;
+
+	/* date */
+	timestamp->century = full_year / 100;
+	timestamp->year = full_year % 100;
+	timestamp->month = 1 + now_tm.tm_mon;
+	timestamp->day = now_tm.tm_mday;
+
+	/* time of day */
+	timestamp->hours = now_tm.tm_hour;
+	timestamp->minutes = now_tm.tm_min;
+	timestamp->seconds = now_tm.tm_sec;
+
+	timestamp->flag = 0;
+}
+
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
struct cper_hdr *hdr,
enum amdgpu_cper_type type,
@@ -52,7 +68,8 @@ void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
hdr->valid_bits.platform_id = 1;
hdr->valid_bits.partition_id = 1;
hdr->valid_bits.timestamp = 1;
- /*TODO need to initialize hdr->timestamp */
+
+ amdgpu_cper_get_timestamp(&hdr->timestamp);
snprintf(hdr->record_id, 8, "%d", atomic_inc_return(&adev->cper.unique_id));
snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 11/12] drm/amdgpu: Commit CPER entry
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (9 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 10/12] drm/amdgpu: Get timestamp from system time Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-02-14 8:07 ` [PATCH v2 12/12] drm/amdgpu: Generate bad page threshold cper records Xiang Liu
2025-03-28 10:27 ` [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Aravind Iddamsetty
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Xiang Liu
Commit the CPER entry to the ring buffer.
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 0bdc08fba3b1..00f953ed6740 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -280,6 +280,7 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
{
struct cper_hdr *fatal = NULL;
struct cper_sec_crashdump_reg_data reg_data = { 0 };
+ struct amdgpu_ring *ring = &adev->cper.ring_buf;
int ret;
fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
@@ -302,7 +303,7 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
if (ret)
return ret;
- /*TODO: commit the cper entry to cper ring */
+ amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
return 0;
}
@@ -329,6 +330,7 @@ int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
{
struct cper_hdr *corrected = NULL;
enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
+ struct amdgpu_ring *ring = &adev->cper.ring_buf;
uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
struct aca_bank_node *node;
struct aca_bank *bank;
@@ -377,7 +379,7 @@ int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
return ret;
}
- /*TODO: commit the cper entry to cper ring */
+ amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
return 0;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH v2 12/12] drm/amdgpu: Generate bad page threshold cper records
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (10 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 11/12] drm/amdgpu: Commit CPER entry Xiang Liu
@ 2025-02-14 8:07 ` Xiang Liu
2025-03-28 10:27 ` [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Aravind Iddamsetty
12 siblings, 0 replies; 17+ messages in thread
From: Xiang Liu @ 2025-02-14 8:07 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, tao.zhou1, Xiang Liu
Generate a CPER record when the bad page threshold is exceeded and
commit it to the CPER ring.
v2: return -ENOMEM instead of false
v2: check return value of fill section function
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 23 +++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 2 ++
drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 3 +++
3 files changed, 28 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 00f953ed6740..67ad26c5e6df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -308,6 +308,29 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
return 0;
}
+/**
+ * amdgpu_cper_generate_bp_threshold_record - emit a bad page threshold CPER record
+ * @adev: amdgpu device the record is generated for
+ *
+ * Allocates a single-section CPER entry, fills in its header and the bad page
+ * threshold section, and writes the entry to the device's CPER ring
+ * (adev->cper.ring_buf). The temporary entry is released on every exit path.
+ *
+ * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
+ * error returned by amdgpu_cper_entry_fill_bad_page_threshold_section().
+ */
+int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
+{
+	struct cper_hdr *bp_threshold = NULL;
+	struct amdgpu_ring *ring = &adev->cper.ring_buf;
+	int ret;
+
+	/*
+	 * NOTE(review): the entry is allocated as AMDGPU_CPER_TYPE_FATAL while
+	 * the header below is filled as AMDGPU_CPER_TYPE_BP_THRESHOLD; confirm
+	 * that amdgpu_cper_alloc_entry() sizes the record correctly for a bad
+	 * page threshold section.
+	 */
+	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
+	if (!bp_threshold) {
+		dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");
+		return -ENOMEM;
+	}
+
+	amdgpu_cper_entry_fill_hdr(adev, bp_threshold, AMDGPU_CPER_TYPE_BP_THRESHOLD,
+				   CPER_SEV_FATAL);
+	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
+	if (ret)
+		goto out;
+
+	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
+
+out:
+	/*
+	 * The entry is only a staging buffer, so free it whether or not the
+	 * record was committed; previously it was leaked on both paths.
+	 * NOTE(review): assumes amdgpu_cper_alloc_entry() allocates with the
+	 * kmalloc family and amdgpu_cper_ring_write() copies the data - confirm.
+	 */
+	kfree(bp_threshold);
+	return ret;
+}
+
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
enum aca_error_type aca_err_type)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 527835cbf0d3..561e0a43b4b8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -94,6 +94,8 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
struct aca_banks *banks,
uint16_t bank_count);
+/* The bad page threshold event is encoded into a separate CPER entry */
+int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev);
void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
void *src, int count);
int amdgpu_cper_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index ba6e44951e57..c7abc0c4e87c 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -716,6 +716,9 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev)
ret = smu_send_rma_reason(smu);
mutex_unlock(&adev->pm.mutex);
+ if (amdgpu_cper_generate_bp_threshold_record(adev))
+ dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
+
return ret;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring
2025-02-14 8:07 [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Xiang Liu
` (11 preceding siblings ...)
2025-02-14 8:07 ` [PATCH v2 12/12] drm/amdgpu: Generate bad page threshold cper records Xiang Liu
@ 2025-03-28 10:27 ` Aravind Iddamsetty
2025-03-28 12:12 ` Aravind Iddamsetty
12 siblings, 1 reply; 17+ messages in thread
From: Aravind Iddamsetty @ 2025-03-28 10:27 UTC (permalink / raw)
To: Hawking.Zhang, tao.zhou1, Xiang Liu
Cc: amd-gfx, airlied@gmail.com, intel-xe@lists.freedesktop.org,
alexander.deucher@amd.com, ckoenig.leichtzumerken,
joonas.lahtinen@linux.intel.com, simona
Hi,
Based on the discussions around using Netlink for RAS purposes, as
summarized in this blog post [1] by Dave Airlie. I had proposed a series
regarding RAS infrastructure in DRM [2].
I came across your work, which appears to address related areas and I'm
particularly interested in understanding how it aligns with or could be
adapted to the ongoing discussions around leveraging Netlink for RAS.
Could you share your perspective on the potential integration of your
efforts with Netlink? Do you foresee any challenges or opportunities in
aligning with the approach discussed in the above-mentioned blog post
and series?
Looking forward to your insights and any additional thoughts you may
have on this topic.
[1]
https://airlied.blogspot.com/2022/09/accelerators-bof-outcomes-summary.html
[2]
https://lore.kernel.org/all/20231020155835.1295524-1-aravind.iddamsetty@linux.intel.com/
Thanks,
Aravind.
On 14-02-2025 13:37, Xiang Liu wrote:
> This patch series generate RAS CPER records for UE/DE/CE/BP threshold exceed
> event. SMU_TYPE_CE banks are combined into 1 CPER entry, they could be CEs or
> DEs or both. UEs and BPs are encoded into separate CPER entries.
>
> RAS CPER records for CEs will be generated only after CEs count been queried.
>
> All records are committed to a pure software ring with a limit size, new records
> will flush older records when overflow happened. User can access the records by
> reading debugfs node, which is read-only.
>
> Hawking Zhang (5):
> drm/amd/include: Add amd cper header
> drm/amdgpu: Introduce funcs for populating CPER
> drm/amdgpu: Include ACA error type in aca bank
> drm/amdgpu: Introduce funcs for generating cper record
> drm/amdgpu: Generate cper records
>
> Tao Zhou (4):
> drm/amdgpu: add RAS CPER ring buffer
> drm/amdgpu: read CPER ring via debugfs
> drm/amdgpu: add data write function for CPER ring
> drm/amdgpu: add mutex lock for cper ring
>
> Xiang Liu (3):
> drm/amdgpu: Get timestamp from system time
> drm/amdgpu: Commit CPER entry
> drm/amdgpu: Generate bad page threshold cper records
>
> drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 46 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 559 +++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 104 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 91 +++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +
> drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +
> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +
> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 1 +
> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +
> drivers/gpu/drm/amd/include/amd_cper.h | 269 ++++++++++
> drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 3 +
> 19 files changed, 1075 insertions(+), 40 deletions(-)
> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
> create mode 100644 drivers/gpu/drm/amd/include/amd_cper.h
>
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring
2025-03-28 10:27 ` [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring Aravind Iddamsetty
@ 2025-03-28 12:12 ` Aravind Iddamsetty
2025-04-16 9:40 ` Aravind Iddamsetty
0 siblings, 1 reply; 17+ messages in thread
From: Aravind Iddamsetty @ 2025-03-28 12:12 UTC (permalink / raw)
To: Hawking.Zhang, tao.zhou1, Xiang Liu
Cc: amd-gfx, airlied@gmail.com, intel-xe@lists.freedesktop.org,
alexander.deucher@amd.com, ckoenig.leichtzumerken,
joonas.lahtinen@linux.intel.com, simona,
dri-devel@lists.freedesktop.org
++ dri-devel
On 28-03-2025 15:57, Aravind Iddamsetty wrote:
> Hi,
>
> Based on the discussions around using Netlink for RAS purposes, as
> summarized in this blog post [1] by Dave Airlie. I had proposed a series
> regarding RAS infrastructure in DRM [2].
>
> I came across your work, which appears to address related areas and I'm
> particularly interested in understanding how it aligns with or could be
> adapted to the ongoing discussions around leveraging Netlink for RAS.
>
> Could you share your perspective on the potential integration of your
> efforts with Netlink? Do you foresee any challenges or opportunities in
> aligning with the approach discussed in the above-mentioned blog post
> and series?
>
> Looking forward to your insights and any additional thoughts you may
> have on this topic.
>
>
> [1]
> https://airlied.blogspot.com/2022/09/accelerators-bof-outcomes-summary.html
>
> [2]
> https://lore.kernel.org/all/20231020155835.1295524-1-aravind.iddamsetty@linux.intel.com/
>
>
> Thanks,
> Aravind.
> On 14-02-2025 13:37, Xiang Liu wrote:
>> This patch series generate RAS CPER records for UE/DE/CE/BP threshold exceed
>> event. SMU_TYPE_CE banks are combined into 1 CPER entry, they could be CEs or
>> DEs or both. UEs and BPs are encoded into separate CPER entries.
>>
>> RAS CPER records for CEs will be generated only after CEs count been queried.
>>
>> All records are committed to a pure software ring with a limit size, new records
>> will flush older records when overflow happened. User can access the records by
>> reading debugfs node, which is read-only.
>>
>> Hawking Zhang (5):
>> drm/amd/include: Add amd cper header
>> drm/amdgpu: Introduce funcs for populating CPER
>> drm/amdgpu: Include ACA error type in aca bank
>> drm/amdgpu: Introduce funcs for generating cper record
>> drm/amdgpu: Generate cper records
>>
>> Tao Zhou (4):
>> drm/amdgpu: add RAS CPER ring buffer
>> drm/amdgpu: read CPER ring via debugfs
>> drm/amdgpu: add data write function for CPER ring
>> drm/amdgpu: add mutex lock for cper ring
>>
>> Xiang Liu (3):
>> drm/amdgpu: Get timestamp from system time
>> drm/amdgpu: Commit CPER entry
>> drm/amdgpu: Generate bad page threshold cper records
>>
>> drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 46 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 559 +++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 104 ++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 91 +++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +
>> drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 +-
>> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +
>> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +
>> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +
>> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +
>> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +
>> drivers/gpu/drm/amd/include/amd_cper.h | 269 ++++++++++
>> drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 3 +
>> 19 files changed, 1075 insertions(+), 40 deletions(-)
>> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
>> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
>> create mode 100644 drivers/gpu/drm/amd/include/amd_cper.h
>>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 00/12] Generate CPER records for RAS and commit to CPER ring
2025-03-28 12:12 ` Aravind Iddamsetty
@ 2025-04-16 9:40 ` Aravind Iddamsetty
0 siblings, 0 replies; 17+ messages in thread
From: Aravind Iddamsetty @ 2025-04-16 9:40 UTC (permalink / raw)
To: Hawking.Zhang, tao.zhou1, Xiang Liu
Cc: amd-gfx, airlied@gmail.com, intel-xe@lists.freedesktop.org,
alexander.deucher@amd.com, ckoenig.leichtzumerken,
joonas.lahtinen@linux.intel.com, simona,
dri-devel@lists.freedesktop.org
[-- Attachment #1: Type: text/plain, Size: 3789 bytes --]
Hi,
I would appreciate it if you could kindly let me know your thoughts.
Thanks,
Aravind.
On 28-03-2025 17:42, Aravind Iddamsetty wrote:
> ++ dri-devel
>
> On 28-03-2025 15:57, Aravind Iddamsetty wrote:
>> Hi,
>>
>> Based on the discussions around using Netlink for RAS purposes, as
>> summarized in this blog post [1] by Dave Airlie. I had proposed a series
>> regarding RAS infrastructure in DRM [2].
>>
>> I came across your work, which appears to address related areas and I'm
>> particularly interested in understanding how it aligns with or could be
>> adapted to the ongoing discussions around leveraging Netlink for RAS.
>>
>> Could you share your perspective on the potential integration of your
>> efforts with Netlink? Do you foresee any challenges or opportunities in
>> aligning with the approach discussed in the above-mentioned blog post
>> and series?
>>
>> Looking forward to your insights and any additional thoughts you may
>> have on this topic.
>>
>>
>> [1]
>> https://airlied.blogspot.com/2022/09/accelerators-bof-outcomes-summary.html
>>
>> [2]
>> https://lore.kernel.org/all/20231020155835.1295524-1-aravind.iddamsetty@linux.intel.com/
>>
>>
>> Thanks,
>> Aravind.
>> On 14-02-2025 13:37, Xiang Liu wrote:
>>> This patch series generate RAS CPER records for UE/DE/CE/BP threshold exceed
>>> event. SMU_TYPE_CE banks are combined into 1 CPER entry, they could be CEs or
>>> DEs or both. UEs and BPs are encoded into separate CPER entries.
>>>
>>> RAS CPER records for CEs will be generated only after CEs count been queried.
>>>
>>> All records are committed to a pure software ring with a limit size, new records
>>> will flush older records when overflow happened. User can access the records by
>>> reading debugfs node, which is read-only.
>>>
>>> Hawking Zhang (5):
>>> drm/amd/include: Add amd cper header
>>> drm/amdgpu: Introduce funcs for populating CPER
>>> drm/amdgpu: Include ACA error type in aca bank
>>> drm/amdgpu: Introduce funcs for generating cper record
>>> drm/amdgpu: Generate cper records
>>>
>>> Tao Zhou (4):
>>> drm/amdgpu: add RAS CPER ring buffer
>>> drm/amdgpu: read CPER ring via debugfs
>>> drm/amdgpu: add data write function for CPER ring
>>> drm/amdgpu: add mutex lock for cper ring
>>>
>>> Xiang Liu (3):
>>> drm/amdgpu: Get timestamp from system time
>>> drm/amdgpu: Commit CPER entry
>>> drm/amdgpu: Generate bad page threshold cper records
>>>
>>> drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 46 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 559 +++++++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h | 104 ++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 91 +++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +
>>> drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 +-
>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +
>>> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +
>>> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +
>>> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 2 +
>>> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 1 +
>>> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +
>>> drivers/gpu/drm/amd/include/amd_cper.h | 269 ++++++++++
>>> drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 3 +
>>> 19 files changed, 1075 insertions(+), 40 deletions(-)
>>> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
>>> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
>>> create mode 100644 drivers/gpu/drm/amd/include/amd_cper.h
>>>
[-- Attachment #2: Type: text/html, Size: 5250 bytes --]
^ permalink raw reply [flat|nested] 17+ messages in thread