* [PATCH 1/4] efi/cper, cxl: Make definitions and structures global
2024-05-22 15:08 [PATCH 0/4] acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors Smita Koralahalli
@ 2024-05-22 15:08 ` Smita Koralahalli
2024-05-22 17:28 ` Dave Jiang
` (2 more replies)
2024-05-22 15:08 ` [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors Smita Koralahalli
` (2 subsequent siblings)
3 siblings, 3 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-22 15:08 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
In preparation to add tracepoint support, move protocol error UUID
definition to a common location and make CXL RAS capability struct
global for use across different modules.
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/firmware/efi/cper_cxl.c | 11 -----------
drivers/firmware/efi/cper_cxl.h | 7 ++-----
include/linux/cper.h | 4 ++++
include/linux/cxl-event.h | 11 +++++++++++
4 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..4fd8d783993e 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -18,17 +18,6 @@
#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
-/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
-struct cxl_ras_capability_regs {
- u32 uncor_status;
- u32 uncor_mask;
- u32 uncor_severity;
- u32 cor_status;
- u32 cor_mask;
- u32 cap_control;
- u32 header_log[16];
-};
-
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
"Restricted CXL Host Downstream Port",
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 86bfcf7909ec..6f8c00495708 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -7,14 +7,11 @@
* Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
*/
+#include <linux/cxl-event.h>
+
#ifndef LINUX_CPER_CXL_H
#define LINUX_CPER_CXL_H
-/* CXL Protocol Error Section */
-#define CPER_SEC_CXL_PROT_ERR \
- GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
- 0x4B, 0x77, 0x10, 0x48)
-
#pragma pack(1)
/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 265b0f8fc0b3..5c6d4d5b9975 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -89,6 +89,10 @@ enum {
#define CPER_NOTIFY_DMAR \
GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
0x72, 0x2D, 0xEB, 0x41)
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR \
+ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
+ 0x4B, 0x77, 0x10, 0x48)
/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
/*
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 60b25020281f..f11e52ff565a 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -154,6 +154,17 @@ struct cxl_cper_event_rec {
union cxl_event event;
} __packed;
+/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
+struct cxl_ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
struct cxl_cper_work_data {
enum cxl_event_type event_type;
struct cxl_cper_event_rec rec;
--
2.17.1
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] efi/cper, cxl: Make definitions and structures global
2024-05-22 15:08 ` [PATCH 1/4] efi/cper, cxl: Make definitions and structures global Smita Koralahalli
@ 2024-05-22 17:28 ` Dave Jiang
2024-05-22 23:40 ` Alison Schofield
2024-06-07 15:14 ` Jonathan Cameron
2 siblings, 0 replies; 28+ messages in thread
From: Dave Jiang @ 2024-05-22 17:28 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> ---
> drivers/firmware/efi/cper_cxl.c | 11 -----------
> drivers/firmware/efi/cper_cxl.h | 7 ++-----
> include/linux/cper.h | 4 ++++
> include/linux/cxl-event.h | 11 +++++++++++
> 4 files changed, 17 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index a55771b99a97..4fd8d783993e 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -18,17 +18,6 @@
> #define PROT_ERR_VALID_DVSEC BIT_ULL(5)
> #define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
>
> -/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
> -struct cxl_ras_capability_regs {
> - u32 uncor_status;
> - u32 uncor_mask;
> - u32 uncor_severity;
> - u32 cor_status;
> - u32 cor_mask;
> - u32 cap_control;
> - u32 header_log[16];
> -};
> -
> static const char * const prot_err_agent_type_strs[] = {
> "Restricted CXL Device",
> "Restricted CXL Host Downstream Port",
> diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
> index 86bfcf7909ec..6f8c00495708 100644
> --- a/drivers/firmware/efi/cper_cxl.h
> +++ b/drivers/firmware/efi/cper_cxl.h
> @@ -7,14 +7,11 @@
> * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> */
>
> +#include <linux/cxl-event.h>
> +
> #ifndef LINUX_CPER_CXL_H
> #define LINUX_CPER_CXL_H
>
> -/* CXL Protocol Error Section */
> -#define CPER_SEC_CXL_PROT_ERR \
> - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> - 0x4B, 0x77, 0x10, 0x48)
> -
> #pragma pack(1)
>
> /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
> diff --git a/include/linux/cper.h b/include/linux/cper.h
> index 265b0f8fc0b3..5c6d4d5b9975 100644
> --- a/include/linux/cper.h
> +++ b/include/linux/cper.h
> @@ -89,6 +89,10 @@ enum {
> #define CPER_NOTIFY_DMAR \
> GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
> 0x72, 0x2D, 0xEB, 0x41)
> +/* CXL Protocol Error Section */
> +#define CPER_SEC_CXL_PROT_ERR \
> + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
> + 0x4B, 0x77, 0x10, 0x48)
>
> /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
> /*
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 60b25020281f..f11e52ff565a 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -154,6 +154,17 @@ struct cxl_cper_event_rec {
> union cxl_event event;
> } __packed;
>
> +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
> +struct cxl_ras_capability_regs {
> + u32 uncor_status;
> + u32 uncor_mask;
> + u32 uncor_severity;
> + u32 cor_status;
> + u32 cor_mask;
> + u32 cap_control;
> + u32 header_log[16];
> +};
> +
> struct cxl_cper_work_data {
> enum cxl_event_type event_type;
> struct cxl_cper_event_rec rec;
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 1/4] efi/cper, cxl: Make definitions and structures global
2024-05-22 15:08 ` [PATCH 1/4] efi/cper, cxl: Make definitions and structures global Smita Koralahalli
2024-05-22 17:28 ` Dave Jiang
@ 2024-05-22 23:40 ` Alison Schofield
2024-06-07 15:14 ` Jonathan Cameron
2 siblings, 0 replies; 28+ messages in thread
From: Alison Schofield @ 2024-05-22 23:40 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
On Wed, May 22, 2024 at 03:08:36PM +0000, Smita Koralahalli wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/firmware/efi/cper_cxl.c | 11 -----------
> drivers/firmware/efi/cper_cxl.h | 7 ++-----
> include/linux/cper.h | 4 ++++
> include/linux/cxl-event.h | 11 +++++++++++
> 4 files changed, 17 insertions(+), 16 deletions(-)
>
snip to end
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] efi/cper, cxl: Make definitions and structures global
2024-05-22 15:08 ` [PATCH 1/4] efi/cper, cxl: Make definitions and structures global Smita Koralahalli
2024-05-22 17:28 ` Dave Jiang
2024-05-22 23:40 ` Alison Schofield
@ 2024-06-07 15:14 ` Jonathan Cameron
2024-06-10 18:31 ` Smita Koralahalli
2 siblings, 1 reply; 28+ messages in thread
From: Jonathan Cameron @ 2024-06-07 15:14 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel,
Alison Schofield, Vishal Verma, Ira Weiny, Dan Williams,
Yazen Ghannam, Bowman Terry
On Wed, 22 May 2024 15:08:36 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> In preparation to add tracepoint support, move protocol error UUID
> definition to a common location and make CXL RAS capability struct
> global for use across different modules.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
FWIW given already well reviewed.
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 1/4] efi/cper, cxl: Make definitions and structures global
2024-06-07 15:14 ` Jonathan Cameron
@ 2024-06-10 18:31 ` Smita Koralahalli
0 siblings, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-06-10 18:31 UTC (permalink / raw)
To: Jonathan Cameron
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel,
Alison Schofield, Vishal Verma, Ira Weiny, Dan Williams,
Yazen Ghannam, Bowman Terry
On 6/7/2024 8:14 AM, Jonathan Cameron wrote:
> On Wed, 22 May 2024 15:08:36 +0000
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>
>> In preparation to add tracepoint support, move protocol error UUID
>> definition to a common location and make CXL RAS capability struct
>> global for use across different modules.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> FWIW given already well reviewed.
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>
Sorry I missed your reviewed by tag. Thanks for reviewing my patches!
Thanks
Smita
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-22 15:08 [PATCH 0/4] acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors Smita Koralahalli
2024-05-22 15:08 ` [PATCH 1/4] efi/cper, cxl: Make definitions and structures global Smita Koralahalli
@ 2024-05-22 15:08 ` Smita Koralahalli
2024-05-22 17:59 ` Dave Jiang
2024-05-23 0:03 ` Alison Schofield
2024-05-22 15:08 ` [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First " Smita Koralahalli
2024-05-22 15:08 ` [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev() Smita Koralahalli
3 siblings, 2 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-22 15:08 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
Severity, Device ID, Device Serial number and CXL RAS capability struct in
struct cxl_cper_prot_err. Include this struct as a member of struct
cxl_cper_work_data.
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/acpi/apei/ghes.c | 10 +++++
drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
include/linux/cxl-event.h | 26 +++++++++++++
3 files changed, 102 insertions(+)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 623cc0cb4a65..1a58032770ee 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
schedule_work(cxl_cper_work);
}
+static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+ struct cxl_cper_work_data wd;
+
+ if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
+ return;
+}
+
int cxl_cper_register_work(struct work_struct *work)
{
if (cxl_cper_work)
@@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ cxl_cper_handle_prot_err(gdata);
} else {
void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..03b9839f3b73 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@
*/
#include <linux/cper.h>
+#include <acpi/ghes.h>
#include "cper_cxl.h"
#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
@@ -44,6 +45,17 @@ enum {
USP, /* CXL Upstream Switch Port */
};
+static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
+{
+ switch (cper_severity) {
+ case CPER_SEV_RECOVERABLE:
+ case CPER_SEV_FATAL:
+ return CXL_AER_UNCORRECTABLE;
+ default:
+ return CXL_AER_CORRECTABLE;
+ }
+}
+
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
sizeof(cxl_ras->header_log), 0);
}
}
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_prot_err *p_err)
+{
+ struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ u8 *dvsec_start, *cap_start;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+ pr_err(FW_WARN "No Device ID\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The device ID or agent address is required for CXL RCD, CXL
+ * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
+ * CXL Downstream Switch Port and CXL Upstream Switch Port.
+ */
+ if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
+ p_err->segment = prot_err->agent_addr.segment;
+ p_err->bus = prot_err->agent_addr.bus;
+ p_err->device = prot_err->agent_addr.device;
+ p_err->function = prot_err->agent_addr.function;
+ } else {
+ pr_err(FW_WARN "Invalid agent type\n");
+ return -EINVAL;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err(FW_WARN "Invalid Protocol Error log\n");
+ return -EINVAL;
+ }
+
+ dvsec_start = (u8 *)(prot_err + 1);
+ cap_start = dvsec_start + prot_err->dvsec_len;
+ p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
+
+ /*
+ * Set device serial number unconditionally.
+ *
+ * Print a warning message if it is not valid. The device serial
+ * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
+ * Manager Managed LD.
+ */
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+ prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
+ pr_warn(FW_WARN "No Device Serial number\n");
+
+ p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
+ p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+ p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
+
+ return 0;
+}
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index f11e52ff565a..9c7b69e076a0 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -165,11 +165,37 @@ struct cxl_ras_capability_regs {
u32 header_log[16];
};
+enum cxl_aer_err_type {
+ CXL_AER_UNCORRECTABLE,
+ CXL_AER_CORRECTABLE,
+};
+
+struct cxl_cper_prot_err {
+ struct cxl_ras_capability_regs cxl_ras;
+
+ /* Device ID */
+ u8 function;
+ u8 device;
+ u8 bus;
+ u16 segment;
+
+ /* Device Serial Number */
+ u32 lower_dw;
+ u32 upper_dw;
+
+ int severity;
+};
+
struct cxl_cper_work_data {
enum cxl_event_type event_type;
struct cxl_cper_event_rec rec;
+ struct cxl_cper_prot_err p_err;
};
+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+ struct cxl_cper_prot_err *p_err);
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
--
2.17.1
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-22 15:08 ` [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors Smita Koralahalli
@ 2024-05-22 17:59 ` Dave Jiang
2024-05-23 21:19 ` Smita Koralahalli
2024-05-23 0:03 ` Alison Schofield
1 sibling, 1 reply; 28+ messages in thread
From: Dave Jiang @ 2024-05-22 17:59 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/acpi/apei/ghes.c | 10 +++++
> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> include/linux/cxl-event.h | 26 +++++++++++++
> 3 files changed, 102 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 623cc0cb4a65..1a58032770ee 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> schedule_work(cxl_cper_work);
> }
>
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> + struct cxl_cper_work_data wd;
> +
> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> + return;
> +}
> +
> int cxl_cper_register_work(struct work_struct *work)
> {
> if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>
> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + cxl_cper_handle_prot_err(gdata);
> } else {
> void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..03b9839f3b73 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <acpi/ghes.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
> USP, /* CXL Upstream Switch Port */
> };
>
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> + switch (cper_severity) {
> + case CPER_SEV_RECOVERABLE:
> + case CPER_SEV_FATAL:
> + return CXL_AER_UNCORRECTABLE;
> + default:
> + return CXL_AER_CORRECTABLE;
> + }
> +}
> +
> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> {
> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> sizeof(cxl_ras->header_log), 0);
> }
> }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err)
> +{
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> + u8 *dvsec_start, *cap_start;
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> + pr_err(FW_WARN "No Device ID\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device ID or agent address is required for CXL RCD, CXL
> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> + */
> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
> + p_err->segment = prot_err->agent_addr.segment;
> + p_err->bus = prot_err->agent_addr.bus;
> + p_err->device = prot_err->agent_addr.device;
> + p_err->function = prot_err->agent_addr.function;
> + } else {
> + pr_err(FW_WARN "Invalid agent type\n");
> + return -EINVAL;
> + }
Up to you if you want to do this or not, but maybe:
if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
pr_warn(...);
return -EINVAL;
}
p_err->segment = ...;
p_err->bus = ...;
...
Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> + return -EINVAL;
> + }
> +
> + dvsec_start = (u8 *)(prot_err + 1);
> + cap_start = dvsec_start + prot_err->dvsec_len;
> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> + /*
> + * Set device serial number unconditionally.
> + *
> + * Print a warning message if it is not valid. The device serial
> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> + * Manager Managed LD.
> + */
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
DJ
> + pr_warn(FW_WARN "No Device Serial number\n");
> +
> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> + return 0;
> +}
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index f11e52ff565a..9c7b69e076a0 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -165,11 +165,37 @@ struct cxl_ras_capability_regs {
> u32 header_log[16];
> };
>
> +enum cxl_aer_err_type {
> + CXL_AER_UNCORRECTABLE,
> + CXL_AER_CORRECTABLE,
> +};
> +
> +struct cxl_cper_prot_err {
> + struct cxl_ras_capability_regs cxl_ras;
> +
> + /* Device ID */
> + u8 function;
> + u8 device;
> + u8 bus;
> + u16 segment;
> +
> + /* Device Serial Number */
> + u32 lower_dw;
> + u32 upper_dw;
> +
> + int severity;
> +};
> +
> struct cxl_cper_work_data {
> enum cxl_event_type event_type;
> struct cxl_cper_event_rec rec;
> + struct cxl_cper_prot_err p_err;
> };
>
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err);
> +
> #ifdef CONFIG_ACPI_APEI_GHES
> int cxl_cper_register_work(struct work_struct *work);
> int cxl_cper_unregister_work(struct work_struct *work);
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-22 17:59 ` Dave Jiang
@ 2024-05-23 21:19 ` Smita Koralahalli
2024-05-23 22:51 ` Dave Jiang
0 siblings, 1 reply; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-23 21:19 UTC (permalink / raw)
To: Dave Jiang, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
Hi Dave,
On 5/22/2024 10:59 AM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/acpi/apei/ghes.c | 10 +++++
>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>> include/linux/cxl-event.h | 26 +++++++++++++
>> 3 files changed, 102 insertions(+)
>>
[snip]
>> + * The device ID or agent address is required for CXL RCD, CXL
>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>> + */
>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>
> Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
Agreed. I will define a boolean array indexed by agent type as suggested
by Alison. That would avoid all these comparisons and not worry about
breaking code in future.
>
>> + p_err->segment = prot_err->agent_addr.segment;
>> + p_err->bus = prot_err->agent_addr.bus;
>> + p_err->device = prot_err->agent_addr.device;
>> + p_err->function = prot_err->agent_addr.function;
>> + } else {
>> + pr_err(FW_WARN "Invalid agent type\n");
>> + return -EINVAL;
>> + }
>
> Up to you if you want to do this or not, but maybe:
>
> if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
> pr_warn(...);
> return -EINVAL;
> }
>
> p_err->segment = ...;
> p_err->bus = ...;
Noted.
> ...
>
> Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
Okay.
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>> + return -EINVAL;
>> + }
>> +
>> + dvsec_start = (u8 *)(prot_err + 1);
>> + cap_start = dvsec_start + prot_err->dvsec_len;
>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>> +
>> + /*
>> + * Set device serial number unconditionally.
>> + *
>> + * Print a warning message if it is not valid. The device serial
>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>> + * Manager Managed LD.
>> + */
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>
> prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
Probably helper function is not required if boolean array is defined?
What do you think?
Thanks,
Smita
[snip]
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-23 21:19 ` Smita Koralahalli
@ 2024-05-23 22:51 ` Dave Jiang
0 siblings, 0 replies; 28+ messages in thread
From: Dave Jiang @ 2024-05-23 22:51 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/23/24 2:19 PM, Smita Koralahalli wrote:
> Hi Dave,
>
> On 5/22/2024 10:59 AM, Dave Jiang wrote:
>>
>>
>> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>>
>>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>>> struct cxl_cper_prot_err. Include this struct as a member of struct
>>> cxl_cper_work_data.
>>>
>>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>>> ---
>>> drivers/acpi/apei/ghes.c | 10 +++++
>>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>>> include/linux/cxl-event.h | 26 +++++++++++++
>>> 3 files changed, 102 insertions(+)
>>>
>
> [snip]
>
>
>>> + * The device ID or agent address is required for CXL RCD, CXL
>>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>>> + */
>>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>>
>> Perhaps define an enum CXL_AGENT_TYPE_MAX instead of 0x7 magic number? Otherwise if a new type is introduced, it would break this code.
>
> Agreed. I will define a boolean array indexed by agent type as suggested by Alison. That would avoid all these comparisons and not worry about breaking code in future.
>
>>
>>> + p_err->segment = prot_err->agent_addr.segment;
>>> + p_err->bus = prot_err->agent_addr.bus;
>>> + p_err->device = prot_err->agent_addr.device;
>>> + p_err->function = prot_err->agent_addr.function;
>>> + } else {
>>> + pr_err(FW_WARN "Invalid agent type\n");
>>> + return -EINVAL;
>>> + }
>>
>> Up to you if you want to do this or not, but maybe:
>>
>> if (prot_err->agent_type >= CXL_AGENT_TYPE_MAX || prot_err->agent_type == RCH_DP) {
>> pr_warn(...);
>> return -EINVAL;
>> }
>>
>> p_err->segment = ...;
>> p_err->bus = ...;
>
> Noted.
>
>> ...
>>
>> Although perhaps a helper function cxl_cper_valid_agent_type() that checks invalid agent type by checking the valid_bits, the agent_type boundary, and if agent_type != RCH_DP?
>
> Okay.
>
>>> +
>>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + dvsec_start = (u8 *)(prot_err + 1);
>>> + cap_start = dvsec_start + prot_err->dvsec_len;
>>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>>> +
>>> + /*
>>> + * Set device serial number unconditionally.
>>> + *
>>> + * Print a warning message if it is not valid. The device serial
>>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>>> + * Manager Managed LD.
>>> + */
>>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>>
>> prot_err->agent_type > FM_LD? Although maybe it would be a clearer read if a helper function is defined to identify the agent types such as cxl_cper_prot_err_serial_needed() or cxl_cper_prot_agent_type_device() and with it a switch statement to explicitly identify all the agent types that require serial number. If a future device is defined, the > 0x4 logic may break.
>
> Probably helper function is not required if boolean array is defined? What do you think?
That works for me. My main concern is to clarify the code and remove possibility of breakage from future changes.
>
> Thanks,
> Smita
>
> [snip]
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-22 15:08 ` [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors Smita Koralahalli
2024-05-22 17:59 ` Dave Jiang
@ 2024-05-23 0:03 ` Alison Schofield
2024-05-23 21:21 ` Smita Koralahalli
1 sibling, 1 reply; 28+ messages in thread
From: Alison Schofield @ 2024-05-23 0:03 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/acpi/apei/ghes.c | 10 +++++
> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> include/linux/cxl-event.h | 26 +++++++++++++
> 3 files changed, 102 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 623cc0cb4a65..1a58032770ee 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> schedule_work(cxl_cper_work);
> }
>
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> + struct cxl_cper_work_data wd;
> +
> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> + return;
> +}
> +
> int cxl_cper_register_work(struct work_struct *work)
> {
> if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>
> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> + cxl_cper_handle_prot_err(gdata);
> } else {
> void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..03b9839f3b73 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
> */
>
> #include <linux/cper.h>
> +#include <acpi/ghes.h>
> #include "cper_cxl.h"
>
> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
> USP, /* CXL Upstream Switch Port */
> };
>
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> + switch (cper_severity) {
> + case CPER_SEV_RECOVERABLE:
> + case CPER_SEV_FATAL:
> + return CXL_AER_UNCORRECTABLE;
> + default:
> + return CXL_AER_CORRECTABLE;
> + }
> +}
> +
> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> {
> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> sizeof(cxl_ras->header_log), 0);
> }
> }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> + struct cxl_cper_prot_err *p_err)
> +{
> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> + u8 *dvsec_start, *cap_start;
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> + pr_err(FW_WARN "No Device ID\n");
> + return -EINVAL;
> + }
> +
> + /*
> + * The device ID or agent address is required for CXL RCD, CXL
> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> + */
> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
For this check against agent_type, and the similar one below, would a boolean
array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
below. It seems one array would suffice for this case, but naming it isn't obvious
to me. Maybe it'll be to you.
Something similar to what is done for prot_err_agent_type_strs[]
static const bool agent_requires_id_address_serial[] = {
true, /* RDC */
false, /* RCH_DP */
.
.
.
};
> + p_err->segment = prot_err->agent_addr.segment;
> + p_err->bus = prot_err->agent_addr.bus;
> + p_err->device = prot_err->agent_addr.device;
> + p_err->function = prot_err->agent_addr.function;
> + } else {
> + pr_err(FW_WARN "Invalid agent type\n");
> + return -EINVAL;
> + }
> +
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> + return -EINVAL;
> + }
> +
> + dvsec_start = (u8 *)(prot_err + 1);
> + cap_start = dvsec_start + prot_err->dvsec_len;
> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> + /*
> + * Set device serial number unconditionally.
> + *
> + * Print a warning message if it is not valid. The device serial
> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> + * Manager Managed LD.
> + */
> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
then this also can be replaced with
agent_requires_id_address_serial[prot_err->agent_type]
-- Alison
> + pr_warn(FW_WARN "No Device Serial number\n");
> +
> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> + return 0;
> +}
snip
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-23 0:03 ` Alison Schofield
@ 2024-05-23 21:21 ` Smita Koralahalli
2024-06-07 15:26 ` Jonathan Cameron
0 siblings, 1 reply; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-23 21:21 UTC (permalink / raw)
To: Alison Schofield
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
Hi Alison,
On 5/22/2024 5:03 PM, Alison Schofield wrote:
> On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/acpi/apei/ghes.c | 10 +++++
>> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
>> include/linux/cxl-event.h | 26 +++++++++++++
>> 3 files changed, 102 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 623cc0cb4a65..1a58032770ee 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>> schedule_work(cxl_cper_work);
>> }
>>
>> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>> +{
>> + struct cxl_cper_work_data wd;
>> +
>> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> + return;
>> +}
>> +
>> int cxl_cper_register_work(struct work_struct *work)
>> {
>> if (cxl_cper_work)
>> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>>
>> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
>> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
>> + cxl_cper_handle_prot_err(gdata);
>> } else {
>> void *err = acpi_hest_get_payload(gdata);
>>
>> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
>> index 4fd8d783993e..03b9839f3b73 100644
>> --- a/drivers/firmware/efi/cper_cxl.c
>> +++ b/drivers/firmware/efi/cper_cxl.c
>> @@ -8,6 +8,7 @@
>> */
>>
>> #include <linux/cper.h>
>> +#include <acpi/ghes.h>
>> #include "cper_cxl.h"
>>
>> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
>> @@ -44,6 +45,17 @@ enum {
>> USP, /* CXL Upstream Switch Port */
>> };
>>
>> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
>> +{
>> + switch (cper_severity) {
>> + case CPER_SEV_RECOVERABLE:
>> + case CPER_SEV_FATAL:
>> + return CXL_AER_UNCORRECTABLE;
>> + default:
>> + return CXL_AER_CORRECTABLE;
>> + }
>> +}
>> +
>> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>> {
>> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
>> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>> sizeof(cxl_ras->header_log), 0);
>> }
>> }
>> +
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
>> + u8 *dvsec_start, *cap_start;
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
>> + pr_err(FW_WARN "No Device ID\n");
>> + return -EINVAL;
>> + }
>> +
>> + /*
>> + * The device ID or agent address is required for CXL RCD, CXL
>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>> + */
>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>
> For this check against agent_type, and the similar one below, would a boolean
> array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
> below. It seems one array would suffice for this case, but naming it isn't obvious
> to me. Maybe it'll be to you.
>
> Something similar to what is done for prot_err_agent_type_strs[]
>
> static const bool agent_requires_id_address_serial[] = {
> true, /* RDC */
> false, /* RCH_DP */
> .
> .
> .
> };
>
>
Noted. Will implement it this way!
Thanks
Smita
>> + p_err->segment = prot_err->agent_addr.segment;
>> + p_err->bus = prot_err->agent_addr.bus;
>> + p_err->device = prot_err->agent_addr.device;
>> + p_err->function = prot_err->agent_addr.function;
>> + } else {
>> + pr_err(FW_WARN "Invalid agent type\n");
>> + return -EINVAL;
>> + }
>> +
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> + pr_err(FW_WARN "Invalid Protocol Error log\n");
>> + return -EINVAL;
>> + }
>> +
>> + dvsec_start = (u8 *)(prot_err + 1);
>> + cap_start = dvsec_start + prot_err->dvsec_len;
>> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
>> +
>> + /*
>> + * Set device serial number unconditionally.
>> + *
>> + * Print a warning message if it is not valid. The device serial
>> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>> + * Manager Managed LD.
>> + */
>> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
>
> then this also can be replaced with
> agent_requires_id_address_serial[prot_err->agent_type]
>
>
> -- Alison
>
>
>> + pr_warn(FW_WARN "No Device Serial number\n");
>> +
>> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
>> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
>> +
>> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
>> +
>> + return 0;
>> +}
>
> snip
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-05-23 21:21 ` Smita Koralahalli
@ 2024-06-07 15:26 ` Jonathan Cameron
2024-06-10 19:07 ` Smita Koralahalli
0 siblings, 1 reply; 28+ messages in thread
From: Jonathan Cameron @ 2024-06-07 15:26 UTC (permalink / raw)
To: Smita Koralahalli
Cc: Alison Schofield, linux-efi, linux-kernel, linux-cxl,
Ard Biesheuvel, Vishal Verma, Ira Weiny, Dan Williams,
Yazen Ghannam, Bowman Terry
On Thu, 23 May 2024 14:21:40 -0700
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> Hi Alison,
>
> On 5/22/2024 5:03 PM, Alison Schofield wrote:
> > On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
> >> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> >>
> >> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> >> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> >> struct cxl_cper_prot_err. Include this struct as a member of struct
> >> cxl_cper_work_data.
> >>
> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> >> ---
> >> drivers/acpi/apei/ghes.c | 10 +++++
> >> drivers/firmware/efi/cper_cxl.c | 66 +++++++++++++++++++++++++++++++++
> >> include/linux/cxl-event.h | 26 +++++++++++++
> >> 3 files changed, 102 insertions(+)
> >>
> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> >> index 623cc0cb4a65..1a58032770ee 100644
> >> --- a/drivers/acpi/apei/ghes.c
> >> +++ b/drivers/acpi/apei/ghes.c
> >> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
> >> schedule_work(cxl_cper_work);
> >> }
> >>
> >> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> >> +{
> >> + struct cxl_cper_work_data wd;
> >> +
> >> + if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> >> + return;
> >> +}
> >> +
> >> int cxl_cper_register_work(struct work_struct *work)
> >> {
> >> if (cxl_cper_work)
> >> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
> >> struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
> >>
> >> cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> >> + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> >> + cxl_cper_handle_prot_err(gdata);
> >> } else {
> >> void *err = acpi_hest_get_payload(gdata);
> >>
> >> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> >> index 4fd8d783993e..03b9839f3b73 100644
> >> --- a/drivers/firmware/efi/cper_cxl.c
> >> +++ b/drivers/firmware/efi/cper_cxl.c
> >> @@ -8,6 +8,7 @@
> >> */
> >>
> >> #include <linux/cper.h>
> >> +#include <acpi/ghes.h>
> >> #include "cper_cxl.h"
> >>
> >> #define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
> >> @@ -44,6 +45,17 @@ enum {
> >> USP, /* CXL Upstream Switch Port */
> >> };
> >>
> >> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> >> +{
> >> + switch (cper_severity) {
> >> + case CPER_SEV_RECOVERABLE:
> >> + case CPER_SEV_FATAL:
> >> + return CXL_AER_UNCORRECTABLE;
> >> + default:
> >> + return CXL_AER_CORRECTABLE;
> >> + }
> >> +}
> >> +
> >> void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
> >> {
> >> if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> >> @@ -176,3 +188,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
> >> sizeof(cxl_ras->header_log), 0);
> >> }
> >> }
> >> +
> >> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> >> + struct cxl_cper_prot_err *p_err)
> >> +{
> >> + struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> >> + u8 *dvsec_start, *cap_start;
> >> +
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> >> + pr_err(FW_WARN "No Device ID\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + /*
> >> + * The device ID or agent address is required for CXL RCD, CXL
> >> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> >> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
> >> + */
> >> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
> >
> > For this check against agent_type, and the similar one below, would a boolean
> > array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
> > below. It seems one array would suffice for this case, but naming it isn't obvious
> > to me. Maybe it'll be to you.
> >
> > Something similar to what is done for prot_err_agent_type_strs[]
> >
> > static const bool agent_requires_id_address_serial[] = {
> > true, /* RDC */
> > false, /* RCH_DP */
[RCD] = false,
etc rather than comments would be neater.
Given two similar things already. Maybe time for a little structure.
//with better name than this
struct agent_reqs {
bool sn;
bool sbdf;
};
static const agent_reqs agent_reqs[] = {
[RCD] = { .sn = false, .sbdf = true, },
};
etc.
Maybe just bring the the string in as well
struct agent_info {
const char *string;
bool req_sn;
bool req_sbdf;
};
static const agent_info agent_info[] = {
[RD] = {
.string = "Restricted CXL Device",
.req_sn = false,
.req_sbdf = true,
},
};
Values made up, but hopefully conveys that moving to having
all the data in one place makes it harder to forget stuff
for new entries etc.
> > .
> > .
> > .
> > };
> >
> >
>
> Noted. Will implement it this way!
>
> Thanks
> Smita
>
> >> + p_err->segment = prot_err->agent_addr.segment;
> >> + p_err->bus = prot_err->agent_addr.bus;
> >> + p_err->device = prot_err->agent_addr.device;
> >> + p_err->function = prot_err->agent_addr.function;
> >> + } else {
> >> + pr_err(FW_WARN "Invalid agent type\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> >> + pr_err(FW_WARN "Invalid Protocol Error log\n");
> >> + return -EINVAL;
> >> + }
> >> +
> >> + dvsec_start = (u8 *)(prot_err + 1);
> >> + cap_start = dvsec_start + prot_err->dvsec_len;
> >> + p_err->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> >> +
> >> + /*
> >> + * Set device serial number unconditionally.
> >> + *
> >> + * Print a warning message if it is not valid. The device serial
> >> + * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> >> + * Manager Managed LD.
> >> + */
> >> + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> >> + prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
> >
> > then this also can be replaced with
> > agent_requires_id_address_serial[prot_err->agent_type]
> >
> >
> > -- Alison
> >
> >
> >> + pr_warn(FW_WARN "No Device Serial number\n");
> >> +
> >> + p_err->lower_dw = prot_err->dev_serial_num.lower_dw;
> >> + p_err->upper_dw = prot_err->dev_serial_num.upper_dw;
> >> +
> >> + p_err->severity = cper_severity_cxl_aer(gdata->error_severity);
> >> +
> >> + return 0;
> >> +}
> >
> > snip
> >
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.
2024-06-07 15:26 ` Jonathan Cameron
@ 2024-06-10 19:07 ` Smita Koralahalli
0 siblings, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-06-10 19:07 UTC (permalink / raw)
To: Jonathan Cameron
Cc: Alison Schofield, linux-efi, linux-kernel, linux-cxl,
Ard Biesheuvel, Vishal Verma, Ira Weiny, Dan Williams,
Yazen Ghannam, Bowman Terry
On 6/7/2024 8:26 AM, Jonathan Cameron wrote:
> On Thu, 23 May 2024 14:21:40 -0700
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>
>> Hi Alison,
>>
>> On 5/22/2024 5:03 PM, Alison Schofield wrote:
>>> On Wed, May 22, 2024 at 03:08:37PM +0000, Smita Koralahalli wrote:
>>>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>>>
>>>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>>>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>>>> struct cxl_cper_prot_err. Include this struct as a member of struct
>>>> cxl_cper_work_data.
>>>>
>>>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
[snip]
>>>> + * The device ID or agent address is required for CXL RCD, CXL
>>>> + * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>>>> + * CXL Downstream Switch Port and CXL Upstream Switch Port.
>>>> + */
>>>> + if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
>>>
>>> For this check against agent_type, and the similar one below, would a boolean
>>> array indexed by the agent type work? That would avoid the <= 0x7 and > 0x4
>>> below. It seems one array would suffice for this case, but naming it isn't obvious
>>> to me. Maybe it'll be to you.
>>>
>>> Something similar to what is done for prot_err_agent_type_strs[]
>>>
>>> static const bool agent_requires_id_address_serial[] = {
>>> true, /* RDC */
>>> false, /* RCH_DP */
> [RCD] = false,
>
> etc rather than comments would be neater.
> Given two similar things already. Maybe time for a little structure.
>
> //with better name than this
> struct agent_reqs {
> bool sn;
> bool sbdf;
> };
>
> static const agent_reqs agent_reqs[] = {
> [RCD] = { .sn = false, .sbdf = true, },
> };
>
> etc.
>
> Maybe just bring the the string in as well
>
> struct agent_info {
> const char *string;
> bool req_sn;
> bool req_sbdf;
> };
>
> static const agent_info agent_info[] = {
> [RD] = {
> .string = "Restricted CXL Device",
> .req_sn = false,
> .req_sbdf = true,
> },
> };
>
> Values made up, but hopefully conveys that moving to having
> all the data in one place makes it harder to forget stuff
> for new entries etc.
>
Okay. I was considering 2D array without naming before. Something like:
static const bool agent_requires_id_address[][] = {
/* Device_ID, Serial_num */
{true, true}, /* RCD */
{false, false}, /* RCH_DP */
Let me change to array of structures if naming helps.
Thanks,
Smita
[snip]
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 15:08 [PATCH 0/4] acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors Smita Koralahalli
2024-05-22 15:08 ` [PATCH 1/4] efi/cper, cxl: Make definitions and structures global Smita Koralahalli
2024-05-22 15:08 ` [PATCH 2/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors Smita Koralahalli
@ 2024-05-22 15:08 ` Smita Koralahalli
2024-05-22 18:05 ` Dave Jiang
` (2 more replies)
2024-05-22 15:08 ` [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev() Smita Koralahalli
3 siblings, 3 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-22 15:08 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records.
Reuse the existing work queue cxl_cper_work registered with GHES to notify
the CXL subsystem on a Protocol error.
The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
them to trace FW-First Protocol Errors.
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/acpi/apei/ghes.c | 14 ++++++++++++++
drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
drivers/cxl/cxlpci.h | 3 +++
drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
include/linux/cxl-event.h | 1 +
5 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 1a58032770ee..a31bd91e9475 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
return;
+
+ guard(spinlock_irqsave)(&cxl_cper_work_lock);
+
+ if (!cxl_cper_work)
+ return;
+
+ wd.event_type = CXL_CPER_EVENT_PROT_ERR;
+
+ if (!kfifo_put(&cxl_cper_fifo, wd)) {
+ pr_err_ratelimited("CXL CPER kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_cper_work);
}
int cxl_cper_register_work(struct work_struct *work)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 0df09bd79408..ef9438cb1dd6 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+ struct cxl_cper_prot_err *p_err)
+{
+ u32 status, fe;
+
+ if (p_err->severity == CXL_AER_CORRECTABLE) {
+ status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ } else {
+ status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ p_err->cxl_ras.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ p_err->cxl_ras.header_log);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
+
static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
void __iomem *ras_base)
{
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 93992a1c8eec..0ba3215786e1 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
pci_channel_state_t state);
+struct cxl_cper_prot_err;
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+ struct cxl_cper_prot_err *p_err);
#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 74876c9835e8..3e3c36983686 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
&uuid_null, &rec->event);
}
+static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
+{
+ struct pci_dev *pdev __free(pci_dev_put) = NULL;
+ struct cxl_dev_state *cxlds;
+ unsigned int devfn;
+
+ devfn = PCI_DEVFN(p_err->device, p_err->function);
+ pdev = pci_get_domain_bus_and_slot(p_err->segment,
+ p_err->bus, devfn);
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+ if (pdev->driver != &cxl_pci_driver)
+ return;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
+ pr_warn("CPER-reported device serial number does not match expected value\n");
+
+ cxl_trace_prot_err(cxlds, p_err);
+}
+
static void cxl_cper_work_fn(struct work_struct *work)
{
struct cxl_cper_work_data wd;
- while (cxl_cper_kfifo_get(&wd))
- cxl_handle_cper_event(wd.event_type, &wd.rec);
+ while (cxl_cper_kfifo_get(&wd)) {
+ if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
+ cxl_handle_prot_err(&wd.p_err);
+ else
+ cxl_handle_cper_event(wd.event_type, &wd.rec);
+ }
}
static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 9c7b69e076a0..5562844df850 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -122,6 +122,7 @@ struct cxl_event_record_raw {
} __packed;
enum cxl_event_type {
+ CXL_CPER_EVENT_PROT_ERR,
CXL_CPER_EVENT_GENERIC,
CXL_CPER_EVENT_GEN_MEDIA,
CXL_CPER_EVENT_DRAM,
--
2.17.1
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 15:08 ` [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First " Smita Koralahalli
@ 2024-05-22 18:05 ` Dave Jiang
2024-05-23 4:38 ` Alison Schofield
2024-05-23 21:23 ` Smita Koralahalli
2024-05-23 0:22 ` Alison Schofield
2024-06-12 0:07 ` Dan Williams
2 siblings, 2 replies; 28+ messages in thread
From: Dave Jiang @ 2024-05-22 18:05 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 1a58032770ee..a31bd91e9475 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>
> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> return;
> +
> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> +
> + if (!cxl_cper_work)
> + return;
> +
> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> +
> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> + return;
> + }
> +
> + schedule_work(cxl_cper_work);
> }
>
> int cxl_cper_register_work(struct work_struct *work)
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 0df09bd79408..ef9438cb1dd6 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> }
> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err)
> +{
> + u32 status, fe;
> +
> + if (p_err->severity == CXL_AER_CORRECTABLE) {
> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> +
> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> + } else {
> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> +
> + if (hweight32(status) > 1)
> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> + p_err->cxl_ras.cap_control));
> + else
> + fe = status;
> +
> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> + p_err->cxl_ras.header_log);
> + }
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> +
> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> void __iomem *ras_base)
> {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 93992a1c8eec..0ba3215786e1 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> void cxl_cor_error_detected(struct pci_dev *pdev);
> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> pci_channel_state_t state);
> +struct cxl_cper_prot_err;
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err);
> #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 74876c9835e8..3e3c36983686 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> &uuid_null, &rec->event);
> }
>
> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(p_err->device, p_err->function);
> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> + p_err->bus, devfn);
> + if (!pdev)
> + return;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return;
> +
> + cxlds = pci_get_drvdata(pdev);
> + if (!cxlds)
> + return;
> +
> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> + pr_warn("CPER-reported device serial number does not match expected value\n");
Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
DJ
> +
> + cxl_trace_prot_err(cxlds, p_err);
> +}
> +
> static void cxl_cper_work_fn(struct work_struct *work)
> {
> struct cxl_cper_work_data wd;
>
> - while (cxl_cper_kfifo_get(&wd))
> - cxl_handle_cper_event(wd.event_type, &wd.rec);
> + while (cxl_cper_kfifo_get(&wd)) {
> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> + cxl_handle_prot_err(&wd.p_err);
> + else
> + cxl_handle_cper_event(wd.event_type, &wd.rec);
> + }
> }
> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 9c7b69e076a0..5562844df850 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> } __packed;
>
> enum cxl_event_type {
> + CXL_CPER_EVENT_PROT_ERR,
> CXL_CPER_EVENT_GENERIC,
> CXL_CPER_EVENT_GEN_MEDIA,
> CXL_CPER_EVENT_DRAM,
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 18:05 ` Dave Jiang
@ 2024-05-23 4:38 ` Alison Schofield
2024-05-23 21:23 ` Smita Koralahalli
1 sibling, 0 replies; 28+ messages in thread
From: Alison Schofield @ 2024-05-23 4:38 UTC (permalink / raw)
To: Dave Jiang
Cc: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl,
Ard Biesheuvel, Vishal Verma, Ira Weiny, Dan Williams,
Jonathan Cameron, Yazen Ghannam, Bowman Terry
On Wed, May 22, 2024 at 11:05:49AM -0700, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> > When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> > CPER records.
> >
> > Reuse the existing work queue cxl_cper_work registered with GHES to notify
> > the CXL subsystem on a Protocol error.
> >
> > The defined trace events cxl_aer_uncorrectable_error and
> > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> > them to trace FW-First Protocol Errors.
> >
> > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> > ---
> > drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> > drivers/cxl/cxlpci.h | 3 +++
> > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> > include/linux/cxl-event.h | 1 +
> > 5 files changed, 74 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > index 1a58032770ee..a31bd91e9475 100644
> > --- a/drivers/acpi/apei/ghes.c
> > +++ b/drivers/acpi/apei/ghes.c
> > @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> >
> > if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> > return;
> > +
> > + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> > +
> > + if (!cxl_cper_work)
> > + return;
> > +
> > + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> > +
> > + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> > + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> > + return;
> > + }
> > +
> > + schedule_work(cxl_cper_work);
> > }
> >
> > int cxl_cper_register_work(struct work_struct *work)
> > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> > index 0df09bd79408..ef9438cb1dd6 100644
> > --- a/drivers/cxl/core/pci.c
> > +++ b/drivers/cxl/core/pci.c
> > @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> > }
> > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
> >
> > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> > + struct cxl_cper_prot_err *p_err)
> > +{
> > + u32 status, fe;
> > +
> > + if (p_err->severity == CXL_AER_CORRECTABLE) {
> > + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> > +
> > + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> > + } else {
> > + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> > +
> > + if (hweight32(status) > 1)
> > + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> > + p_err->cxl_ras.cap_control));
> > + else
> > + fe = status;
> > +
> > + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> > + p_err->cxl_ras.header_log);
> > + }
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> > +
> > static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> > void __iomem *ras_base)
> > {
> > diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> > index 93992a1c8eec..0ba3215786e1 100644
> > --- a/drivers/cxl/cxlpci.h
> > +++ b/drivers/cxl/cxlpci.h
> > @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> > void cxl_cor_error_detected(struct pci_dev *pdev);
> > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> > pci_channel_state_t state);
> > +struct cxl_cper_prot_err;
> > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> > + struct cxl_cper_prot_err *p_err);
> > #endif /* __CXL_PCI_H__ */
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 74876c9835e8..3e3c36983686 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> > &uuid_null, &rec->event);
> > }
> >
> > +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> > +{
> > + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> > + struct cxl_dev_state *cxlds;
> > + unsigned int devfn;
> > +
> > + devfn = PCI_DEVFN(p_err->device, p_err->function);
> > + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> > + p_err->bus, devfn);
> > + if (!pdev)
> > + return;
> > +
> > + guard(device)(&pdev->dev);
> > + if (pdev->driver != &cxl_pci_driver)
> > + return;
> > +
> > + cxlds = pci_get_drvdata(pdev);
> > + if (!cxlds)
> > + return;
> > +
> > + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> > + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
>
> DJ
Good point. Providing the dev lets the user look up the serial number,
meaning this message doesn't need to include an 'expected' but not found
value.
-- Alison
> > +
> > + cxl_trace_prot_err(cxlds, p_err);
> > +}
> > +
> > static void cxl_cper_work_fn(struct work_struct *work)
> > {
> > struct cxl_cper_work_data wd;
> >
> > - while (cxl_cper_kfifo_get(&wd))
> > - cxl_handle_cper_event(wd.event_type, &wd.rec);
> > + while (cxl_cper_kfifo_get(&wd)) {
> > + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> > + cxl_handle_prot_err(&wd.p_err);
> > + else
> > + cxl_handle_cper_event(wd.event_type, &wd.rec);
> > + }
> > }
> > static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
> >
> > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> > index 9c7b69e076a0..5562844df850 100644
> > --- a/include/linux/cxl-event.h
> > +++ b/include/linux/cxl-event.h
> > @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> > } __packed;
> >
> > enum cxl_event_type {
> > + CXL_CPER_EVENT_PROT_ERR,
> > CXL_CPER_EVENT_GENERIC,
> > CXL_CPER_EVENT_GEN_MEDIA,
> > CXL_CPER_EVENT_DRAM,
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 18:05 ` Dave Jiang
2024-05-23 4:38 ` Alison Schofield
@ 2024-05-23 21:23 ` Smita Koralahalli
1 sibling, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-23 21:23 UTC (permalink / raw)
To: Dave Jiang, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/2024 11:05 AM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 1a58032770ee..a31bd91e9475 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>>
>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> return;
>> +
>> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
>> +
>> + if (!cxl_cper_work)
>> + return;
>> +
>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
>> +
>> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
>> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
>> + return;
>> + }
>> +
>> + schedule_work(cxl_cper_work);
>> }
>>
>> int cxl_cper_register_work(struct work_struct *work)
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 0df09bd79408..ef9438cb1dd6 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
>> }
>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + u32 status, fe;
>> +
>> + if (p_err->severity == CXL_AER_CORRECTABLE) {
>> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
>> +
>> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>> + } else {
>> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
>> +
>> + if (hweight32(status) > 1)
>> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>> + p_err->cxl_ras.cap_control));
>> + else
>> + fe = status;
>> +
>> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
>> + p_err->cxl_ras.header_log);
>> + }
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
>> +
>> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
>> void __iomem *ras_base)
>> {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 93992a1c8eec..0ba3215786e1 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
>> void cxl_cor_error_detected(struct pci_dev *pdev);
>> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> pci_channel_state_t state);
>> +struct cxl_cper_prot_err;
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err);
>> #endif /* __CXL_PCI_H__ */
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 74876c9835e8..3e3c36983686 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> &uuid_null, &rec->event);
>> }
>>
>> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(p_err->device, p_err->function);
>> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> + p_err->bus, devfn);
>> + if (!pdev)
>> + return;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> + if (!cxlds)
>> + return;
>> +
>> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
>> + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served.
Will fix.
Thanks,
Smita
[snip]
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 15:08 ` [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First " Smita Koralahalli
2024-05-22 18:05 ` Dave Jiang
@ 2024-05-23 0:22 ` Alison Schofield
2024-05-23 21:35 ` Smita Koralahalli
2024-06-12 0:07 ` Dan Williams
2 siblings, 1 reply; 28+ messages in thread
From: Alison Schofield @ 2024-05-23 0:22 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
Will the trace log differentiate between errors reported in FW-First
versus Native mode? Wondering if that bit of info needs to be logged
or is discoverable elsewhere.
Otherwise, LGTM,
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
snip
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-23 0:22 ` Alison Schofield
@ 2024-05-23 21:35 ` Smita Koralahalli
0 siblings, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-23 21:35 UTC (permalink / raw)
To: Alison Schofield
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
On 5/22/2024 5:22 PM, Alison Schofield wrote:
> On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>
> Will the trace log differentiate between errors reported in FW-First
> versus Native mode? Wondering if that bit of info needs to be logged
> or is discoverable elsewhere.
No, the trace log won't differentiate currently.
But just a side note, FW-First also logs errors in dmesg. I'm not sure
if going forward, we would still continue to log errors in dmesg. But I
feel it might be needed so that we don't miss errors from RCH Downstream
Port or hexdump of unrecognized agent types.
Thanks
Smita
>
> Otherwise, LGTM,
> Reviewed-by: Alison Schofield <alison.schofield@intel.com>
>
>
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> snip
>
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-05-22 15:08 ` [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First " Smita Koralahalli
2024-05-22 18:05 ` Dave Jiang
2024-05-23 0:22 ` Alison Schofield
@ 2024-06-12 0:07 ` Dan Williams
2024-06-13 17:47 ` Smita Koralahalli
2 siblings, 1 reply; 28+ messages in thread
From: Dan Williams @ 2024-06-12 0:07 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records.
>
> Reuse the existing work queue cxl_cper_work registered with GHES to notify
> the CXL subsystem on a Protocol error.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
> them to trace FW-First Protocol Errors.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
> drivers/cxl/cxlpci.h | 3 +++
> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
> include/linux/cxl-event.h | 1 +
> 5 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 1a58032770ee..a31bd91e9475 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>
> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
> return;
> +
> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
> +
> + if (!cxl_cper_work)
> + return;
> +
> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
> +
> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
> + return;
> + }
> +
> + schedule_work(cxl_cper_work);
This seems wrong to unconditionally schedule the cxl_pci driver to look
at potentially "non-device" errors. With Terry's upcoming CXL switch
port error handling there will be a native path for those errors, but
until that arrives, I see no point in this code trying to convey
root/switch port errors to the endpoint driver.
> }
>
> int cxl_cper_register_work(struct work_struct *work)
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 0df09bd79408..ef9438cb1dd6 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
> }
> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err)
> +{
> + u32 status, fe;
> +
> + if (p_err->severity == CXL_AER_CORRECTABLE) {
> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
> +
> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> + } else {
> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
> +
> + if (hweight32(status) > 1)
> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> + p_err->cxl_ras.cap_control));
> + else
> + fe = status;
> +
> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> + p_err->cxl_ras.header_log);
> + }
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
> +
> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
> void __iomem *ras_base)
> {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 93992a1c8eec..0ba3215786e1 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
> void cxl_cor_error_detected(struct pci_dev *pdev);
> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> pci_channel_state_t state);
> +struct cxl_cper_prot_err;
> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
> + struct cxl_cper_prot_err *p_err);
> #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 74876c9835e8..3e3c36983686 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> &uuid_null, &rec->event);
> }
>
> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
Can we call this variable @rec instead of @p_err? The data being passed
is CPER data which is a "record" structure.
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(p_err->device, p_err->function);
> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
> + p_err->bus, devfn);
> + if (!pdev)
> + return;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return;
> +
> + cxlds = pci_get_drvdata(pdev);
> + if (!cxlds)
> + return;
> +
> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
> + pr_warn("CPER-reported device serial number does not match expected value\n");
Not much the end user can do about this warning, I would say skip this
message, or make it a pci_dbg() because matching by BDF should be
sufficient.
> +
> + cxl_trace_prot_err(cxlds, p_err);
> +}
> +
> static void cxl_cper_work_fn(struct work_struct *work)
> {
> struct cxl_cper_work_data wd;
>
> - while (cxl_cper_kfifo_get(&wd))
> - cxl_handle_cper_event(wd.event_type, &wd.rec);
> + while (cxl_cper_kfifo_get(&wd)) {
> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
> + cxl_handle_prot_err(&wd.p_err);
> + else
> + cxl_handle_cper_event(wd.event_type, &wd.rec);
> + }
> }
> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 9c7b69e076a0..5562844df850 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
> } __packed;
>
> enum cxl_event_type {
> + CXL_CPER_EVENT_PROT_ERR,
> CXL_CPER_EVENT_GENERIC,
> CXL_CPER_EVENT_GEN_MEDIA,
> CXL_CPER_EVENT_DRAM,
> --
> 2.17.1
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-06-12 0:07 ` Dan Williams
@ 2024-06-13 17:47 ` Smita Koralahalli
2024-06-24 22:04 ` Smita Koralahalli
0 siblings, 1 reply; 28+ messages in thread
From: Smita Koralahalli @ 2024-06-13 17:47 UTC (permalink / raw)
To: Dan Williams, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Jonathan Cameron, Yazen Ghannam, Bowman Terry
Hi Dan,
On 6/11/2024 5:07 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records.
>>
>> Reuse the existing work queue cxl_cper_work registered with GHES to notify
>> the CXL subsystem on a Protocol error.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>> them to trace FW-First Protocol Errors.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>> drivers/cxl/cxlpci.h | 3 +++
>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>> include/linux/cxl-event.h | 1 +
>> 5 files changed, 74 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 1a58032770ee..a31bd91e9475 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>>
>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>> return;
>> +
>> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
>> +
>> + if (!cxl_cper_work)
>> + return;
>> +
>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
>> +
>> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
>> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
>> + return;
>> + }
>> +
>> + schedule_work(cxl_cper_work);
>
> This seems wrong to unconditionally schedule the cxl_pci driver to look
> at potentially "non-device" errors. With Terry's upcoming CXL switch
> port error handling there will be a native path for those errors, but
> until that arrives, I see no point in this code trying to convey
> root/switch port errors to the endpoint driver.
I see okay. What are your recommendations on this? Just confine it to
CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry
sends patches?
Also, I'm not sure about FMLD. Should we just drop it as of now?
>
>> }
>>
>> int cxl_cper_register_work(struct work_struct *work)
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 0df09bd79408..ef9438cb1dd6 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
>> }
>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err)
>> +{
>> + u32 status, fe;
>> +
>> + if (p_err->severity == CXL_AER_CORRECTABLE) {
>> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
>> +
>> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>> + } else {
>> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
>> +
>> + if (hweight32(status) > 1)
>> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>> + p_err->cxl_ras.cap_control));
>> + else
>> + fe = status;
>> +
>> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
>> + p_err->cxl_ras.header_log);
>> + }
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
>> +
>> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
>> void __iomem *ras_base)
>> {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 93992a1c8eec..0ba3215786e1 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
>> void cxl_cor_error_detected(struct pci_dev *pdev);
>> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>> pci_channel_state_t state);
>> +struct cxl_cper_prot_err;
>> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
>> + struct cxl_cper_prot_err *p_err);
>> #endif /* __CXL_PCI_H__ */
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 74876c9835e8..3e3c36983686 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> &uuid_null, &rec->event);
>> }
>>
>> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>
> Can we call this variable @rec instead of @p_err? The data being passed
> is CPER data which is a "record" structure.
Will change.
>
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(p_err->device, p_err->function);
>> + pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> + p_err->bus, devfn);
>> + if (!pdev)
>> + return;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> + if (!cxlds)
>> + return;
>> +
>> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
>> + pr_warn("CPER-reported device serial number does not match expected value\n");
>
> Not much the end user can do about this warning, I would say skip this
> message, or make it a pci_dbg() because matching by BDF should be
> sufficient.
Will skip this message.
Thanks
Smita
>
>> +
>> + cxl_trace_prot_err(cxlds, p_err);
>> +}
>> +
>> static void cxl_cper_work_fn(struct work_struct *work)
>> {
>> struct cxl_cper_work_data wd;
>>
>> - while (cxl_cper_kfifo_get(&wd))
>> - cxl_handle_cper_event(wd.event_type, &wd.rec);
>> + while (cxl_cper_kfifo_get(&wd)) {
>> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
>> + cxl_handle_prot_err(&wd.p_err);
>> + else
>> + cxl_handle_cper_event(wd.event_type, &wd.rec);
>> + }
>> }
>> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>>
>> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
>> index 9c7b69e076a0..5562844df850 100644
>> --- a/include/linux/cxl-event.h
>> +++ b/include/linux/cxl-event.h
>> @@ -122,6 +122,7 @@ struct cxl_event_record_raw {
>> } __packed;
>>
>> enum cxl_event_type {
>> + CXL_CPER_EVENT_PROT_ERR,
>> CXL_CPER_EVENT_GENERIC,
>> CXL_CPER_EVENT_GEN_MEDIA,
>> CXL_CPER_EVENT_DRAM,
>> --
>> 2.17.1
>>
>
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors
2024-06-13 17:47 ` Smita Koralahalli
@ 2024-06-24 22:04 ` Smita Koralahalli
0 siblings, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-06-24 22:04 UTC (permalink / raw)
To: Dan Williams, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Jonathan Cameron, Yazen Ghannam, Bowman Terry
Hi Dan,
On 6/13/2024 10:47 AM, Smita Koralahalli wrote:
> Hi Dan,
>
> On 6/11/2024 5:07 PM, Dan Williams wrote:
>> Smita Koralahalli wrote:
>>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>>> CPER records.
>>>
>>> Reuse the existing work queue cxl_cper_work registered with GHES to
>>> notify
>>> the CXL subsystem on a Protocol error.
>>>
>>> The defined trace events cxl_aer_uncorrectable_error and
>>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
>>> them to trace FW-First Protocol Errors.
>>>
>>> Signed-off-by: Smita Koralahalli
>>> <Smita.KoralahalliChannabasappa@amd.com>
>>> ---
>>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++
>>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++
>>> drivers/cxl/cxlpci.h | 3 +++
>>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++--
>>> include/linux/cxl-event.h | 1 +
>>> 5 files changed, 74 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>>> index 1a58032770ee..a31bd91e9475 100644
>>> --- a/drivers/acpi/apei/ghes.c
>>> +++ b/drivers/acpi/apei/ghes.c
>>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct
>>> acpi_hest_generic_data *gdata)
>>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
>>> return;
>>> +
>>> + guard(spinlock_irqsave)(&cxl_cper_work_lock);
>>> +
>>> + if (!cxl_cper_work)
>>> + return;
>>> +
>>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR;
>>> +
>>> + if (!kfifo_put(&cxl_cper_fifo, wd)) {
>>> + pr_err_ratelimited("CXL CPER kfifo overflow\n");
>>> + return;
>>> + }
>>> +
>>> + schedule_work(cxl_cper_work);
>>
>> This seems wrong to unconditionally schedule the cxl_pci driver to look
>> at potentially "non-device" errors. With Terry's upcoming CXL switch
>> port error handling there will be a native path for those errors, but
>> until that arrives, I see no point in this code trying to convey
>> root/switch port errors to the endpoint driver.
>
> I see okay. What are your recommendations on this? Just confine it to
> CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry
> sends patches?
>
> Also, I'm not sure about FMLD. Should we just drop it as of now?
>
Since, Terry sent his port error handling patches, shall I keep the
above check as is? That is schedule cxl_pci driver on all device and
port errors with mention to be rebased on Terry's.
I'm slightly doubtful on FMLD though.
Thanks,
Smita
[snip]
^ permalink raw reply [flat|nested] 28+ messages in thread
* [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev()
2024-05-22 15:08 [PATCH 0/4] acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors Smita Koralahalli
` (2 preceding siblings ...)
2024-05-22 15:08 ` [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First " Smita Koralahalli
@ 2024-05-22 15:08 ` Smita Koralahalli
2024-05-22 19:42 ` Dave Jiang
` (2 more replies)
3 siblings, 3 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-22 15:08 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
Refactor computation of cxlds to a common function get_cxl_dev() and reuse
the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 3e3c36983686..26e65e5b68cb 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
},
};
+static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
+ u8 function)
+{
+ struct pci_dev *pdev __free(pci_dev_put) = NULL;
+ struct cxl_dev_state *cxlds;
+ unsigned int devfn;
+
+ devfn = PCI_DEVFN(device, function);
+ pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+
+ if (!pdev)
+ return NULL;
+
+ guard(device)(&pdev->dev);
+ if (pdev->driver != &cxl_pci_driver)
+ return NULL;
+
+ cxlds = pci_get_drvdata(pdev);
+
+ return cxlds;
+}
+
#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
struct cxl_cper_event_rec *rec)
{
struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
- struct pci_dev *pdev __free(pci_dev_put) = NULL;
enum cxl_event_log_type log_type;
struct cxl_dev_state *cxlds;
- unsigned int devfn;
u32 hdr_flags;
pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
device_id->segment_num, device_id->bus_num,
device_id->device_num, device_id->func_num);
- devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
- pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
- device_id->bus_num, devfn);
- if (!pdev)
- return;
-
- guard(device)(&pdev->dev);
- if (pdev->driver != &cxl_pci_driver)
- return;
-
- cxlds = pci_get_drvdata(pdev);
+ cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
+ device_id->device_num, device_id->func_num);
if (!cxlds)
return;
@@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
{
- struct pci_dev *pdev __free(pci_dev_put) = NULL;
struct cxl_dev_state *cxlds;
- unsigned int devfn;
- devfn = PCI_DEVFN(p_err->device, p_err->function);
- pdev = pci_get_domain_bus_and_slot(p_err->segment,
- p_err->bus, devfn);
- if (!pdev)
- return;
-
- guard(device)(&pdev->dev);
- if (pdev->driver != &cxl_pci_driver)
- return;
-
- cxlds = pci_get_drvdata(pdev);
+ cxlds = get_cxl_dev(p_err->segment, p_err->bus,
+ p_err->device, p_err->function);
if (!cxlds)
return;
--
2.17.1
^ permalink raw reply related [flat|nested] 28+ messages in thread* Re: [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev()
2024-05-22 15:08 ` [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev() Smita Koralahalli
@ 2024-05-22 19:42 ` Dave Jiang
2024-05-23 21:37 ` Smita Koralahalli
2024-05-23 0:45 ` Alison Schofield
2024-06-07 15:40 ` Jonathan Cameron
2 siblings, 1 reply; 28+ messages in thread
From: Dave Jiang @ 2024-05-22 19:42 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/24 8:08 AM, Smita Koralahalli wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
I think just introduce the function where you open coded it instead of adding code and then deleting them in the same series.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
get_cxlds() or get_cxl_device_state() would be better.
DJ
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
> +}
> +
> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> struct cxl_cper_event_rec *rec)
> {
> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> enum cxl_event_log_type log_type;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
> u32 hdr_flags;
>
> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
> device_id->segment_num, device_id->bus_num,
> device_id->device_num, device_id->func_num);
>
> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> - device_id->bus_num, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
> + device_id->device_num, device_id->func_num);
> if (!cxlds)
> return;
>
> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>
> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> {
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
>
> - devfn = PCI_DEVFN(p_err->device, p_err->function);
> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
> - p_err->bus, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
> + p_err->device, p_err->function);
> if (!cxlds)
> return;
>
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev()
2024-05-22 19:42 ` Dave Jiang
@ 2024-05-23 21:37 ` Smita Koralahalli
0 siblings, 0 replies; 28+ messages in thread
From: Smita Koralahalli @ 2024-05-23 21:37 UTC (permalink / raw)
To: Dave Jiang, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Bowman Terry
On 5/22/2024 12:42 PM, Dave Jiang wrote:
>
>
> On 5/22/24 8:08 AM, Smita Koralahalli wrote:
>> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
>> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> I think just introduce the function where you open coded it instead of adding code and then deleting them in the same series.
Okay refactor first, then add trace support. Will change.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
>> 1 file changed, 26 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 3e3c36983686..26e65e5b68cb 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
>> },
>> };
>>
>> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
>> + u8 function)
>
> get_cxlds() or get_cxl_device_state() would be better.
Okay will change
>
> DJ
>
Thanks
Smita
>> +{
>> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> + struct cxl_dev_state *cxlds;
>> + unsigned int devfn;
>> +
>> + devfn = PCI_DEVFN(device, function);
>> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
>> +
>> + if (!pdev)
>> + return NULL;
>> +
>> + guard(device)(&pdev->dev);
>> + if (pdev->driver != &cxl_pci_driver)
>> + return NULL;
>> +
>> + cxlds = pci_get_drvdata(pdev);
>> +
>> + return cxlds;
>> +}
>> +
>> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
>> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>> struct cxl_cper_event_rec *rec)
>> {
>> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
>> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> enum cxl_event_log_type log_type;
>> struct cxl_dev_state *cxlds;
>> - unsigned int devfn;
>> u32 hdr_flags;
>>
>> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
>> device_id->segment_num, device_id->bus_num,
>> device_id->device_num, device_id->func_num);
>>
>> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
>> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
>> - device_id->bus_num, devfn);
>> - if (!pdev)
>> - return;
>> -
>> - guard(device)(&pdev->dev);
>> - if (pdev->driver != &cxl_pci_driver)
>> - return;
>> -
>> - cxlds = pci_get_drvdata(pdev);
>> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
>> + device_id->device_num, device_id->func_num);
>> if (!cxlds)
>> return;
>>
>> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>>
>> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
>> {
>> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
>> struct cxl_dev_state *cxlds;
>> - unsigned int devfn;
>>
>> - devfn = PCI_DEVFN(p_err->device, p_err->function);
>> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
>> - p_err->bus, devfn);
>> - if (!pdev)
>> - return;
>> -
>> - guard(device)(&pdev->dev);
>> - if (pdev->driver != &cxl_pci_driver)
>> - return;
>> -
>> - cxlds = pci_get_drvdata(pdev);
>> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
>> + p_err->device, p_err->function);
>> if (!cxlds)
>> return;
>>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev()
2024-05-22 15:08 ` [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev() Smita Koralahalli
2024-05-22 19:42 ` Dave Jiang
@ 2024-05-23 0:45 ` Alison Schofield
2024-06-07 15:40 ` Jonathan Cameron
2 siblings, 0 replies; 28+ messages in thread
From: Alison Schofield @ 2024-05-23 0:45 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Bowman Terry
On Wed, May 22, 2024 at 03:08:39PM +0000, Smita Koralahalli wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
This can be:
return pci_get_drvdata(pdev);
> +}
snip
^ permalink raw reply [flat|nested] 28+ messages in thread* Re: [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev()
2024-05-22 15:08 ` [PATCH 4/4] cxl/pci: Define a common function get_cxl_dev() Smita Koralahalli
2024-05-22 19:42 ` Dave Jiang
2024-05-23 0:45 ` Alison Schofield
@ 2024-06-07 15:40 ` Jonathan Cameron
2 siblings, 0 replies; 28+ messages in thread
From: Jonathan Cameron @ 2024-06-07 15:40 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel,
Alison Schofield, Vishal Verma, Ira Weiny, Dan Williams,
Yazen Ghannam, Bowman Terry
On Wed, 22 May 2024 15:08:39 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> Refactor computation of cxlds to a common function get_cxl_dev() and reuse
> the function in both cxl_handle_cper_event() and cxl_handle_prot_err().
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> drivers/cxl/pci.c | 52 +++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 3e3c36983686..26e65e5b68cb 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -974,32 +974,43 @@ static struct pci_driver cxl_pci_driver = {
> },
> };
>
> +static struct cxl_dev_state *get_cxl_dev(u16 segment, u8 bus, u8 device,
> + u8 function)
I'd not expect a function with this name to return anything other
than a struct device *
get_cxl_devstate() maybe?
> +{
> + struct pci_dev *pdev __free(pci_dev_put) = NULL;
> + struct cxl_dev_state *cxlds;
> + unsigned int devfn;
> +
> + devfn = PCI_DEVFN(device, function);
Might as well do
unsigned int devfn = PCI_DEVFN(device, function);
> + pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
struct pci_dev *pdev __free(pci_dev_put) =
pci_get_domain_bus_and_slot(segment, bus, devfn);
see fwctl thread for a reference to Linus expressing a preference for
keeping construct and destructor definitions together even when
that means relaxing c conventions that we are so used to!
Obviously this is copied from below, but might as well tidy it up
whilst here.
However, do the devfn change above and it is in the definitions block...
> +
> + if (!pdev)
> + return NULL;
> +
> + guard(device)(&pdev->dev);
> + if (pdev->driver != &cxl_pci_driver)
> + return NULL;
> +
> + cxlds = pci_get_drvdata(pdev);
> +
> + return cxlds;
return pci_get_drvdata(pdev);
I think the function is small enough having cxlds just so it's obvious
what is being returned is unnecessary.
> +}
> +
> #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> static void cxl_handle_cper_event(enum cxl_event_type ev_type,
> struct cxl_cper_event_rec *rec)
> {
> struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> enum cxl_event_log_type log_type;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
> u32 hdr_flags;
>
> pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
> device_id->segment_num, device_id->bus_num,
> device_id->device_num, device_id->func_num);
>
> - devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
> - pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
> - device_id->bus_num, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(device_id->segment_num, device_id->bus_num,
> + device_id->device_num, device_id->func_num);
> if (!cxlds)
> return;
>
> @@ -1013,21 +1024,10 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
>
> static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)
> {
> - struct pci_dev *pdev __free(pci_dev_put) = NULL;
> struct cxl_dev_state *cxlds;
> - unsigned int devfn;
>
> - devfn = PCI_DEVFN(p_err->device, p_err->function);
> - pdev = pci_get_domain_bus_and_slot(p_err->segment,
> - p_err->bus, devfn);
> - if (!pdev)
> - return;
> -
> - guard(device)(&pdev->dev);
> - if (pdev->driver != &cxl_pci_driver)
> - return;
> -
> - cxlds = pci_get_drvdata(pdev);
> + cxlds = get_cxl_dev(p_err->segment, p_err->bus,
> + p_err->device, p_err->function);
> if (!cxlds)
> return;
>
^ permalink raw reply [flat|nested] 28+ messages in thread