* [PATCH] acpi/ghes: Remove CXL CPER notifications
@ 2024-02-17 20:29 Dan Williams
2024-02-18 21:54 ` Ira Weiny
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Dan Williams @ 2024-02-17 20:29 UTC (permalink / raw)
To: linux-cxl
Cc: Ard Biesheuvel, Rafael J. Wysocki, Ira Weiny, Jonathan Cameron,
vishal.l.verma, alison.schofield, linux-acpi
Initial tests with the CXL CPER implementation identified that error
reports were being duplicated in the log and the trace event [1]. Then
it was discovered that the notification handler took sleeping locks
while the GHES event handling runs in spin_lock_irqsave() context [2].
Given multiple bugs to fix and how late it is in the development cycle,
remove the CXL hookup for now and try again during the next merge
window.
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1]
Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
drivers/acpi/apei/ghes.c | 89 ---------------------------------------------
drivers/cxl/pci.c | 57 +----------------------------
include/linux/cxl-event.h | 18 ---------
3 files changed, 1 insertion(+), 163 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 7b7c605166e0..ab2a82cb1b0b 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -26,7 +26,6 @@
#include <linux/interrupt.h>
#include <linux/timer.h>
#include <linux/cper.h>
-#include <linux/cxl-event.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
@@ -674,78 +673,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}
-/*
- * Only a single callback can be registered for CXL CPER events.
- */
-static DECLARE_RWSEM(cxl_cper_rw_sem);
-static cxl_cper_callback cper_callback;
-
-/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
-
-/*
- * General Media Event Record
- * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
- */
-#define CPER_SEC_CXL_GEN_MEDIA_GUID \
- GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \
- 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
-
-/*
- * DRAM Event Record
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
- */
-#define CPER_SEC_CXL_DRAM_GUID \
- GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \
- 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
-
-/*
- * Memory Module Event Record
- * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
- */
-#define CPER_SEC_CXL_MEM_MODULE_GUID \
- GUID_INIT(0xfe927475, 0xdd59, 0x4339, \
- 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
-
-static void cxl_cper_post_event(enum cxl_event_type event_type,
- struct cxl_cper_event_rec *rec)
-{
- if (rec->hdr.length <= sizeof(rec->hdr) ||
- rec->hdr.length > sizeof(*rec)) {
- pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n",
- rec->hdr.length);
- return;
- }
-
- if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) {
- pr_err(FW_WARN "CXL CPER invalid event\n");
- return;
- }
-
- guard(rwsem_read)(&cxl_cper_rw_sem);
- if (cper_callback)
- cper_callback(event_type, rec);
-}
-
-int cxl_cper_register_callback(cxl_cper_callback callback)
-{
- guard(rwsem_write)(&cxl_cper_rw_sem);
- if (cper_callback)
- return -EINVAL;
- cper_callback = callback;
- return 0;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL);
-
-int cxl_cper_unregister_callback(cxl_cper_callback callback)
-{
- guard(rwsem_write)(&cxl_cper_rw_sem);
- if (callback != cper_callback)
- return -EINVAL;
- cper_callback = NULL;
- return 0;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL);
-
static bool ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
@@ -780,22 +707,6 @@ static bool ghes_do_proc(struct ghes *ghes,
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
- } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
- struct cxl_cper_event_rec *rec =
- acpi_hest_get_payload(gdata);
-
- cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec);
- } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) {
- struct cxl_cper_event_rec *rec =
- acpi_hest_get_payload(gdata);
-
- cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec);
- } else if (guid_equal(sec_type,
- &CPER_SEC_CXL_MEM_MODULE_GUID)) {
- struct cxl_cper_event_rec *rec =
- acpi_hest_get_payload(gdata);
-
- cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
} else {
void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 233e7c42c161..2ff361e756d6 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -974,61 +974,6 @@ static struct pci_driver cxl_pci_driver = {
},
};
-#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
-static void cxl_cper_event_call(enum cxl_event_type ev_type,
- struct cxl_cper_event_rec *rec)
-{
- struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
- struct pci_dev *pdev __free(pci_dev_put) = NULL;
- enum cxl_event_log_type log_type;
- struct cxl_dev_state *cxlds;
- unsigned int devfn;
- u32 hdr_flags;
-
- devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
- pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
- device_id->bus_num, devfn);
- if (!pdev)
- return;
-
- guard(pci_dev)(pdev);
- if (pdev->driver != &cxl_pci_driver)
- return;
-
- cxlds = pci_get_drvdata(pdev);
- if (!cxlds)
- return;
-
- /* Fabricate a log type */
- hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
- log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
-
- cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
- &uuid_null, &rec->event);
-}
-
-static int __init cxl_pci_driver_init(void)
-{
- int rc;
-
- rc = cxl_cper_register_callback(cxl_cper_event_call);
- if (rc)
- return rc;
-
- rc = pci_register_driver(&cxl_pci_driver);
- if (rc)
- cxl_cper_unregister_callback(cxl_cper_event_call);
-
- return rc;
-}
-
-static void __exit cxl_pci_driver_exit(void)
-{
- pci_unregister_driver(&cxl_pci_driver);
- cxl_cper_unregister_callback(cxl_cper_event_call);
-}
-
-module_init(cxl_pci_driver_init);
-module_exit(cxl_pci_driver_exit);
+module_pci_driver(cxl_pci_driver);
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 91125eca4c8a..03fa6d50d46f 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -140,22 +140,4 @@ struct cxl_cper_event_rec {
union cxl_event event;
} __packed;
-typedef void (*cxl_cper_callback)(enum cxl_event_type type,
- struct cxl_cper_event_rec *rec);
-
-#ifdef CONFIG_ACPI_APEI_GHES
-int cxl_cper_register_callback(cxl_cper_callback callback);
-int cxl_cper_unregister_callback(cxl_cper_callback callback);
-#else
-static inline int cxl_cper_register_callback(cxl_cper_callback callback)
-{
- return 0;
-}
-
-static inline int cxl_cper_unregister_callback(cxl_cper_callback callback)
-{
- return 0;
-}
-#endif
-
#endif /* _LINUX_CXL_EVENT_H */
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] acpi/ghes: Remove CXL CPER notifications
2024-02-17 20:29 [PATCH] acpi/ghes: Remove CXL CPER notifications Dan Williams
@ 2024-02-18 21:54 ` Ira Weiny
2024-02-18 22:49 ` Dan Williams
2024-02-18 22:48 ` Dan Williams
2024-02-19 11:47 ` Jonathan Cameron
2 siblings, 1 reply; 5+ messages in thread
From: Ira Weiny @ 2024-02-18 21:54 UTC (permalink / raw)
To: Dan Williams, linux-cxl
Cc: Ard Biesheuvel, Rafael J. Wysocki, Ira Weiny, Jonathan Cameron,
vishal.l.verma, alison.schofield, linux-acpi
Dan Williams wrote:
> Initial tests with the CXL CPER implementation identified that error
> reports were being duplicated in the log and the trace event [1]. Then
> it was discovered that the notification handler took sleeping locks
> while the GHES event handling runs in spin_lock_irqsave() context [2]
>
> Given multiple bugs to fix and how late it is in the development cycle,
> remove the CXL hookup for now and try again during the next merge
> window.
>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1]
> Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2]
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Dan, should we add the following hunk to remove that dead code for now?
With or without this hunk.
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..f4934d0d1fb2 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -114,30 +114,4 @@ enum cxl_event_type {
CXL_CPER_EVENT_MEM_MODULE,
};
-#define CPER_CXL_DEVICE_ID_VALID BIT(0)
-#define CPER_CXL_DEVICE_SN_VALID BIT(1)
-#define CPER_CXL_COMP_EVENT_LOG_VALID BIT(2)
-struct cxl_cper_event_rec {
- struct {
- u32 length;
- u64 validation_bits;
- struct cper_cxl_event_devid {
- u16 vendor_id;
- u16 device_id;
- u8 func_num;
- u8 device_num;
- u8 bus_num;
- u16 segment_num;
- u16 slot_num; /* bits 2:0 reserved */
- u8 reserved;
- } __packed device_id;
- struct cper_cxl_event_sn {
- u32 lower_dw;
- u32 upper_dw;
- } __packed dev_serial_num;
- } __packed hdr;
-
- union cxl_event event;
-} __packed;
-
#endif /* _LINUX_CXL_EVENT_H */
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] acpi/ghes: Remove CXL CPER notifications
2024-02-18 21:54 ` Ira Weiny
@ 2024-02-18 22:49 ` Dan Williams
0 siblings, 0 replies; 5+ messages in thread
From: Dan Williams @ 2024-02-18 22:49 UTC (permalink / raw)
To: Ira Weiny, Dan Williams, linux-cxl
Cc: Ard Biesheuvel, Rafael J. Wysocki, Ira Weiny, Jonathan Cameron,
vishal.l.verma, alison.schofield, linux-acpi
Ira Weiny wrote:
> Dan Williams wrote:
> > Initial tests with the CXL CPER implementation identified that error
> > reports were being duplicated in the log and the trace event [1]. Then
> > it was discovered that the notification handler took sleeping locks
> > while the GHES event handling runs in spin_lock_irqsave() context [2]
> >
> > Given multiple bugs to fix and how late it is in the development cycle,
> > remove the CXL hookup for now and try again during the next merge
> > window.
> >
> > Cc: Ard Biesheuvel <ardb@kernel.org>
> > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > Cc: Ira Weiny <ira.weiny@intel.com>
> > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1]
> > Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2]
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Dan should we add the following hunk to remove that dead code for now?
A dead definition, not dead logic. I think it is ok to stick around.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] acpi/ghes: Remove CXL CPER notifications
2024-02-17 20:29 [PATCH] acpi/ghes: Remove CXL CPER notifications Dan Williams
2024-02-18 21:54 ` Ira Weiny
@ 2024-02-18 22:48 ` Dan Williams
2024-02-19 11:47 ` Jonathan Cameron
2 siblings, 0 replies; 5+ messages in thread
From: Dan Williams @ 2024-02-18 22:48 UTC (permalink / raw)
To: Dan Williams, linux-cxl
Cc: Ard Biesheuvel, Rafael J. Wysocki, Ira Weiny, Jonathan Cameron,
vishal.l.verma, alison.schofield, linux-acpi
Dan Williams wrote:
> Initial tests with the CXL CPER implementation identified that error
> reports were being duplicated in the log and the trace event [1]. Then
> it was discovered that the notification handler took sleeping locks
> while the GHES event handling runs in spin_lock_irqsave() context [2]
>
> Given multiple bugs to fix and how late it is in the development cycle,
> remove the CXL hookup for now and try again during the next merge
> window.
>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1]
Stephen noticed that the fix for this already went upstream as:
54ce1927eb78 ("cxl/cper: Fix errant CPER prints for CXL events")
...and it collides with this removal. I really do not want to have a
fire drill to fix locking this late in the cycle, so I still think
trying again for v6.9 is more comfortable. That also allows collecting
Smita's work as well.
I will fix up the changelog to:
---
Initial tests with the CXL CPER implementation identified that error
reports were being duplicated in the log and the trace event [1]. Then
it was discovered that the notification handler took sleeping locks
while the GHES event handling runs in spin_lock_irqsave() context [2].
While the duplicate reporting was fixed in v6.8-rc4, the fix for the
sleeping-lock-vs-atomic collision would enjoy more time to settle and
gain some test cycles. Given how late it is in the development cycle,
remove the CXL hookup for now and try again during the next merge
window.
Note that the end result is that v6.8 does not emit CXL CPER payloads to the
kernel log, but this is in line with the CXL trend to move error
reporting to trace events instead of the kernel log.
---
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] acpi/ghes: Remove CXL CPER notifications
2024-02-17 20:29 [PATCH] acpi/ghes: Remove CXL CPER notifications Dan Williams
2024-02-18 21:54 ` Ira Weiny
2024-02-18 22:48 ` Dan Williams
@ 2024-02-19 11:47 ` Jonathan Cameron
2 siblings, 0 replies; 5+ messages in thread
From: Jonathan Cameron @ 2024-02-19 11:47 UTC (permalink / raw)
To: Dan Williams
Cc: linux-cxl, Ard Biesheuvel, Rafael J. Wysocki, Ira Weiny,
vishal.l.verma, alison.schofield, linux-acpi
On Sat, 17 Feb 2024 12:29:38 -0800
Dan Williams <dan.j.williams@intel.com> wrote:
> Initial tests with the CXL CPER implementation identified that error
> reports were being duplicated in the log and the trace event [1]. Then
> it was discovered that the notification handler took sleeping locks
> while the GHES event handling runs in spin_lock_irqsave() context [2]
>
> Given multiple bugs to fix and how late it is in the development cycle,
> remove the CXL hookup for now and try again during the next merge
> window.
>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1]
> Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2]
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
I'm fine with this. Updated text in thread is fine as well.
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-02-19 11:47 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-17 20:29 [PATCH] acpi/ghes: Remove CXL CPER notifications Dan Williams
2024-02-18 21:54 ` Ira Weiny
2024-02-18 22:49 ` Dan Williams
2024-02-18 22:48 ` Dan Williams
2024-02-19 11:47 ` Jonathan Cameron
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox