linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
@ 2025-07-21 10:13 Breno Leitao
  2025-07-21 13:40 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Breno Leitao @ 2025-07-21 10:13 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: linux-acpi, linux-kernel, acpica-devel, osandov, xueshuai,
	konrad.wilk, linux-edac, linuxppc-dev, linux-pci, kernel-team,
	Breno Leitao

Introduce a generic infrastructure for tracking recoverable hardware
errors (HW errors that did not cause a panic) and record them for vmcore
consumption. This aids post-mortem crash analysis tools by preserving
a count and timestamp for the last occurrence of such errors.

This patch adds centralized logging for three common sources of
recoverable hardware errors:

  - PCIe AER Correctable errors
  - x86 Machine Check Exceptions (MCE)
  - APEI/CPER GHES corrected or recoverable errors

hwerror_tracking is write-only at kernel runtime, and it is meant to be
read from vmcore using tools like crash/drgn. For example, this is what
it looks like when opening the crashdump in drgn:

	>>> prog['hwerror_tracking']
	(struct hwerror_tracking_info [3]){
		{
			.count = (int)844,
			.timestamp = (time64_t)1752852018,
		},
		...

Suggested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes in v2:
- Split the counter by recoverable error (Tony Luck)
- Link to v1: https://lore.kernel.org/r/20250714-vmcore_hw_error-v1-1-8cf45edb6334@debian.org
---
 arch/x86/kernel/cpu/mce/core.c |  3 +++
 drivers/acpi/apei/ghes.c       |  8 ++++++--
 drivers/pci/pcie/aer.c         |  2 ++
 include/linux/vmcore_info.h    | 14 ++++++++++++++
 kernel/vmcore_info.c           | 18 ++++++++++++++++++
 5 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81d..781cf574642eb 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -45,6 +45,7 @@
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
 #include <asm/cpu_device_id.h>
@@ -1692,6 +1693,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
 out:
 	instrumentation_end();
 
+	/* Given it didn't panic, mark it as recoverable */
+	hwerror_tracking_log(HWE_RECOV_MCE);
 clear:
 	mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
 }
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a0d54993edb3b..396cdffbe6a37 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/uuid.h>
 #include <linux/ras.h>
 #include <linux/task_work.h>
+#include <linux/vmcore_info.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -1136,13 +1137,16 @@ static int ghes_proc(struct ghes *ghes)
 {
 	struct acpi_hest_generic_status *estatus = ghes->estatus;
 	u64 buf_paddr;
-	int rc;
+	int rc, sev;
 
 	rc = ghes_read_estatus(ghes, estatus, &buf_paddr, FIX_APEI_GHES_IRQ);
 	if (rc)
 		goto out;
 
-	if (ghes_severity(estatus->error_severity) >= GHES_SEV_PANIC)
+	sev = ghes_severity(estatus->error_severity);
+	if (sev == GHES_SEV_RECOVERABLE || sev ==  GHES_SEV_CORRECTED)
+		hwerror_tracking_log(HWE_RECOV_GHES);
+	else if (sev >= GHES_SEV_PANIC)
 		__ghes_panic(ghes, estatus, buf_paddr, FIX_APEI_GHES_IRQ);
 
 	if (!ghes_estatus_cached(estatus)) {
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e286c197d7167..064d220564b59 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -30,6 +30,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
+#include <linux/vmcore_info.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
@@ -746,6 +747,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
 	switch (info->severity) {
 	case AER_CORRECTABLE:
 		aer_info->dev_total_cor_errs++;
+		hwerror_tracking_log(HWE_RECOV_AER);
 		counter = &aer_info->dev_cor_errs[0];
 		max = AER_MAX_TYPEOF_COR_ERRS;
 		break;
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 37e003ae52626..5894da92a6ba4 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -77,4 +77,18 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+enum hwerror_tracking_source {
+	HWE_RECOV_AER,	/* PCIe AER correctable errors */
+	HWE_RECOV_MCE,	/* x86 machine checks that did not panic */
+	HWE_RECOV_GHES,	/* APEI/GHES corrected or recoverable errors */
+	HWE_RECOV_MAX,	/* number of sources; not a valid source itself */
+};
+
+#ifdef CONFIG_VMCORE_INFO
+void hwerror_tracking_log(enum hwerror_tracking_source src);
+#else /* !CONFIG_VMCORE_INFO: no-op stub; static inline so each includer gets a private copy */
+static inline void hwerror_tracking_log(enum hwerror_tracking_source src) {}
+#endif /* CONFIG_VMCORE_INFO */
+
 #endif /* LINUX_VMCORE_INFO_H */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f89..23d7ddcd55cdd 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+struct hwerror_tracking_info {
+	int __data_racy count;		/* incremented once per logged event */
+	time64_t __data_racy timestamp;	/* ktime_get_real_seconds() at the latest event */
+};
+
+static struct hwerror_tracking_info hwerror_tracking[HWE_RECOV_MAX]; /* one slot per source */
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
@@ -118,6 +125,17 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+void hwerror_tracking_log(enum hwerror_tracking_source src)
+{
+	if (src < 0 || src >= HWE_RECOV_MAX)
+		return;
+
+	/* No atomics/locks needed: precision is not important here */
+	hwerror_tracking[src].count++;
+	hwerror_tracking[src].timestamp = ktime_get_real_seconds();
+}
+EXPORT_SYMBOL_GPL(hwerror_tracking_log);
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);

---
base-commit: 97987520025658f30bb787a99ffbd9bbff9ffc9d
change-id: 20250707-vmcore_hw_error-322429e6c316

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
  2025-07-21 10:13 [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
@ 2025-07-21 13:40 ` kernel test robot
  2025-07-21 13:57 ` Borislav Petkov
  2025-07-21 16:24 ` kernel test robot
  2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2025-07-21 13:40 UTC (permalink / raw)
  To: Breno Leitao, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov, Robert Moore, Thomas Gleixner,
	Ingo Molnar, Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: oe-kbuild-all, linux-media, linux-acpi, linux-kernel,
	acpica-devel, osandov, xueshuai, konrad.wilk, linux-edac,
	linuxppc-dev, linux-pci, kernel-team, Breno Leitao

Hi Breno,

kernel test robot noticed the following build errors:

[auto build test ERROR on 97987520025658f30bb787a99ffbd9bbff9ffc9d]

url:    https://github.com/intel-lab-lkp/linux/commits/Breno-Leitao/vmcoreinfo-Track-and-log-recoverable-hardware-errors/20250721-181439
base:   97987520025658f30bb787a99ffbd9bbff9ffc9d
patch link:    https://lore.kernel.org/r/20250721-vmcore_hw_error-v2-1-ab65a6b43c5a%40debian.org
patch subject: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
config: x86_64-buildonly-randconfig-004-20250721 (https://download.01.org/0day-ci/archive/20250721/202507212132.OA9HtsQY-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250721/202507212132.OA9HtsQY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507212132.OA9HtsQY-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/kexec.h:18,
                    from init/initramfs.c:603:
>> include/linux/vmcore_info.h:91:6: warning: no previous prototype for 'hwerror_tracking_log' [-Wmissing-prototypes]
      91 | void hwerror_tracking_log(enum hwerror_tracking_source src) {};
         |      ^~~~~~~~~~~~~~~~~~~~
--
   ld: arch/x86/kernel/traps.o: in function `hwerror_tracking_log':
>> traps.c:(.text+0x68e): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/traps.o: in function `__pfx_hwerror_tracking_log':
>> traps.c:(.text+0x67e): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/dumpstack_64.o: in function `hwerror_tracking_log':
   dumpstack_64.c:(.text+0x10): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/dumpstack_64.o: in function `__pfx_hwerror_tracking_log':
   dumpstack_64.c:(.text+0x0): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/dumpstack.o: in function `__pfx_hwerror_tracking_log':
   dumpstack.c:(.text+0xc9): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/dumpstack.o: in function `hwerror_tracking_log':
   dumpstack.c:(.text+0xd9): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/setup.o: in function `hwerror_tracking_log':
   setup.c:(.text+0x54): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/setup.o: in function `__pfx_hwerror_tracking_log':
   setup.c:(.text+0x44): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/e820.o: in function `__pfx_hwerror_tracking_log':
   e820.c:(.text+0x19f): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/e820.o: in function `hwerror_tracking_log':
   e820.c:(.text+0x1af): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/cpu/mce/core.o: in function `hwerror_tracking_log':
   core.c:(.text+0x14f0): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/cpu/mce/core.o: in function `__pfx_hwerror_tracking_log':
   core.c:(.text+0x14e0): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/acpi/madt_wakeup.o: in function `hwerror_tracking_log':
   madt_wakeup.c:(.text+0x180): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/acpi/madt_wakeup.o: in function `__pfx_hwerror_tracking_log':
   madt_wakeup.c:(.text+0x170): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/reboot.o: in function `hwerror_tracking_log':
   reboot.c:(.text+0x13d): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/reboot.o: in function `__pfx_hwerror_tracking_log':
   reboot.c:(.text+0x12d): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/smp.o: in function `hwerror_tracking_log':
   smp.c:(.text+0x28e): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/smp.o: in function `__pfx_hwerror_tracking_log':
   smp.c:(.text+0x27e): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/smpboot.o: in function `hwerror_tracking_log':
   smpboot.c:(.text+0x8fe): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/smpboot.o: in function `__pfx_hwerror_tracking_log':
   smpboot.c:(.text+0x8ee): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/setup_percpu.o: in function `hwerror_tracking_log':
   setup_percpu.c:(.text+0x10): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/setup_percpu.o: in function `__pfx_hwerror_tracking_log':
   setup_percpu.c:(.text+0x0): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/machine_kexec_64.o: in function `hwerror_tracking_log':
   machine_kexec_64.c:(.text+0x772): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/machine_kexec_64.o: in function `__pfx_hwerror_tracking_log':
   machine_kexec_64.c:(.text+0x762): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/kexec-bzimage64.o: in function `hwerror_tracking_log':
   kexec-bzimage64.c:(.text+0xb9c): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/kexec-bzimage64.o: in function `__pfx_hwerror_tracking_log':
   kexec-bzimage64.c:(.text+0xb8c): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: arch/x86/kernel/early_printk.o: in function `hwerror_tracking_log':
   early_printk.c:(.text+0x31e): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: arch/x86/kernel/early_printk.o: in function `__pfx_hwerror_tracking_log':
   early_printk.c:(.text+0x30e): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/panic.o: in function `hwerror_tracking_log':
   panic.c:(.text+0x466): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/panic.o: in function `__pfx_hwerror_tracking_log':
   panic.c:(.text+0x456): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/ksysfs.o: in function `hwerror_tracking_log':
   ksysfs.c:(.text+0x1f6): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/ksysfs.o: in function `__pfx_hwerror_tracking_log':
   ksysfs.c:(.text+0x1e6): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/reboot.o: in function `hwerror_tracking_log':
   reboot.c:(.text+0xd31): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/reboot.o: in function `__pfx_hwerror_tracking_log':
   reboot.c:(.text+0xd21): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/printk/printk.o: in function `hwerror_tracking_log':
   printk.c:(.text+0x2119): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/printk/printk.o: in function `__pfx_hwerror_tracking_log':
   printk.c:(.text+0x2109): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/kexec_core.o: in function `hwerror_tracking_log':
   kexec_core.c:(.text+0x346): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/kexec_core.o: in function `__pfx_hwerror_tracking_log':
   kexec_core.c:(.text+0x336): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/kexec_file.o: in function `hwerror_tracking_log':
   kexec_file.c:(.text+0x42a): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/kexec_file.o: in function `__pfx_hwerror_tracking_log':
   kexec_file.c:(.text+0x41a): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: kernel/kexec_handover.o: in function `hwerror_tracking_log':
   kexec_handover.c:(.text+0xf63): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: kernel/kexec_handover.o: in function `__pfx_hwerror_tracking_log':
   kexec_handover.c:(.text+0xf53): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: mm/mm_init.o: in function `hwerror_tracking_log':
   mm_init.c:(.text+0x26d): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: mm/mm_init.o: in function `__pfx_hwerror_tracking_log':
   mm_init.c:(.text+0x25d): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: block/blk-mq.o: in function `hwerror_tracking_log':
   blk-mq.c:(.text+0x48d3): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: block/blk-mq.o: in function `__pfx_hwerror_tracking_log':
   blk-mq.c:(.text+0x48c3): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: drivers/iommu/dma-iommu.o: in function `hwerror_tracking_log':
   dma-iommu.c:(.text+0x196e): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: drivers/iommu/dma-iommu.o: in function `__pfx_hwerror_tracking_log':
   dma-iommu.c:(.text+0x195e): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: drivers/md/dm-ioctl.o: in function `hwerror_tracking_log':
   dm-ioctl.c:(.text+0x2f65): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: drivers/md/dm-ioctl.o: in function `__pfx_hwerror_tracking_log':
   dm-ioctl.c:(.text+0x2f55): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here
   ld: drivers/firmware/efi/efi.o: in function `hwerror_tracking_log':
   efi.c:(.text+0x5f2): multiple definition of `hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x80): first defined here
   ld: drivers/firmware/efi/efi.o: in function `__pfx_hwerror_tracking_log':
   efi.c:(.text+0x5e2): multiple definition of `__pfx_hwerror_tracking_log'; init/initramfs.o:initramfs.c:(.text+0x70): first defined here

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
  2025-07-21 10:13 [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
  2025-07-21 13:40 ` kernel test robot
@ 2025-07-21 13:57 ` Borislav Petkov
  2025-07-21 15:43   ` Breno Leitao
  2025-07-21 16:24 ` kernel test robot
  2 siblings, 1 reply; 5+ messages in thread
From: Borislav Petkov @ 2025-07-21 13:57 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Robert Moore, Thomas Gleixner, Ingo Molnar, Dave Hansen, x86,
	H. Peter Anvin, Hanjun Guo, Mauro Carvalho Chehab,
	Mahesh J Salgaonkar, Oliver O'Halloran, Bjorn Helgaas,
	linux-acpi, linux-kernel, acpica-devel, osandov, xueshuai,
	konrad.wilk, linux-edac, linuxppc-dev, linux-pci, kernel-team

On Mon, Jul 21, 2025 at 03:13:40AM -0700, Breno Leitao wrote:
> Introduce a generic infrastructure for tracking recoverable hardware
> errors (HW errors that did not cause a panic) and record them for vmcore
> consumption. This aids post-mortem crash analysis tools by preserving
> a count and timestamp for the last occurrence of such errors.
> 
> This patch adds centralized logging for three common sources of

"Add centralized... "

> recoverable hardware errors:
> 
>   - PCIe AER Correctable errors
>   - x86 Machine Check Exceptions (MCE)
>   - APEI/CPER GHES corrected or recoverable errors
> 
> hwerror_tracking is write-only at kernel runtime, and it is meant to be
> read from vmcore using tools like crash/drgn. For example, this is how
> it looks like when opening the crashdump from drgn.
> 
> 	>>> prog['hwerror_tracking']
> 	(struct hwerror_tracking_info [3]){
> 		{
> 			.count = (int)844,
> 			.timestamp = (time64_t)1752852018,
> 		},
> 		...
> 

I'm still missing the justification why rasdaemon can't be used here.
You did explain it already in past emails.

> +enum hwerror_tracking_source {
> +	HWE_RECOV_AER,
> +	HWE_RECOV_MCE,
> +	HWE_RECOV_GHES,
> +	HWE_RECOV_MAX,
> +};

Are we confident this separation will serve all cloud dudes?

> +
> +#ifdef CONFIG_VMCORE_INFO
> +void hwerror_tracking_log(enum hwerror_tracking_source src);
> +#else
> +void hwerror_tracking_log(enum hwerror_tracking_source src) {};
> +#endif
> +
>  #endif /* LINUX_VMCORE_INFO_H */
> diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
> index e066d31d08f89..23d7ddcd55cdd 100644
> --- a/kernel/vmcore_info.c
> +++ b/kernel/vmcore_info.c
> @@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
>  /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
>  static unsigned char *vmcoreinfo_data_safecopy;
>  
> +struct hwerror_tracking_info {
> +	int __data_racy count;
> +	time64_t __data_racy timestamp;
> +};
> +
> +static struct hwerror_tracking_info hwerror_tracking[HWE_RECOV_MAX];
> +
>  Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
>  			  void *data, size_t data_len)
>  {
> @@ -118,6 +125,17 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
>  }
>  EXPORT_SYMBOL(paddr_vmcoreinfo_note);
>  
> +void hwerror_tracking_log(enum hwerror_tracking_source src)

A function should have a verb in its name explaining what it does:

hwerr_log_error_type()

or so.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
  2025-07-21 13:57 ` Borislav Petkov
@ 2025-07-21 15:43   ` Breno Leitao
  0 siblings, 0 replies; 5+ messages in thread
From: Breno Leitao @ 2025-07-21 15:43 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Robert Moore, Thomas Gleixner, Ingo Molnar, Dave Hansen, x86,
	H. Peter Anvin, Hanjun Guo, Mauro Carvalho Chehab,
	Mahesh J Salgaonkar, Oliver O'Halloran, Bjorn Helgaas,
	linux-acpi, linux-kernel, acpica-devel, osandov, xueshuai,
	konrad.wilk, linux-edac, linuxppc-dev, linux-pci, kernel-team

Hello Borislav,

On Mon, Jul 21, 2025 at 03:57:18PM +0200, Borislav Petkov wrote:
> On Mon, Jul 21, 2025 at 03:13:40AM -0700, Breno Leitao wrote:
> > Introduce a generic infrastructure for tracking recoverable hardware
> > errors (HW errors that did not cause a panic) and record them for vmcore
> > consumption. This aids post-mortem crash analysis tools by preserving
> > a count and timestamp for the last occurrence of such errors.
> > 
> > This patch adds centralized logging for three common sources of
> 
> "Add centralized... "

Ack!

> > recoverable hardware errors:
> > 
> >   - PCIe AER Correctable errors
> >   - x86 Machine Check Exceptions (MCE)
> >   - APEI/CPER GHES corrected or recoverable errors
> > 
> > hwerror_tracking is write-only at kernel runtime, and it is meant to be
> > read from vmcore using tools like crash/drgn. For example, this is how
> > it looks like when opening the crashdump from drgn.
> > 
> > 	>>> prog['hwerror_tracking']
> > 	(struct hwerror_tracking_info [3]){
> > 		{
> > 			.count = (int)844,
> > 			.timestamp = (time64_t)1752852018,
> > 		},
> > 		...
> > 
> 
> I'm still missing the justification why rasdaemon can't be used here.
> You did explain it already in past emails.

Sorry, I will update it.

> > +enum hwerror_tracking_source {
> > +	HWE_RECOV_AER,
> > +	HWE_RECOV_MCE,
> > +	HWE_RECOV_GHES,
> > +	HWE_RECOV_MAX,
> > +};
> 
> Are we confident this separation will serve all cloud dudes?

I am not, but, I've added them to CC list of this patch, so, they are
more than free to chime in.

> > +void hwerror_tracking_log(enum hwerror_tracking_source src)
> 
> A function should have a verb in its name explaining what it does:
> 
> hwerr_log_error_type()
> 
> or so.

Ack!

I will wait a bit more and send an updated version.

Thanks for the review
--breno

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
  2025-07-21 10:13 [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
  2025-07-21 13:40 ` kernel test robot
  2025-07-21 13:57 ` Borislav Petkov
@ 2025-07-21 16:24 ` kernel test robot
  2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2025-07-21 16:24 UTC (permalink / raw)
  To: Breno Leitao, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov, Robert Moore, Thomas Gleixner,
	Ingo Molnar, Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: llvm, oe-kbuild-all, linux-media, linux-acpi, linux-kernel,
	acpica-devel, osandov, xueshuai, konrad.wilk, linux-edac,
	linuxppc-dev, linux-pci, kernel-team, Breno Leitao

Hi Breno,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 97987520025658f30bb787a99ffbd9bbff9ffc9d]

url:    https://github.com/intel-lab-lkp/linux/commits/Breno-Leitao/vmcoreinfo-Track-and-log-recoverable-hardware-errors/20250721-181439
base:   97987520025658f30bb787a99ffbd9bbff9ffc9d
patch link:    https://lore.kernel.org/r/20250721-vmcore_hw_error-v2-1-ab65a6b43c5a%40debian.org
patch subject: [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors
config: i386-buildonly-randconfig-001-20250721 (https://download.01.org/0day-ci/archive/20250722/202507220057.iVSR8aqd-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250722/202507220057.iVSR8aqd-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507220057.iVSR8aqd-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from init/initramfs.c:603:
   In file included from include/linux/kexec.h:18:
>> include/linux/vmcore_info.h:91:6: warning: no previous prototype for function 'hwerror_tracking_log' [-Wmissing-prototypes]
      91 | void hwerror_tracking_log(enum hwerror_tracking_source src) {};
         |      ^
   include/linux/vmcore_info.h:91:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
      91 | void hwerror_tracking_log(enum hwerror_tracking_source src) {};
         | ^
         | static 
   1 warning generated.


vim +/hwerror_tracking_log +91 include/linux/vmcore_info.h

    87	
    88	#ifdef CONFIG_VMCORE_INFO
    89	void hwerror_tracking_log(enum hwerror_tracking_source src);
    90	#else
  > 91	void hwerror_tracking_log(enum hwerror_tracking_source src) {};
    92	#endif
    93	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-07-21 16:24 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-21 10:13 [PATCH v2] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
2025-07-21 13:40 ` kernel test robot
2025-07-21 13:57 ` Borislav Petkov
2025-07-21 15:43   ` Breno Leitao
2025-07-21 16:24 ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).