[PATCH v4] vmcoreinfo: Track and log recoverable hardware errors

acpica-devel.lists.linux.dev archive mirror
 help / color / mirror / Atom feed

* [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
@ 2025-08-01 12:31 Breno Leitao
  2025-08-01 14:52 ` Dave Hansen
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Breno Leitao @ 2025-08-01 12:31 UTC (permalink / raw)
  To: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: linux-acpi, linux-kernel, acpica-devel, osandov, xueshuai,
	konrad.wilk, linux-edac, linuxppc-dev, linux-pci, kernel-team,
	osandov, Breno Leitao

Introduce a generic infrastructure for tracking recoverable hardware
errors (HW errors that are visible to the OS but do not cause a panic)
and record them for vmcore consumption. This aids post-mortem crash
analysis tools by preserving a count and timestamp for the last
occurrence of such errors. On the other side, correctable errors, which
the OS typically remains unaware of because the underlying hardware
handles them transparently, are less relevant for crash dump
and therefore are NOT tracked in this infrastructure.

Add centralized logging for sources of recoverable hardware
errors, classified by the hardware subsystem that reported them.

hwerr_data is write-only at kernel runtime, and it is meant to be read
from vmcore using tools like crash/drgn. For example, this is what it
looks like when opening the crashdump from drgn.

	>>> prog['hwerr_data']
	(struct hwerr_info[6]){
		{
			.count = (int)844,
			.timestamp = (time64_t)1752852018,
		},
		...

This helps fleet operators quickly triage whether a crash may be
influenced by recoverable hardware errors (which execute an uncommon
code path in the kernel), especially when recoverable errors occurred
shortly before a panic, such as the bug fixed by
commit ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them
when destroying the pool")

This is not intended to replace full hardware diagnostics but provides
a fast way to correlate hardware events with kernel panics.

Rare machine check exceptions—like those indicated by mce_flags.p5 or
mce_flags.winchip—are not accounted for in this method, as they fall
outside the intended usage scope for this feature’s user base.

Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes in v4:
- Split the error by hardware subsystem instead of kernel
  subsystem/driver (Shuai)
- Do not count the corrected errors, only focusing on recoverable errors (Shuai)
- Link to v3: https://lore.kernel.org/r/20250722-vmcore_hw_error-v3-1-ff0683fc1f17@debian.org

Changes in v3:
- Add more information about this feature in the commit message
  (Borislav Petkov)
- Renamed the function to hwerr_log_error_type() and use hwerr as
  prefix (Borislav Petkov)
- Make the empty function static inline (kernel test robot)
- Link to v2: https://lore.kernel.org/r/20250721-vmcore_hw_error-v2-1-ab65a6b43c5a@debian.org

Changes in v2:
- Split the counter by recoverable error (Tony Luck)
- Link to v1: https://lore.kernel.org/r/20250714-vmcore_hw_error-v1-1-8cf45edb6334@debian.org
---
 arch/x86/kernel/cpu/mce/core.c |  4 ++++
 drivers/acpi/apei/ghes.c       | 36 ++++++++++++++++++++++++++++++++++++
 drivers/pci/pcie/aer.c         |  2 ++
 include/linux/vmcore_info.h    | 17 +++++++++++++++++
 kernel/vmcore_info.c           | 18 ++++++++++++++++++
 5 files changed, 77 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81d..f85759453f89a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -45,6 +45,7 @@
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
 #include <asm/cpu_device_id.h>
@@ -1690,6 +1691,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
 	}
 
 out:
+	/* Given it didn't panic, mark it as recoverable */
+	hwerr_log_error_type(HWERR_RECOV_MCE);
+
 	instrumentation_end();
 
 clear:
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a0d54993edb3b..562459e9d632e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/uuid.h>
 #include <linux/ras.h>
 #include <linux/task_work.h>
+#include <linux/vmcore_info.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -867,6 +868,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
 
+static void ghes_log_hwerr(int sev, guid_t *sec_type)
+{
+	if (sev != CPER_SEV_RECOVERABLE)
+		return;
+
+	if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+		hwerr_log_error_type(HWERR_RECOV_CPU);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+		hwerr_log_error_type(HWERR_RECOV_CXL);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PCIE) ||
+	    guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) {
+		hwerr_log_error_type(HWERR_RECOV_PCI);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
+		hwerr_log_error_type(HWERR_RECOV_MEMORY);
+		return;
+	}
+
+	hwerr_log_error_type(HWERR_RECOV_OTHERS);
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -888,6 +923,7 @@ static void ghes_do_proc(struct ghes *ghes,
 		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
 			fru_text = gdata->fru_text;
 
+		ghes_log_hwerr(sev, sec_type);
 		if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 70ac661883672..fe0174b972a7b 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -30,6 +30,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
+#include <linux/vmcore_info.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
@@ -751,6 +752,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
 		break;
 	case AER_NONFATAL:
 		aer_info->dev_total_nonfatal_errs++;
+		hwerr_log_error_type(HWERR_RECOV_PCI);
 		counter = &aer_info->dev_nonfatal_errs[0];
 		max = AER_MAX_TYPEOF_UNCOR_ERRS;
 		break;
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 37e003ae52626..538a3635fb1e5 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -77,4 +77,21 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+enum hwerr_error_type {
+	HWERR_RECOV_MCE,
+	HWERR_RECOV_CPU,
+	HWERR_RECOV_MEMORY,
+	HWERR_RECOV_PCI,
+	HWERR_RECOV_CXL,
+	HWERR_RECOV_OTHERS,
+	HWERR_RECOV_MAX,
+};
+
+#ifdef CONFIG_VMCORE_INFO
+noinstr void hwerr_log_error_type(enum hwerr_error_type src);
+#else
+static inline void hwerr_log_error_type(enum hwerr_error_type src) {};
+#endif
+
 #endif /* LINUX_VMCORE_INFO_H */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f89..4b5ab45d468f5 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+struct hwerr_info {
+	int __data_racy count;
+	time64_t __data_racy timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
@@ -118,6 +125,17 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+	if (src < 0 || src >= HWERR_RECOV_MAX)
+		return;
+
+	/* No need for atomics/locks given the precision is not important */
+	hwerr_data[src].count++;
+	hwerr_data[src].timestamp = ktime_get_real_seconds();
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);

---
base-commit: 89748acdf226fd1a8775ff6fa2703f8412b286c8
change-id: 20250707-vmcore_hw_error-322429e6c316

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 12:31 [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
@ 2025-08-01 14:52 ` Dave Hansen
  2025-08-01 15:13   ` Breno Leitao
  2025-08-02  0:51 ` kernel test robot
  2025-08-04  0:05 ` kernel test robot
  2 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2025-08-01 14:52 UTC (permalink / raw)
  To: Breno Leitao, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov, Robert Moore, Thomas Gleixner,
	Ingo Molnar, Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: linux-acpi, linux-kernel, acpica-devel, osandov, xueshuai,
	konrad.wilk, linux-edac, linuxppc-dev, linux-pci, kernel-team,
	osandov

On 8/1/25 05:31, Breno Leitao wrote:
> Introduce a generic infrastructure for tracking recoverable hardware
> errors (HW errors that are visible to the OS but does not cause a panic)
> and record them for vmcore consumption.
...

Are there patches for the consumer side of this, too? Or do humans
looking at crash dumps have to know what to go digging for?

In either case, don't we need documentation for this new ABI?

> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 4da4eab56c81d..f85759453f89a 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -45,6 +45,7 @@
>  #include <linux/task_work.h>
>  #include <linux/hardirq.h>
>  #include <linux/kexec.h>
> +#include <linux/vmcore_info.h>
>  
>  #include <asm/fred.h>
>  #include <asm/cpu_device_id.h>
> @@ -1690,6 +1691,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
>  	}
>  
>  out:
> +	/* Given it didn't panic, mark it as recoverable */
> +	hwerr_log_error_type(HWERR_RECOV_MCE);
> +

Does "MCE" mean anything outside of x86?

I wonder if this would be better left as "HWERR_RECOV_ARCH" or something.

...
> +void hwerr_log_error_type(enum hwerr_error_type src)
> +{
> +	if (src < 0 || src >= HWERR_RECOV_MAX)
> +		return;
> +
> +	/* No need to atomics/locks given the precision is not important */

Sure, but it's not even more lines of code to do:

	atomic_inc(&hwerr_data[src].count);
	WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());

So why not?

> +	hwerr_data[src].count++;
> +	hwerr_data[src].timestamp = ktime_get_real_seconds();
> +}
> +EXPORT_SYMBOL_GPL(hwerr_log_error_type);

I'd also love to hear more about _actual_ users of this. Surely, someone
hit a real world problem and thought this would be a nifty solution. Who
was that? What problem did they hit? How does this help them?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 14:52 ` Dave Hansen
@ 2025-08-01 15:13   ` Breno Leitao
  2025-08-01 16:24     ` Dave Hansen
  0 siblings, 1 reply; 11+ messages in thread
From: Breno Leitao @ 2025-08-01 15:13 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

Hello Dave,

On Fri, Aug 01, 2025 at 07:52:17AM -0700, Dave Hansen wrote:
> On 8/1/25 05:31, Breno Leitao wrote:
> > Introduce a generic infrastructure for tracking recoverable hardware
> > errors (HW errors that are visible to the OS but does not cause a panic)
> > and record them for vmcore consumption.
> ...
> 
> Are there patches for the consumer side of this, too? Or do humans
> looking at crash dumps have to know what to go digging for?
> 
> In either case, don't we need documentation for this new ABI?

I have considered this, but the documentation for vmcoreinfo
(admin-guide/kdump/vmcoreinfo.rst) solely documents what is explicitly
exposed by vmcore, which differs from the nature of these counters.

Where would be a good place to document it?

> > @@ -1690,6 +1691,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
> >  	}
> >  
> >  out:
> > +	/* Given it didn't panic, mark it as recoverable */
> > +	hwerr_log_error_type(HWERR_RECOV_MCE);
> > +
> 
> Does "MCE" mean anything outside of x86?

AFAIK this is a MCE concept.

> I wonder if this would be better left as "HWERR_RECOV_ARCH" or something.

Sure. I can update it to be more generic.

> > +void hwerr_log_error_type(enum hwerr_error_type src)
> > +{
> > +	if (src < 0 || src >= HWERR_RECOV_MAX)
> > +		return;
> > +
> > +	/* No need to atomics/locks given the precision is not important */
> 
> Sure, but it's not even more lines of code to do:
> 
> 	atomic_inc(&hwerr_data[src].count);
> 	WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
> 
> So why not?

Sure, we can do that, I will update it also.

> > +	hwerr_data[src].count++;
> > +	hwerr_data[src].timestamp = ktime_get_real_seconds();
> > +}
> > +EXPORT_SYMBOL_GPL(hwerr_log_error_type);
> 
> I'd also love to hear more about _actual_ users of this. Surely, someone
> hit a real world problem and thought this would be a nifty solution. Who
> was that? What problem did they hit? How does this help them?

Yes, this has been extensively discussed in the very first version of
the patch. Borislav raised the same question, which was discussed in the
following link:

https://lore.kernel.org/all/20250715125327.GGaHZPRz9QLNNO-7q8@fat_crate.local/

Thanks for the review,
--breno

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 15:13   ` Breno Leitao
@ 2025-08-01 16:24     ` Dave Hansen
  2025-08-01 17:00       ` Breno Leitao
  0 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2025-08-01 16:24 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

On 8/1/25 08:13, Breno Leitao wrote:
> Hello Dave,
> 
> On Fri, Aug 01, 2025 at 07:52:17AM -0700, Dave Hansen wrote:
>> On 8/1/25 05:31, Breno Leitao wrote:
>>> Introduce a generic infrastructure for tracking recoverable hardware
>>> errors (HW errors that are visible to the OS but does not cause a panic)
>>> and record them for vmcore consumption.
>> ...
>>
>> Are there patches for the consumer side of this, too? Or do humans
>> looking at crash dumps have to know what to go digging for?
>>
>> In either case, don't we need documentation for this new ABI?
> 
> I have considered this, but the documentation for vmcoreinfo
> (admin-guide/kdump/vmcoreinfo.rst) solely documents what is explicitly
> exposed by vmcore, which differs from the nature of these counters.
> 
> Where would be a good place to document it?

I'm not picky. But you also didn't quite answer the question I was asking.

Is this new data for humans or machines to read?

>>> @@ -1690,6 +1691,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>>  	}
>>>  
>>>  out:
>>> +	/* Given it didn't panic, mark it as recoverable */
>>> +	hwerr_log_error_type(HWERR_RECOV_MCE);
>>> +
>>
>> Does "MCE" mean anything outside of x86?
> 
> AFAIK this is a MCE concept.

I'm not really sure what that response means.

There are two problems here. First is that HWERR_RECOV_MCE is defined in
arch-generic code, but it may never get used by anything other than x86
when CONFIG_X86_MCE.

That also completely wastes space in your data structure when
CONFIG_X86_MCE=n. Not a huge deal as-is, but it's still a bit sloppy
and wasteful.

...
>>> +	hwerr_data[src].count++;
>>> +	hwerr_data[src].timestamp = ktime_get_real_seconds();
>>> +}
>>> +EXPORT_SYMBOL_GPL(hwerr_log_error_type);
>>
>> I'd also love to hear more about _actual_ users of this. Surely, someone
>> hit a real world problem and thought this would be a nifty solution. Who
>> was that? What problem did they hit? How does this help them?
> 
> Yes, this has been extensively discussed in the very first version of
> the patch. Borislav raised the same question, which was discussed in the
> following link:
> 
> https://lore.kernel.org/all/20250715125327.GGaHZPRz9QLNNO-7q8@fat_crate.local/

When someone raises a concern, we usually try to alleviate the concern
in a way that is self-contained in the next posting. A cover letter with
a full explanation would be one place to put the reasoning, for example.

But expecting future reviewers to plod through all the old threads isn't
really feasible.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 16:24     ` Dave Hansen
@ 2025-08-01 17:00       ` Breno Leitao
  2025-08-01 17:06         ` Dave Hansen
  0 siblings, 1 reply; 11+ messages in thread
From: Breno Leitao @ 2025-08-01 17:00 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

hello Dave,

On Fri, Aug 01, 2025 at 09:24:43AM -0700, Dave Hansen wrote:
> On 8/1/25 08:13, Breno Leitao wrote:
> > On Fri, Aug 01, 2025 at 07:52:17AM -0700, Dave Hansen wrote:
> >> On 8/1/25 05:31, Breno Leitao wrote:
> >>> Introduce a generic infrastructure for tracking recoverable hardware
> >>> errors (HW errors that are visible to the OS but does not cause a panic)
> >>> and record them for vmcore consumption.
> >> ...
> >>
> >> Are there patches for the consumer side of this, too? Or do humans
> >> looking at crash dumps have to know what to go digging for?
> >>
> >> In either case, don't we need documentation for this new ABI?
> > 
> > I have considered this, but the documentation for vmcoreinfo
> > (admin-guide/kdump/vmcoreinfo.rst) solely documents what is explicitly
> > exposed by vmcore, which differs from the nature of these counters.
> > 
> > Where would be a good place to document it?
> 
> I'm not picky. But you also didn't quite answer the question I was asking.
> 
> Is this new data for humans or machines to read?

I would say that the main consumers of this are post-mortem tools that
collect information from the vmcore and do diagnostics and correlation.
This is common tooling for cloud providers, AFAIK.

In my work environment, there is a script that runs `drgn` on every
vmcore to capture information that would help operators address the
problem. The information that this patch is proposing adds another field
that would help to potentially correlate crashes with recoverable errors.

> >> Does "MCE" mean anything outside of x86?
> > 
> > AFAIK this is a MCE concept.
> 
> I'm not really sure what that response means.
> 
> There are two problems here. First is that HWERR_RECOV_MCE is defined in
> arch-generic code, but it may never get used by anything other than x86
> when CONFIG_X86_MCE.
> 
> That also completely wastes space in your data structure when
> HWERR_RECOV_MCE=n. Not a huge deal as-is, but it's still a bit sloppy
> and wasteful.

Would a solution like this look better?

	enum hwerr_error_type {
		HWERR_RECOV_CPU,
		HWERR_RECOV_MEMORY,
		HWERR_RECOV_PCI,
		HWERR_RECOV_CXL,
		HWERR_RECOV_OTHERS,
	#ifdef CONFIG_X86_MCE
		HWERR_RECOV_MCE,
	#endif
		HWERR_RECOV_MAX,
	};

Or, would you prefer to have HWERR_RECOV_ARCH and keep it always there?

> >> I'd also love to hear more about _actual_ users of this. Surely, someone
> >> hit a real world problem and thought this would be a nifty solution. Who
> >> was that? What problem did they hit? How does this help them?
> > 
> > Yes, this has been extensively discussed in the very first version of
> > the patch. Borislav raised the same question, which was discussed in the
> > following link:
> > 
> > https://lore.kernel.org/all/20250715125327.GGaHZPRz9QLNNO-7q8@fat_crate.local/
> 
> When someone raises a concern, we usually try to alleviate the concern
> in a way that is self-contained in the next posting. A cover letter with
> a full explanation would be one place to put the reasoning, for example.
> 
> But expecting future reviewers to plod through all the old threads isn't
> really feasible.

Sorry. I tried to improve the documentation and added the following
text to the commit message, which was clearly not enough.

	This helps fleet operators quickly triage whether a crash may be
	influenced by hardware recoverable errors (which executes a uncommon
	code path in the kernel), especially when recoverable errors occurred
	shortly before a panic, such as the bug fixed by
	commit ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them
	when destroying the pool")

For next commit I will add a cover-letter, with the summary of the
details of that discussion.

Thanks for the review and suggestions,
--breno


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 17:00       ` Breno Leitao
@ 2025-08-01 17:06         ` Dave Hansen
  2025-08-04 17:12           ` Breno Leitao
  0 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2025-08-01 17:06 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

On 8/1/25 10:00, Breno Leitao wrote:
> Would a solution like this look better?
> 
> 	enum hwerr_error_type {
> 		HWERR_RECOV_CPU,
> 		HWERR_RECOV_MEMORY,
> 		HWERR_RECOV_PCI,
> 		HWERR_RECOV_CXL,
> 		HWERR_RECOV_OTHERS,
> 	#ifdef CONFIG_X86_MCE
> 		HWERR_RECOV_MCE,
> 	#endif
> 		HWERR_RECOV_MAX,
> 	};
> 
> Or, would you prefer to have HWERR_RECOV_ARCH and keep it always there?

That would only work for HWERR_RECOV_MCE, though. If you added another:

#ifdef CONFIG_FOO
	HWERR_RECOV_FOO
#endif

then your example of:

	>>> prog['hwerror_data']
	(struct hwerror_info[6]){
		{
			.count = (int)844,
			.timestamp = (time64_t)1752852018,
		},
		...

doesn't work any more. You wouldn't be able to tell HWERR_RECOV_MCE from
HWERR_RECOV_FOO because they'd alias to the same constant.

This whole thing is an ABI. Right?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 12:31 [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
  2025-08-01 14:52 ` Dave Hansen
@ 2025-08-02  0:51 ` kernel test robot
  2025-08-04  0:05 ` kernel test robot
  2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2025-08-02  0:51 UTC (permalink / raw)
  To: Breno Leitao, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov, Robert Moore, Thomas Gleixner,
	Ingo Molnar, Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: oe-kbuild-all, linux-media, linux-acpi, linux-kernel,
	acpica-devel, osandov, xueshuai, konrad.wilk, linux-edac,
	linuxppc-dev, linux-pci, kernel-team, osandov, Breno Leitao

Hi Breno,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 89748acdf226fd1a8775ff6fa2703f8412b286c8]

url:    https://github.com/intel-lab-lkp/linux/commits/Breno-Leitao/vmcoreinfo-Track-and-log-recoverable-hardware-errors/20250801-211624
base:   89748acdf226fd1a8775ff6fa2703f8412b286c8
patch link:    https://lore.kernel.org/r/20250801-vmcore_hw_error-v4-1-fa1fe65edb83%40debian.org
patch subject: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20250802/202508020814.lzX1CZpj-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250802/202508020814.lzX1CZpj-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508020814.lzX1CZpj-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> vmlinux.o: warning: objtool: hwerr_log_error_type+0x23: call to ktime_get_real_seconds() leaves .noinstr.text section

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 12:31 [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
  2025-08-01 14:52 ` Dave Hansen
  2025-08-02  0:51 ` kernel test robot
@ 2025-08-04  0:05 ` kernel test robot
  2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2025-08-04  0:05 UTC (permalink / raw)
  To: Breno Leitao, Rafael J. Wysocki, Len Brown, James Morse,
	Tony Luck, Borislav Petkov, Robert Moore, Thomas Gleixner,
	Ingo Molnar, Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas
  Cc: oe-kbuild-all, linux-media, linux-acpi, linux-kernel,
	acpica-devel, osandov, xueshuai, konrad.wilk, linux-edac,
	linuxppc-dev, linux-pci, kernel-team, osandov, Breno Leitao

Hi Breno,

kernel test robot noticed the following build errors:

[auto build test ERROR on 89748acdf226fd1a8775ff6fa2703f8412b286c8]

url:    https://github.com/intel-lab-lkp/linux/commits/Breno-Leitao/vmcoreinfo-Track-and-log-recoverable-hardware-errors/20250801-211624
base:   89748acdf226fd1a8775ff6fa2703f8412b286c8
patch link:    https://lore.kernel.org/r/20250801-vmcore_hw_error-v4-1-fa1fe65edb83%40debian.org
patch subject: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
config: x86_64-randconfig-076-20250803 (https://download.01.org/0day-ci/archive/20250804/202508040737.rlDPN1um-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250804/202508040737.rlDPN1um-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508040737.rlDPN1um-lkp@intel.com/

All errors (new ones prefixed by >>):

>> vmlinux.o: error: objtool: hwerr_log_error_type+0x3b: call to ktime_get_real_seconds() leaves .noinstr.text section

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-01 17:06         ` Dave Hansen
@ 2025-08-04 17:12           ` Breno Leitao
  2025-08-04 17:41             ` Dave Hansen
  0 siblings, 1 reply; 11+ messages in thread
From: Breno Leitao @ 2025-08-04 17:12 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

On Fri, Aug 01, 2025 at 10:06:51AM -0700, Dave Hansen wrote:
> On 8/1/25 10:00, Breno Leitao wrote:
> > Would a solution like this look better?
> > 
> > 	enum hwerr_error_type {
> > 		HWERR_RECOV_CPU,
> > 		HWERR_RECOV_MEMORY,
> > 		HWERR_RECOV_PCI,
> > 		HWERR_RECOV_CXL,
> > 		HWERR_RECOV_OTHERS,
> > 	#ifdef CONFIG_X86_MCE
> > 		HWERR_RECOV_MCE,
> > 	#endif
> > 		HWERR_RECOV_MAX,
> > 	};
> > 
> > Or, would you prefer to have HWERR_RECOV_ARCH and keep it always there?
> 
> That would only work for HWERR_RECOV_MCE, though. If you added another:
> 
> #ifdef CONFIG_FOO
> 	HWERR_RECOV_FOO
> #endif
> 
> then your example of:
> 
> 	>>> prog['hwerror_data']
> 	(struct hwerror_info[6]){
> 		{
> 			.count = (int)844,
> 			.timestamp = (time64_t)1752852018,
> 		},
> 		...
> 
> doesn't work any more. You wouldn't be able to tell HWERR_RECOV_MCE from
> HWERR_RECOV_FOO because they'd alias to the same constant.

Very good point, that is not what we want. Thanks for raising it.

> This whole thing is an ABI. Right?

Exactly.

I've digested the feedback a bit more and this is what it looks like now:

1) Remove HWERR_RECOV_MCE and account MCE errors in HWERR_RECOV_OTHERS
2) Use the atomic operators to increase the number of errors, according
   to Dave.
3) Have RST documentation that helps answer questions about this
   feature and lets users know that it is available.


Is this a better patch?

commit ff35595f66871ddf80cda0a8e42398738171abe1
Author: Breno Leitao <leitao@debian.org>
Date:   Thu Jul 17 07:39:26 2025 -0700

    vmcoreinfo: Track and log recoverable hardware errors
    
    Introduce a generic infrastructure for tracking recoverable hardware
    errors (HW errors that are visible to the OS but do not cause a panic)
    and record them for vmcore consumption. This aids post-mortem crash
    analysis tools by preserving a count and timestamp for the last
    occurrence of such errors. On the other side, correctable errors, which
    the OS typically remains unaware of because the underlying hardware
    handles them transparently, are less relevant for crash dump
    and therefore are NOT tracked in this infrastructure.
    
    Add centralized logging for sources of recoverable hardware
    errors based on the subsystem it has been notified.
    
    hwerror_data is write-only at kernel runtime, and it is meant to be read
    from vmcore using tools like crash/drgn. For example, this is what it
    looks like when opening the crashdump from drgn.
    
            >>> prog['hwerror_data']
            (struct hwerror_info[1]){
                    {
                            .count = (int)844,
                            .timestamp = (time64_t)1752852018,
                    },
                    ...
    
    This helps fleet operators quickly triage whether a crash may be
    influenced by recoverable hardware errors (which execute an uncommon
    code path in the kernel), especially when recoverable errors occurred
    shortly before a panic, such as the bug fixed by
    commit ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them
    when destroying the pool")
    
    This is not intended to replace full hardware diagnostics but provides
    a fast way to correlate hardware events with kernel panics quickly.
    
    Rare machine check exceptions—like those indicated by mce_flags.p5 or
    mce_flags.winchip—are not accounted for in this method, as they fall
    outside the intended usage scope for this feature’s user base.
    
    Suggested-by: Tony Luck <tony.luck@intel.com>
    Suggested-by: Shuai Xue <xueshuai@linux.alibaba.com>
    Signed-off-by: Breno Leitao <leitao@debian.org>

diff --git a/Documentation/driver-api/hw-recoverable-errors.rst b/Documentation/driver-api/hw-recoverable-errors.rst
new file mode 100644
index 0000000000000..7989258262d49
--- /dev/null
+++ b/Documentation/driver-api/hw-recoverable-errors.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+Recoverable Hardware Error Tracking in vmcoreinfo
+=================================================
+
+Overview
+--------
+
+This feature provides a generic infrastructure within the Linux kernel to track
+and log recoverable hardware errors. These are hardware errors visible to the
+OS that might not cause immediate panics but may influence system health,
+mainly because new code paths will be executed in the kernel.
+
+By recording counts and timestamps of recoverable errors into the vmcoreinfo
+crash dump notes, this infrastructure aids post-mortem crash analysis tools in
+correlating hardware events with kernel failures. This enables faster triage
+and better understanding of root causes, especially in large-scale cloud
+environments where hardware issues are common.
+
+Benefits
+--------
+
+- Facilitates correlation of hardware recoverable errors with kernel panics or
+  unusual code paths that lead to system crashes.
+- Provides operators and cloud providers quick insights, improving reliability
+  and reducing troubleshooting time.
+- Complements existing full hardware diagnostics without replacing them.
+
+Data Exposure and Consumption
+-----------------------------
+
+- The tracked error data consists of per-error-type counts and timestamps of
+  last occurrence.
+- This data is stored in the `hwerr_data` array, categorized by error source
+  types like CPU, memory, PCI, CXL, and others.
+- It is exposed via vmcoreinfo crash dump notes and can be read using tools
+  like `crash`, `drgn`, or other kernel crash analysis utilities.
+- There is no way to read this data other than from crash dumps.
+- These errors are divided by area, which includes CPU, Memory, PCI, CXL and
+  others.
+
+Typical usage example (in drgn REPL):
+
+.. code-block:: python
+
+    >>> prog['hwerr_data']
+    (struct hwerr_info[HWERR_RECOV_MAX]){
+        {
+            .count = (int)844,
+            .timestamp = (time64_t)1752852018,
+        },
+        ...
+    }
+
+Enabling
+--------
+
+- This feature is enabled when CONFIG_VMCORE_INFO is set.
+
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81d..9cc38c5ffb77a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -45,6 +45,7 @@
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
+#include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
 #include <asm/cpu_device_id.h>
@@ -1690,6 +1691,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
 	}
 
 out:
+	/* Given it didn't panic, mark it as recoverable */
+	hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
 	instrumentation_end();
 
 clear:
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a0d54993edb3b..562459e9d632e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -43,6 +43,7 @@
 #include <linux/uuid.h>
 #include <linux/ras.h>
 #include <linux/task_work.h>
+#include <linux/vmcore_info.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -867,6 +868,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
 
+static void ghes_log_hwerr(int sev, guid_t *sec_type)
+{
+	if (sev != CPER_SEV_RECOVERABLE)
+		return;
+
+	if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) ||
+	    guid_equal(sec_type, &CPER_SEC_PROC_IA)) {
+		hwerr_log_error_type(HWERR_RECOV_CPU);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) ||
+	    guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+		hwerr_log_error_type(HWERR_RECOV_CXL);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PCIE) ||
+	    guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) {
+		hwerr_log_error_type(HWERR_RECOV_PCI);
+		return;
+	}
+
+	if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
+		hwerr_log_error_type(HWERR_RECOV_MEMORY);
+		return;
+	}
+
+	hwerr_log_error_type(HWERR_RECOV_OTHERS);
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -888,6 +923,7 @@ static void ghes_do_proc(struct ghes *ghes,
 		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
 			fru_text = gdata->fru_text;
 
+		ghes_log_hwerr(sev, sec_type);
 		if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e286c197d7167..d814c06cdbee6 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -30,6 +30,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
+#include <linux/vmcore_info.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
 #include <ras/ras_event.h>
@@ -751,6 +752,7 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
 		break;
 	case AER_NONFATAL:
 		aer_info->dev_total_nonfatal_errs++;
+		hwerr_log_error_type(HWERR_RECOV_PCI);
 		counter = &aer_info->dev_nonfatal_errs[0];
 		max = AER_MAX_TYPEOF_UNCOR_ERRS;
 		break;
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 37e003ae52626..92e713c1a83d0 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -77,4 +77,20 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+enum hwerr_error_type {
+	HWERR_RECOV_CPU,
+	HWERR_RECOV_MEMORY,
+	HWERR_RECOV_PCI,
+	HWERR_RECOV_CXL,
+	HWERR_RECOV_OTHERS,
+	HWERR_RECOV_MAX,
+};
+
+#ifdef CONFIG_VMCORE_INFO
+void hwerr_log_error_type(enum hwerr_error_type src);
+#else
+static inline void hwerr_log_error_type(enum hwerr_error_type src) {};
+#endif
+
 #endif /* LINUX_VMCORE_INFO_H */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f89..fe9bf8db1922e 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+struct hwerr_info {
+	atomic_t count;
+	time64_t timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
@@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+	if (src < 0 || src >= HWERR_RECOV_MAX)
+		return;
+
+	atomic_inc(&hwerr_data[src].count);
+	WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-04 17:12           ` Breno Leitao
@ 2025-08-04 17:41             ` Dave Hansen
  2025-08-05 13:00               ` Breno Leitao
  0 siblings, 1 reply; 11+ messages in thread
From: Dave Hansen @ 2025-08-04 17:41 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

On 8/4/25 10:12, Breno Leitao wrote:
...
> +- These errros are divided by are, which includes CPU, Memory, PCI, CXL and
> +  others.

There's a double typo in there I think:

	errros => errors
and
	are,=>area,

> --- a/include/linux/vmcore_info.h
> +++ b/include/linux/vmcore_info.h
> @@ -77,4 +77,20 @@ extern u32 *vmcoreinfo_note;
>  Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
>  			  void *data, size_t data_len);
>  void final_note(Elf_Word *buf);
> +
> +enum hwerr_error_type {
> +	HWERR_RECOV_CPU,
> +	HWERR_RECOV_MEMORY,
> +	HWERR_RECOV_PCI,
> +	HWERR_RECOV_CXL,
> +	HWERR_RECOV_OTHERS,
> +	HWERR_RECOV_MAX,
> +};
That enum needs to go into an abi header.

Otherwise, this is starting to look sane to me.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors
  2025-08-04 17:41             ` Dave Hansen
@ 2025-08-05 13:00               ` Breno Leitao
  0 siblings, 0 replies; 11+ messages in thread
From: Breno Leitao @ 2025-08-05 13:00 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Rafael J. Wysocki, Len Brown, James Morse, Tony Luck,
	Borislav Petkov, Robert Moore, Thomas Gleixner, Ingo Molnar,
	Dave Hansen, x86, H. Peter Anvin, Hanjun Guo,
	Mauro Carvalho Chehab, Mahesh J Salgaonkar, Oliver O'Halloran,
	Bjorn Helgaas, linux-acpi, linux-kernel, acpica-devel, osandov,
	xueshuai, konrad.wilk, linux-edac, linuxppc-dev, linux-pci,
	kernel-team, osandov

On Mon, Aug 04, 2025 at 10:41:05AM -0700, Dave Hansen wrote:
> On 8/4/25 10:12, Breno Leitao wrote:
> ...
> > +- These errros are divided by are, which includes CPU, Memory, PCI, CXL and
> > +  others.
> 
> There's a double typo in there I think:
> 
> 	errros => errors
> and
> 	are,=>area,
> 
> > --- a/include/linux/vmcore_info.h
> > +++ b/include/linux/vmcore_info.h
> > @@ -77,4 +77,20 @@ extern u32 *vmcoreinfo_note;
> >  Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
> >  			  void *data, size_t data_len);
> >  void final_note(Elf_Word *buf);
> > +
> > +enum hwerr_error_type {
> > +	HWERR_RECOV_CPU,
> > +	HWERR_RECOV_MEMORY,
> > +	HWERR_RECOV_PCI,
> > +	HWERR_RECOV_CXL,
> > +	HWERR_RECOV_OTHERS,
> > +	HWERR_RECOV_MAX,
> > +};
> That enum needs to go into an abi header.

Agree. I came up with something like the change below. Is it the right
thing to mark the enum as stable ABI?

Thanks
--breno

diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 37e003ae52626..e71518caacdfc 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -5,6 +5,7 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#include <uapi/linux/vmcore.h>

 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
 #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
@@ -77,4 +78,11 @@ extern u32 *vmcoreinfo_note;
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
                          void *data, size_t data_len);
 void final_note(Elf_Word *buf);
+
+#ifdef CONFIG_VMCORE_INFO
+void hwerr_log_error_type(enum hwerr_error_type src);
+#else
+static inline void hwerr_log_error_type(enum hwerr_error_type src) {};
+#endif
+
 #endif /* LINUX_VMCORE_INFO_H */
diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h
index 3e9da91866ffd..2ba89fafa518a 100644
--- a/include/uapi/linux/vmcore.h
+++ b/include/uapi/linux/vmcore.h
@@ -15,4 +15,13 @@ struct vmcoredd_header {
        __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
 };

+enum hwerr_error_type {
+       HWERR_RECOV_CPU,
+       HWERR_RECOV_MEMORY,
+       HWERR_RECOV_PCI,
+       HWERR_RECOV_CXL,
+       HWERR_RECOV_OTHERS,
+       HWERR_RECOV_MAX,
+};
+
 #endif /* _UAPI_VMCORE_H */

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-08-05 13:00 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-01 12:31 [PATCH v4] vmcoreinfo: Track and log recoverable hardware errors Breno Leitao
2025-08-01 14:52 ` Dave Hansen
2025-08-01 15:13   ` Breno Leitao
2025-08-01 16:24     ` Dave Hansen
2025-08-01 17:00       ` Breno Leitao
2025-08-01 17:06         ` Dave Hansen
2025-08-04 17:12           ` Breno Leitao
2025-08-04 17:41             ` Dave Hansen
2025-08-05 13:00               ` Breno Leitao
2025-08-02  0:51 ` kernel test robot
2025-08-04  0:05 ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).