linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yazen Ghannam <yazen.ghannam@amd.com>
To: <x86@kernel.org>, Tony Luck <tony.luck@intel.com>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	Len Brown <lenb@kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <linux-edac@vger.kernel.org>,
	<Smita.KoralahalliChannabasappa@amd.com>,
	Qiuxu Zhuo <qiuxu.zhuo@intel.com>, <linux-acpi@vger.kernel.org>,
	Yazen Ghannam <yazen.ghannam@amd.com>
Subject: [PATCH v4 22/22] x86/mce: Save and use APEI corrected threshold limit
Date: Tue, 24 Jun 2025 14:16:17 +0000	[thread overview]
Message-ID: <20250624-wip-mca-updates-v4-22-236dd74f645f@amd.com> (raw)
In-Reply-To: <20250624-wip-mca-updates-v4-0-236dd74f645f@amd.com>

The MCA threshold limit generally is not something that needs to change
during runtime. It is common for a system administrator to decide on a
policy for their managed systems.

If MCA thresholding is OS-managed, then the threshold limit must be set
at every boot. However, many systems allow the user to set a value in
their BIOS. And this is reported through an APEI HEST entry even if
thresholding is not in FW-First mode.

Use this value, if available, to set the OS-managed threshold limit.
Users can still override it through sysfs if desired for testing or
debug.

APEI is parsed after MCE is initialized. So reset the thresholding
blocks later to pick up the threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---

Notes:
    v3->v4:
    * New in v4.

 arch/x86/include/asm/mce.h          |  6 ++++++
 arch/x86/kernel/acpi/apei.c         |  2 ++
 arch/x86/kernel/cpu/mce/amd.c       | 18 ++++++++++++++++--
 arch/x86/kernel/cpu/mce/internal.h  |  2 ++
 arch/x86/kernel/cpu/mce/threshold.c | 13 +++++++++++++
 5 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 7d6588195d56..1cfbfff0be3f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -308,6 +308,12 @@ DECLARE_PER_CPU(struct mce, injectm);
 /* Disable CMCI/polling for MCA bank claimed by firmware */
 extern void mce_disable_bank(int bank);
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void mce_save_apei_thr_limit(u32 thr_limit);
+#else
+static inline void mce_save_apei_thr_limit(u32 thr_limit) { }
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
 /*
  * Exception handler
  */
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index 0916f00a992e..e21419e686eb 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
 	if (!cmc->enabled)
 		return 0;
 
+	mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
+
 	/*
 	 * We expect HEST to provide a list of MC banks that report errors
 	 * in firmware first mode. Otherwise, return non-zero value to
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 85fd9273e90a..9e26b0437dc6 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -488,6 +488,18 @@ static void threshold_restart_bank(unsigned int bank, bool intr_en)
 	}
 }
 
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+	u32 thr_limit = mce_get_apei_thr_limit();
+
+	/* Fallback to old default if APEI limit is not available. */
+	if (!thr_limit)
+		return THRESHOLD_MAX;
+
+	return min(thr_limit, THRESHOLD_MAX);
+}
+
 static void mce_threshold_block_init(struct threshold_block *b, int offset)
 {
 	struct thresh_restart tr = {
@@ -496,7 +508,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
 		.lvt_off		= offset,
 	};
 
-	b->threshold_limit		= THRESHOLD_MAX;
+	b->threshold_limit		= get_thr_limit();
 	threshold_restart_block(&tr);
 };
 
@@ -1079,7 +1091,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
 	b->address		= address;
 	b->interrupt_enable	= 0;
 	b->interrupt_capable	= lvt_interrupt_supported(bank, high);
-	b->threshold_limit	= THRESHOLD_MAX;
+	b->threshold_limit	= get_thr_limit();
 
 	if (b->interrupt_capable) {
 		default_attrs[2] = &interrupt_enable.attr;
@@ -1090,6 +1102,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
 
 	list_add(&b->miscj, &tb->miscj);
 
+	mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
 	err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
 	if (err)
 		goto out_free;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 09ebcf82df93..df98930a32a5 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce);
 void mce_inherit_storm(unsigned int bank);
 bool mce_get_storm_mode(void);
 void mce_set_storm_mode(bool storm);
+u32  mce_get_apei_thr_limit(void);
 #else
 static inline void cmci_storm_begin(unsigned int bank) {}
 static inline void cmci_storm_end(unsigned int bank) {}
@@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {}
 static inline void mce_inherit_storm(unsigned int bank) {}
 static inline bool mce_get_storm_mode(void) { return false; }
 static inline void mce_set_storm_mode(bool storm) {}
+static inline u32  mce_get_apei_thr_limit(void) { return 0; }
 #endif
 
 /*
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 45144598ec74..d00d5bf9959d 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -13,6 +13,19 @@
 
 #include "internal.h"
 
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+	mce_apei_thr_limit = thr_limit;
+	pr_info("HEST: Corrected error threshold limit = %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+	return mce_apei_thr_limit;
+}
+
 static void default_threshold_interrupt(void)
 {
 	pr_err("Unexpected threshold interrupt at vector %x\n",

-- 
2.49.0


      parent reply	other threads:[~2025-06-24 14:16 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-24 14:15 [PATCH v4 00/22] AMD MCA interrupts rework Yazen Ghannam
2025-06-24 14:15 ` [PATCH v4 01/22] x86/mce: Don't remove sysfs if thresholding sysfs init fails Yazen Ghannam
2025-06-28 11:33   ` [tip: ras/urgent] " tip-bot2 for Yazen Ghannam
2025-06-24 14:15 ` [PATCH v4 02/22] x86/mce: Restore poll settings after storm subsides Yazen Ghannam
2025-06-25 13:22   ` Nikolay Borisov
2025-06-28 11:33   ` [tip: ras/urgent] x86/mce: Ensure user polling settings are honored when restarting timer tip-bot2 for Yazen Ghannam
2025-06-24 14:15 ` [PATCH v4 03/22] x86/mce/amd: Add default names for MCA banks and blocks Yazen Ghannam
2025-06-28 11:33   ` [tip: ras/urgent] " tip-bot2 for Yazen Ghannam
2025-06-24 14:15 ` [PATCH v4 04/22] x86/mce/amd: Fix threshold limit reset Yazen Ghannam
2025-06-28 11:33   ` [tip: ras/urgent] " tip-bot2 for Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 05/22] x86/mce/amd: Rename threshold restart function Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 06/22] x86/mce/amd: Remove return value for mce_threshold_{create,remove}_device() Yazen Ghannam
2025-06-25 14:57   ` Nikolay Borisov
2025-06-24 14:16 ` [PATCH v4 07/22] x86/mce/amd: Remove smca_banks_map Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 08/22] x86/mce/amd: Put list_head in threshold_bank Yazen Ghannam
2025-06-25 16:52   ` Nikolay Borisov
2025-06-27 11:14     ` Nikolay Borisov
2025-06-30 12:57       ` Yazen Ghannam
2025-08-25 13:59     ` Borislav Petkov
2025-06-24 14:16 ` [PATCH v4 09/22] x86/mce: Cleanup bank processing on init Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 10/22] x86/mce: Remove __mcheck_cpu_init_early() Yazen Ghannam
2025-06-26  8:03   ` Nikolay Borisov
2025-06-30 12:58     ` Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 11/22] x86/mce: Define BSP-only init Yazen Ghannam
2025-06-25 11:04   ` Nikolay Borisov
2025-06-25 11:26     ` Borislav Petkov
2025-06-24 14:16 ` [PATCH v4 12/22] x86/mce: Define BSP-only SMCA init Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 13/22] x86/mce: Do 'UNKNOWN' vendor check early Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 14/22] x86/mce: Separate global and per-CPU quirks Yazen Ghannam
2025-06-27 11:02   ` Nikolay Borisov
2025-06-30 13:00     ` Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 15/22] x86/mce: Move machine_check_poll() status checks to helper functions Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 16/22] x86/mce: Unify AMD THR handler with MCA Polling Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 17/22] x86/mce: Unify AMD DFR " Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 18/22] x86/mce/amd: Support SMCA Corrected Error Interrupt Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 19/22] x86/mce/amd: Remove redundant reset_block() Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 20/22] x86/mce/amd: Define threshold restart function for banks Yazen Ghannam
2025-06-24 14:16 ` [PATCH v4 21/22] x86/mce: Handle AMD threshold interrupt storms Yazen Ghannam
2025-06-24 14:16 ` Yazen Ghannam [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250624-wip-mca-updates-v4-22-236dd74f645f@amd.com \
    --to=yazen.ghannam@amd.com \
    --cc=Smita.KoralahalliChannabasappa@amd.com \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=qiuxu.zhuo@intel.com \
    --cc=rafael@kernel.org \
    --cc=tony.luck@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).