From: Yazen Ghannam <yazen.ghannam@amd.com>
To: <x86@kernel.org>, Tony Luck <tony.luck@intel.com>
Cc: <linux-kernel@vger.kernel.org>, <linux-edac@vger.kernel.org>,
<Smita.KoralahalliChannabasappa@amd.com>,
Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Subject: [PATCH v3 12/17] x86/mce: Unify AMD THR handler with MCA Polling
Date: Tue, 15 Apr 2025 14:55:07 +0000 [thread overview]
Message-ID: <20250415-wip-mca-updates-v3-12-8ffd9eb4aa56@amd.com> (raw)
In-Reply-To: <20250415-wip-mca-updates-v3-0-8ffd9eb4aa56@amd.com>
AMD systems optionally support an MCA thresholding interrupt. The
interrupt should be used as another signal to trigger MCA polling. This
is similar to how the Intel Corrected Machine Check interrupt (CMCI) is
handled.
AMD MCA thresholding is managed using the MCA_MISC registers within an
MCA bank. The OS will need to modify the hardware error count field in
order to reset the threshold limit and rearm the interrupt. Management
of the MCA_MISC register should be done as a follow-up to the basic MCA
polling flow. It should not be the main focus of the interrupt handler.
Furthermore, future systems will have the ability to send an MCA
thresholding interrupt to the OS even when the OS does not manage the
feature, i.e. when the MCA_MISC registers are Read-as-Zero/Locked.
Call the common MCA polling function when handling the MCA thresholding
interrupt. This will allow the OS to find any valid errors whether or
not the MCA thresholding feature is OS-managed. Also, this allows the
common MCA polling options and kernel parameters to apply to AMD
systems.
Add a callback to the MCA polling function to check and reset any
threshold blocks that have reached their threshold limit.
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Notes:
Link:
https://lore.kernel.org/r/20250213-wip-mca-updates-v2-12-3636547fe05f@amd.com
v2->v3:
* Add tags from Qiuxu and Tony.
v1->v2:
* Start collecting per-CPU items in a struct.
* Keep and use mce_flags.amd_threshold.
arch/x86/kernel/cpu/mce/amd.c | 49 ++++++++++++++++----------------------
arch/x86/kernel/cpu/mce/core.c | 3 +++
arch/x86/kernel/cpu/mce/internal.h | 2 ++
3 files changed, 26 insertions(+), 28 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 6a69cac36c18..f8755a21fd48 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -54,6 +54,12 @@
static bool thresholding_irq_en;
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -559,6 +565,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
if (!b.interrupt_capable)
goto done;
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
b.interrupt_enable = 1;
if (!mce_flags.smca) {
@@ -898,12 +905,7 @@ static void amd_deferred_error_interrupt(void)
log_error_deferred(bank);
}
-static void log_error_thresholding(unsigned int bank, u64 misc)
-{
- _log_error_deferred(bank, misc);
-}
-
-static void log_and_reset_block(struct threshold_block *block)
+static void reset_block(struct threshold_block *block)
{
struct thresh_restart tr;
u32 low = 0, high = 0;
@@ -917,23 +919,14 @@ static void log_and_reset_block(struct threshold_block *block)
if (!(high & MASK_OVERFLOW_HI))
return;
- /* Log the MCE which caused the threshold event. */
- log_error_thresholding(block->bank, ((u64)high << 32) | low);
-
- /* Reset threshold block after logging error. */
memset(&tr, 0, sizeof(tr));
tr.b = block;
threshold_restart_bank(&tr);
}
-/*
- * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
- * goes off when error_count reaches threshold_limit.
- */
-static void amd_threshold_interrupt(void)
+void amd_reset_thr_limit(unsigned int bank)
{
- struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
- unsigned int bank, cpu = smp_processor_id();
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
struct threshold_block *block, *tmp;
/*
@@ -941,20 +934,20 @@ static void amd_threshold_interrupt(void)
* handler is installed at boot time, but on a hotplug event the
* interrupt might fire before the data has been initialized.
*/
- if (!bp)
+ if (!bp || !bp[bank])
return;
- for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
- continue;
-
- thr_bank = bp[bank];
- if (!thr_bank)
- continue;
+ list_for_each_entry_safe(block, tmp, &bp[bank]->miscj, miscj)
+ reset_block(block);
+}
- list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
- log_and_reset_block(block);
- }
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
}
/*
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c82c9e435066..de85b014653f 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -831,6 +831,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_log(&err);
clear_it:
+ if (mce_flags.amd_threshold)
+ amd_reset_thr_limit(i);
+
/*
* Clear state for this bank.
*/
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 87b69935d57d..aeb0a998f553 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -269,6 +269,7 @@ void mce_threshold_create_device(unsigned int cpu);
void mce_threshold_remove_device(unsigned int cpu);
extern bool amd_filter_mce(struct mce *m);
bool amd_mce_usable_address(struct mce *m);
+void amd_reset_thr_limit(unsigned int bank);
/*
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -300,6 +301,7 @@ static inline void mce_threshold_create_device(unsigned int cpu) { }
static inline void mce_threshold_remove_device(unsigned int cpu) { }
static inline bool amd_filter_mce(struct mce *m) { return false; }
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void amd_reset_thr_limit(unsigned int bank) { }
static inline void smca_extract_err_addr(struct mce *m) { }
static inline void mce_smca_cpu_init(void) {}
#endif
--
2.49.0
next prev parent reply other threads:[~2025-04-15 14:55 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-15 14:54 [PATCH v3 00/17] AMD MCA interrupts rework Yazen Ghannam
2025-04-15 14:54 ` [PATCH v3 01/17] x86/mce: Don't remove sysfs if thresholding sysfs init fails Yazen Ghannam
2025-04-15 14:54 ` [PATCH v3 02/17] x86/mce/amd: Remove return value for mce_threshold_{create,remove}_device() Yazen Ghannam
2025-04-15 14:54 ` [PATCH v3 03/17] x86/mce/amd: Remove smca_banks_map Yazen Ghannam
2025-04-15 14:54 ` [PATCH v3 04/17] x86/mce/amd: Put list_head in threshold_bank Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 05/17] x86/mce: Cleanup bank processing on init Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 06/17] x86/mce: Remove __mcheck_cpu_init_early() Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 07/17] x86/mce: Define BSP-only init Yazen Ghannam
2025-04-17 2:18 ` Borislav Petkov
2025-05-01 17:07 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 08/17] x86/mce: Define BSP-only SMCA init Yazen Ghannam
2025-04-17 9:52 ` Borislav Petkov
2025-05-01 17:12 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 09/17] x86/mce: Do 'UNKNOWN' vendor check early Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 10/17] x86/mce: Separate global and per-CPU quirks Yazen Ghannam
2025-04-17 12:16 ` Borislav Petkov
2025-05-01 17:23 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 11/17] x86/mce: Move machine_check_poll() status checks to helper functions Yazen Ghannam
2025-04-15 14:55 ` Yazen Ghannam [this message]
2025-04-15 14:55 ` [PATCH v3 13/17] x86/mce: Unify AMD DFR handler with MCA Polling Yazen Ghannam
2025-05-07 9:20 ` Borislav Petkov
2025-05-08 15:37 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 14/17] x86/mce/amd: Enable interrupt vectors once per-CPU on SMCA systems Yazen Ghannam
2025-05-07 19:35 ` Borislav Petkov
2025-05-08 15:53 ` Yazen Ghannam
2025-05-09 14:08 ` Borislav Petkov
2025-05-12 15:34 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 15/17] x86/mce/amd: Support SMCA Corrected Error Interrupt Yazen Ghannam
2025-05-09 19:37 ` Borislav Petkov
2025-05-12 15:35 ` Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 16/17] x86/mce: Handle AMD threshold interrupt storms Yazen Ghannam
2025-04-15 14:55 ` [PATCH v3 17/17] x86/mce: Restore poll settings after storm subsides Yazen Ghannam
2025-05-12 7:46 ` Borislav Petkov
2025-05-12 15:43 ` Yazen Ghannam
2025-05-12 15:53 ` Luck, Tony
2025-05-13 17:44 ` Yazen Ghannam
2025-05-13 17:55 ` Borislav Petkov
2025-05-13 21:06 ` Yazen Ghannam
2025-05-13 22:07 ` Luck, Tony
2025-05-14 14:34 ` Yazen Ghannam
2025-05-15 12:37 ` Borislav Petkov
2025-05-15 15:47 ` Yazen Ghannam
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250415-wip-mca-updates-v3-12-8ffd9eb4aa56@amd.com \
--to=yazen.ghannam@amd.com \
--cc=Smita.KoralahalliChannabasappa@amd.com \
--cc=linux-edac@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=qiuxu.zhuo@intel.com \
--cc=tony.luck@intel.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.