* [PATCH] MCE: Implement memory page offlining for AMD
@ 2012-10-05 13:55 Christoph Egger
0 siblings, 0 replies; only message in thread
From: Christoph Egger @ 2012-10-05 13:55 UTC (permalink / raw)
To: xen-devel@lists.xen.org
[-- Attachment #1: Type: text/plain, Size: 351 bytes --]
Implement memory page offlining for AMD
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
[-- Attachment #2: xen_mce_pageoffline.diff --]
[-- Type: text/plain, Size: 14174 bytes --]
# User Christoph Egger
# Date 1349440371 -7200
Implement page offline recovery action for AMD
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile
+++ b/xen/arch/x86/cpu/mcheck/Makefile
@@ -3,6 +3,7 @@ obj-y += k7.o
obj-y += amd_k8.o
obj-y += amd_f10.o
obj-y += mce_amd.o
+obj-y += mcaction.o
obj-y += barrier.o
obj-y += mctelem.o
obj-y += mce.o
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -44,6 +44,7 @@
#include "mce_quirks.h"
#include "x86_mca.h"
#include "mce_amd.h"
+#include "mcaction.h"
static struct mcinfo_extended *
amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
@@ -97,6 +98,7 @@ enum mcheck_type amd_f10_mcheck_init(str
x86_mce_callback_register(amd_f10_handler);
mce_recoverable_register(mc_amd_recoverable_scan);
+ mce_register_addrcheck(mc_amd_addrcheck);
return mcheck_amd_famXX;
}
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mcaction.c
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mcaction.c
@@ -0,0 +1,139 @@
+#include <xen/types.h>
+#include <xen/sched.h>
+#include "mcaction.h"
+#include "vmce.h"
+#include "mce.h"
+
+/* Append a page-offline recovery record for @bank to the telemetry @mi.
+ * Returns the record, or NULL if @mi is NULL or x86_mcinfo_reserve()
+ * fails (in which case @mi is flagged MCINFO_FLAGS_UNCOMPLETE). */
+static struct mcinfo_recovery *
+mci_action_add_pageoffline(int bank, struct mc_info *mi,
+                           uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *entry;
+
+    if (mi == NULL)
+        return NULL;
+    entry = x86_mcinfo_reserve(mi, sizeof(*entry));
+    if (entry == NULL) {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+    memset(entry, 0, sizeof(*entry));
+    entry->common.type = MC_TYPE_RECOVERY;
+    entry->common.size = sizeof(*entry);
+    entry->mc_bank = bank;
+    entry->action_types = MC_ACTION_PAGE_OFFLINE;
+    entry->action_info.page_retire.mfn = mfn;
+    entry->action_info.page_retire.status = status;
+    return entry;
+}
+
+mce_check_addr_t mc_check_addr = NULL; /* unset until registration below */
+/* Install cbfunc; note mca_init_bank() calls it with no NULL check. */
+void mce_register_addrcheck(mce_check_addr_t cbfunc)
+{
+ mc_check_addr = cbfunc;
+}
+
+/*
+ * Shared handler for memory errors that carry a physical address: offline
+ * the page, log the action in binfo->mi and, if a domain other than
+ * DOMID_XEN owns the page, unmap it there and inject a vMCE.  *result is
+ * left untouched on the early-return paths.  regs is unused.
+ */
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+                   enum mce_result *result,
+                   struct cpu_user_regs *regs)
+{
+    struct mcinfo_bank *bank = binfo->mib;
+    struct mcinfo_global *global = binfo->mig;
+    struct domain *d;
+    unsigned long mfn, gfn;
+    uint32_t status;
+    uint16_t vmce_vcpuid;
+
+    /* mc_check_addr stays NULL until a vendor registers a callback. */
+    if (!mc_check_addr ||
+        !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL)) {
+        dprintk(XENLOG_WARNING,
+                "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status)) {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);
+    if (status & PG_OFFLINE_OFFLINED) {
+        /* This is free page */
+        *result = MCER_RECOVERED;
+        return;
+    }
+    if (status & PG_OFFLINE_AGAIN) {
+        *result = MCER_CONTINUE;
+        return;
+    }
+    /* Beyond this point only a pending offline of an owned page matters. */
+    if (!(status & PG_OFFLINE_PENDING) || !(status & PG_OFFLINE_OWNED))
+        return;
+
+    bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
+    mce_printk(MCE_QUIET, "MCE: This error page is owned"
+               " by DOM %d\n", bank->mc_domid);
+    /* XXX: Cannot handle shared pages yet (this should identify all
+     * domains and gfn mapping to the mfn in question) */
+    BUG_ON(bank->mc_domid == DOMID_COW);
+    if (bank->mc_domid == DOMID_XEN)
+        return;
+    d = get_domain_by_id(bank->mc_domid);
+    if (d == NULL) {
+        /* ASSERT() is a no-op in non-debug builds: check explicitly. */
+        mce_printk(MCE_QUIET, "MCE: DOM %d not found\n", bank->mc_domid);
+        return;
+    }
+    gfn = get_gpfn_from_mfn(mfn);
+    if (!is_vmce_ready(bank, d)) {
+        printk("DOM%d not ready for vMCE\n", d->domain_id);
+        goto vmce_failed;
+    }
+    if (unmmap_broken_page(d, _mfn(mfn), gfn)) {
+        printk("Unmap broken memory %lx for DOM%d failed\n",
+               mfn, d->domain_id);
+        goto vmce_failed;
+    }
+
+    bank->mc_addr = gfn << PAGE_SHIFT |
+                    (bank->mc_addr & (PAGE_SIZE - 1));
+    if (fill_vmsr_data(bank, d, global->mc_gstatus) == -1) {
+        mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                   "failed\n", bank->mc_domid);
+        goto vmce_failed;
+    }
+
+    vmce_vcpuid = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+                  ? VMCE_INJECT_BROADCAST : global->mc_vcpuid;
+    /* We will inject vMCE to DOMU */
+    if (inject_vmce(d, vmce_vcpuid) < 0) {
+        mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                   " failed\n", d->domain_id);
+        goto vmce_failed;
+    }
+
+    /* The impacted domain carries on with its own recovery if it has an
+     * MCA handler; Xen has contained the error and is done. */
+    *result = MCER_RECOVERED;
+    put_domain(d);
+    return;
+
+vmce_failed:
+    /* Crash while the reference is still held, then drop it. */
+    domain_crash(d);
+    put_domain(d);
+}
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mcaction.h
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mcaction.h
@@ -0,0 +1,20 @@
+#ifndef _MCHECK_ACTION_H
+#define _MCHECK_ACTION_H
+
+#include <xen/types.h>
+#include "x86_mca.h"
+/* Offline the page hit by a memory error and notify its owning domain. */
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+ enum mce_result *result,
+ struct cpu_user_regs *regs);
+/* Values for the addr_type argument of an mce_check_addr_t callback. */
+#define MC_ADDR_PHYSICAL 0
+#define MC_ADDR_VIRTUAL 1
+/* Returns non-zero if the bank's logged address is of kind addr_type. */
+typedef int (*mce_check_addr_t)(uint64_t status, uint64_t misc, int addr_type);
+extern void mce_register_addrcheck(mce_check_addr_t);
+/* Current callback; NULL until mce_register_addrcheck() is called. */
+extern mce_check_addr_t mc_check_addr;
+
+#endif
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -24,6 +24,7 @@
#include "mce.h"
#include "barrier.h"
+#include "mcaction.h"
#include "util.h"
#include "vmce.h"
@@ -216,7 +217,7 @@ static void mca_init_bank(enum mca_sourc
if ((mib->mc_status & MCi_STATUS_MISCV) &&
(mib->mc_status & MCi_STATUS_ADDRV) &&
- ((mib->mc_misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD) &&
+ (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
(who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
(mfn_valid(paddr_to_pfn(mib->mc_addr))))
{
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_amd.c
--- a/xen/arch/x86/cpu/mcheck/mce_amd.c
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.c
@@ -25,6 +25,7 @@
#include "mce.h"
#include "x86_mca.h"
#include "mce_amd.h"
+#include "mcaction.h"
/* Error Code Types */
enum mc_ec_type {
@@ -75,3 +76,25 @@ mc_amd_recoverable_scan(uint64_t status)
return ret;
}
+
+/* Tell whether the address logged with @status is of kind @addrtype
+ * (MC_ADDR_PHYSICAL or MC_ADDR_VIRTUAL), based on the error code type
+ * decoded from @status.  @misc is not consulted. */
+int
+mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype)
+{
+    uint16_t code = status & (MCi_STATUS_MCA | MCi_STATUS_MSEC);
+    enum mc_ec_type kind = mc_ec2type(code);
+
+    /* Bus and memory errors: value in addr MSR is physical. */
+    if (kind == MC_EC_BUS_TYPE || kind == MC_EC_MEM_TYPE)
+        return (addrtype == MC_ADDR_PHYSICAL);
+
+    /* TLB errors: value in addr MSR is virtual. */
+    if (kind == MC_EC_TLB_TYPE)
+        return (addrtype == MC_ADDR_VIRTUAL);
+
+    /* unreached */
+    BUG();
+    return 0;
+}
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_amd.h
--- a/xen/arch/x86/cpu/mcheck/mce_amd.h
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.h
@@ -2,5 +2,6 @@
#define _MCHECK_AMD_H
int mc_amd_recoverable_scan(uint64_t status);
+int mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype);
#endif
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -19,6 +19,7 @@
#include "barrier.h"
#include "util.h"
#include "vmce.h"
+#include "mcaction.h"
DEFINE_PER_CPU(struct mca_banks *, mce_banks_owned);
DEFINE_PER_CPU(struct mca_banks *, no_cmci_banks);
@@ -257,130 +258,13 @@ static enum intel_mce_type intel_check_m
return intel_mce_fatal;
}
-struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
- uint64_t mfn, uint32_t status)
-{
- struct mcinfo_recovery *rec;
-
- if (!mi)
- return NULL;
-
- rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
- if (!rec)
- {
- mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
- return NULL;
- }
-
- memset(rec, 0, sizeof(struct mcinfo_recovery));
-
- rec->mc_bank = bank;
- rec->action_types = MC_ACTION_PAGE_OFFLINE;
- rec->action_info.page_retire.mfn = mfn;
- rec->action_info.page_retire.status = status;
- return rec;
-}
-
static void intel_memerr_dhandler(
struct mca_binfo *binfo,
enum mce_result *result,
struct cpu_user_regs *regs)
{
- struct mcinfo_bank *bank = binfo->mib;
- struct mcinfo_global *global = binfo->mig;
- struct domain *d;
- unsigned long mfn, gfn;
- uint32_t status;
- uint64_t mc_status, mc_misc;
-
mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-
- mc_status = bank->mc_status;
- mc_misc = bank->mc_misc;
- if (!(mc_status & MCi_STATUS_ADDRV) ||
- !(mc_status & MCi_STATUS_MISCV) ||
- ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
- {
- dprintk(XENLOG_WARNING,
- "No physical address provided for memory error\n");
- return;
- }
-
- mfn = bank->mc_addr >> PAGE_SHIFT;
- if (offline_page(mfn, 1, &status))
- {
- dprintk(XENLOG_WARNING,
- "Failed to offline page %lx for MCE error\n", mfn);
- return;
- }
-
- mci_add_pageoff_action(binfo->bank, binfo->mi, mfn, status);
-
- /* This is free page */
- if (status & PG_OFFLINE_OFFLINED)
- *result = MCER_RECOVERED;
- else if (status & PG_OFFLINE_AGAIN)
- *result = MCER_CONTINUE;
- else if (status & PG_OFFLINE_PENDING) {
- /* This page has owner */
- if (status & PG_OFFLINE_OWNED) {
- bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
- mce_printk(MCE_QUIET, "MCE: This error page is ownded"
- " by DOM %d\n", bank->mc_domid);
- /* XXX: Cannot handle shared pages yet
- * (this should identify all domains and gfn mapping to
- * the mfn in question) */
- BUG_ON( bank->mc_domid == DOMID_COW );
- if ( bank->mc_domid != DOMID_XEN ) {
- d = get_domain_by_id(bank->mc_domid);
- ASSERT(d);
- gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-
- if ( !is_vmce_ready(bank, d) )
- {
- printk("DOM%d not ready for vMCE\n", d->domain_id);
- goto vmce_failed;
- }
-
- if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
- {
- printk("Unmap broken memory %lx for DOM%d failed\n",
- mfn, d->domain_id);
- goto vmce_failed;
- }
-
- bank->mc_addr = gfn << PAGE_SHIFT |
- (bank->mc_addr & (PAGE_SIZE -1 ));
- if ( fill_vmsr_data(bank, d,
- global->mc_gstatus) == -1 )
- {
- mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
- "failed\n", bank->mc_domid);
- goto vmce_failed;
- }
-
- /* We will inject vMCE to DOMU*/
- if ( inject_vmce(d, VMCE_INJECT_BROADCAST) < 0 )
- {
- mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
- " failed\n", d->domain_id);
- goto vmce_failed;
- }
- /* Impacted domain go on with domain's recovery job
- * if the domain has its own MCA handler.
- * For xen, it has contained the error and finished
- * its own recovery job.
- */
- *result = MCER_RECOVERED;
- put_domain(d);
-
- return;
-vmce_failed:
- put_domain(d);
- domain_crash(d);
- }
- }
- }
+ mc_memerr_dhandler(binfo, result, regs);
}
static int intel_srar_check(uint64_t status)
@@ -388,6 +272,19 @@ static int intel_srar_check(uint64_t sta
return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
}
+/* Non-zero iff the logged address is of kind addrtype: physical when
+ * ADDRV and MISCV are set and MISC says PHYSMOD, "virtual" otherwise. */
+static int intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
+{
+    int is_phys = (status & MCi_STATUS_ADDRV) &&
+                  (status & MCi_STATUS_MISCV) &&
+                  ((misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD);
+
+    if (is_phys)
+        return (addrtype == MC_ADDR_PHYSICAL);
+    return (addrtype == MC_ADDR_VIRTUAL);
+}
+
static void intel_srar_dhandler(
struct mca_binfo *binfo,
enum mce_result *result,
@@ -882,6 +779,7 @@ static void intel_init_mce(void)
x86_mce_vector_register(intel_machine_check);
mce_recoverable_register(intel_recoverable_scan);
mce_need_clearbank_register(intel_need_clearbank_scan);
+ mce_register_addrcheck(intel_checkaddr);
mce_dhandlers = intel_mce_dhandlers;
mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2012-10-05 13:55 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-05 13:55 [PATCH] MCE: Implement memory page offlining for AMD Christoph Egger
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.