All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Egger <Christoph.Egger@amd.com>
To: "xen-devel@lists.xen.org" <xen-devel@lists.xen.org>
Subject: [PATCH] MCE: Implement memory page offlining for AMD
Date: Fri, 5 Oct 2012 15:55:05 +0200	[thread overview]
Message-ID: <506EE6B9.2070205@amd.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 351 bytes --]


Implement memory page offlining for AMD

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>


-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

[-- Attachment #2: xen_mce_pageoffline.diff --]
[-- Type: text/plain, Size: 14174 bytes --]

# User Christoph Egger
# Date 1349440371 -7200
Implement page offline recovery action for AMD

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>

diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile
+++ b/xen/arch/x86/cpu/mcheck/Makefile
@@ -3,6 +3,7 @@ obj-y += k7.o
 obj-y += amd_k8.o
 obj-y += amd_f10.o
 obj-y += mce_amd.o
+obj-y += mcaction.o
 obj-y += barrier.o
 obj-y += mctelem.o
 obj-y += mce.o
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -44,6 +44,7 @@
 #include "mce_quirks.h"
 #include "x86_mca.h"
 #include "mce_amd.h"
+#include "mcaction.h"
 
 static struct mcinfo_extended *
 amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
@@ -97,6 +98,7 @@ enum mcheck_type amd_f10_mcheck_init(str
 
 	x86_mce_callback_register(amd_f10_handler);
 	mce_recoverable_register(mc_amd_recoverable_scan);
+	mce_register_addrcheck(mc_amd_addrcheck);
 
 	return mcheck_amd_famXX;
 }
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mcaction.c
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mcaction.c
@@ -0,0 +1,172 @@
+#include <xen/types.h>
+#include <xen/sched.h>
+#include "mcaction.h"
+#include "vmce.h"
+#include "mce.h"
+
+/*
+ * Append a page-offline recovery record (bank, mfn, offline status) to the
+ * telemetry in mi.  Returns the new record, or NULL when mi is NULL or
+ * x86_mcinfo_reserve() fails; in the latter case mi is flagged with
+ * MCINFO_FLAGS_UNCOMPLETE.
+ */
+static struct mcinfo_recovery *
+mci_action_add_pageoffline(int bank, struct mc_info *mi,
+                       uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec) {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->common.type = MC_TYPE_RECOVERY;
+    rec->common.size = sizeof(*rec);
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
+}
+
+/*
+ * Fallback address check used until a vendor registers its own: it never
+ * claims a valid address, so callers skip address-based recovery instead of
+ * calling through a NULL pointer.
+ */
+static int mc_addrcheck_none(uint64_t status, uint64_t misc, int addr_type)
+{
+    return 0;
+}
+
+mce_check_addr_t mc_check_addr = mc_addrcheck_none;
+
+/* Install the vendor address check; passing NULL restores the fallback. */
+void mce_register_addrcheck(mce_check_addr_t cbfunc)
+{
+    mc_check_addr = cbfunc ? cbfunc : mc_addrcheck_none;
+}
+
+/*
+ * Memory-error recovery: offline the page named by the bank's address and,
+ * when the page is owned by a domain other than Xen, unmap it from that
+ * domain and inject a vMCE.  *result is set to MCER_RECOVERED or
+ * MCER_CONTINUE on the paths that handle the error and is left untouched
+ * otherwise.  regs is not used.
+ */
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+                   enum mce_result *result,
+                   struct cpu_user_regs *regs)
+{
+    struct mcinfo_bank *bank = binfo->mib;
+    struct mcinfo_global *global = binfo->mig;
+    struct domain *d;
+    unsigned long mfn, gfn;
+    uint32_t status;
+    uint16_t vmce_vcpuid;
+
+    if (!mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL)) {
+        dprintk(XENLOG_WARNING,
+            "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status))
+    {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);
+
+    /* This is free page */
+    if (status & PG_OFFLINE_OFFLINED)
+        *result = MCER_RECOVERED;
+    else if (status & PG_OFFLINE_AGAIN)
+        *result = MCER_CONTINUE;
+    else if (status & PG_OFFLINE_PENDING) {
+        /* This page has owner */
+        if (status & PG_OFFLINE_OWNED) {
+            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
+            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
+              " by DOM %d\n", bank->mc_domid);
+            /* XXX: Cannot handle shared pages yet
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON( bank->mc_domid == DOMID_COW );
+            if ( bank->mc_domid != DOMID_XEN ) {
+                d = get_domain_by_id(bank->mc_domid);
+                /* ASSERT() is a no-op in non-debug builds, so a missing
+                 * owner must be rejected explicitly before d is used. */
+                if ( !d )
+                {
+                    mce_printk(MCE_QUIET, "MCE: DOM%d not found\n",
+                      bank->mc_domid);
+                    return;
+                }
+                gfn = get_gpfn_from_mfn(mfn);
+
+                if ( !is_vmce_ready(bank, d) )
+                {
+                    printk("DOM%d not ready for vMCE\n", d->domain_id);
+                    goto vmce_failed;
+                }
+
+                if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
+                {
+                    printk("Unmap broken memory %lx for DOM%d failed\n",
+                            mfn, d->domain_id);
+                    goto vmce_failed;
+                }
+
+                /* Rewrite the address as gfn plus the in-page offset. */
+                bank->mc_addr = gfn << PAGE_SHIFT |
+                  (bank->mc_addr & (PAGE_SIZE -1 ));
+                if ( fill_vmsr_data(bank, d,
+                      global->mc_gstatus) == -1 )
+                {
+                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                      "failed\n", bank->mc_domid);
+                    goto vmce_failed;
+                }
+
+                if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+                    vmce_vcpuid = VMCE_INJECT_BROADCAST;
+                else
+                    vmce_vcpuid = global->mc_vcpuid;
+
+                /* We will inject vMCE to DOMU*/
+                if ( inject_vmce(d, vmce_vcpuid) < 0 )
+                {
+                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                      " failed\n", d->domain_id);
+                    goto vmce_failed;
+                }
+
+                /* Impacted domain go on with domain's recovery job
+                 * if the domain has its own MCA handler.
+                 * For xen, it has contained the error and finished
+                 * its own recovery job.
+                 */
+                *result = MCER_RECOVERED;
+                put_domain(d);
+
+                return;
+vmce_failed:
+                /* Crash the domain while our reference is still held. */
+                domain_crash(d);
+                put_domain(d);
+            }
+        }
+    }
+}
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mcaction.h
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mcaction.h
@@ -0,0 +1,20 @@
+#ifndef _MCHECK_ACTION_H
+#define _MCHECK_ACTION_H
+
+#include <xen/types.h>
+#include "x86_mca.h"
+/* Memory-error recovery handler; defined in mcaction.c. */
+void
+mc_memerr_dhandler(struct mca_binfo *binfo,
+                   enum mce_result *result,
+                   struct cpu_user_regs *regs);
+/* Address kinds passed as addr_type to an mce_check_addr_t callback. */
+#define MC_ADDR_PHYSICAL  0
+#define MC_ADDR_VIRTUAL   1
+/* Callback: nonzero if status/misc describe an address of kind addr_type. */
+typedef int (*mce_check_addr_t)(uint64_t status, uint64_t misc, int addr_type);
+extern void mce_register_addrcheck(mce_check_addr_t); /* sets mc_check_addr */
+/* Address-check callback currently in use. */
+extern mce_check_addr_t mc_check_addr;
+
+#endif /* _MCHECK_ACTION_H */
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -24,6 +24,7 @@
 
 #include "mce.h"
 #include "barrier.h"
+#include "mcaction.h"
 #include "util.h"
 #include "vmce.h"
 
@@ -216,7 +217,7 @@ static void mca_init_bank(enum mca_sourc
 
     if ((mib->mc_status & MCi_STATUS_MISCV) &&
         (mib->mc_status & MCi_STATUS_ADDRV) &&
-        ((mib->mc_misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD) && 
+        (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
         (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
         (mfn_valid(paddr_to_pfn(mib->mc_addr))))
     {
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_amd.c
--- a/xen/arch/x86/cpu/mcheck/mce_amd.c
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.c
@@ -25,6 +25,7 @@
 #include "mce.h"
 #include "x86_mca.h"
 #include "mce_amd.h"
+#include "mcaction.h"
 
 /* Error Code Types */
 enum mc_ec_type {
@@ -75,3 +76,25 @@ mc_amd_recoverable_scan(uint64_t status)
 
     return ret;
 }
+
+/* Tell whether the bank's address MSR holds a valid addrtype address. */
+int
+mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype)
+{
+    uint16_t errorcode = status & (MCi_STATUS_MCA | MCi_STATUS_MSEC);
+
+    /* The address MSR is meaningless unless ADDRV is set; misc is unused. */
+    if (!(status & MCi_STATUS_ADDRV))
+        return 0;
+
+    switch (mc_ec2type(errorcode)) {
+    case MC_EC_BUS_TYPE: /* value in addr MSR is physical */
+    case MC_EC_MEM_TYPE: /* value in addr MSR is physical */
+        return (addrtype == MC_ADDR_PHYSICAL);
+    case MC_EC_TLB_TYPE: /* value in addr MSR is virtual */
+        return (addrtype == MC_ADDR_VIRTUAL);
+    }
+
+    BUG(); /* unreached */
+    return 0;
+}
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_amd.h
--- a/xen/arch/x86/cpu/mcheck/mce_amd.h
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.h
@@ -2,5 +2,6 @@
 #define _MCHECK_AMD_H
 
 int mc_amd_recoverable_scan(uint64_t status);
+int mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype);
 
 #endif
diff -r ee2d4b68aa2b -r 13daa0d9bb59 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -19,6 +19,7 @@
 #include "barrier.h"
 #include "util.h"
 #include "vmce.h"
+#include "mcaction.h"
 
 DEFINE_PER_CPU(struct mca_banks *, mce_banks_owned);
 DEFINE_PER_CPU(struct mca_banks *, no_cmci_banks);
@@ -257,130 +258,13 @@ static enum intel_mce_type intel_check_m
     return intel_mce_fatal;
 }
 
-struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
-                              uint64_t mfn, uint32_t status)
-{
-    struct mcinfo_recovery *rec;
-
-    if (!mi)
-        return NULL;
-
-    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
-    if (!rec)
-    {
-        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
-        return NULL;
-    }
-
-    memset(rec, 0, sizeof(struct mcinfo_recovery));
-
-    rec->mc_bank = bank;
-    rec->action_types = MC_ACTION_PAGE_OFFLINE;
-    rec->action_info.page_retire.mfn = mfn;
-    rec->action_info.page_retire.status = status;
-    return rec;
-}
-
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
              enum mce_result *result,
              struct cpu_user_regs *regs)
 {
-    struct mcinfo_bank *bank = binfo->mib;
-    struct mcinfo_global *global = binfo->mig;
-    struct domain *d;
-    unsigned long mfn, gfn;
-    uint32_t status;
-    uint64_t mc_status, mc_misc;
-
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-
-    mc_status = bank->mc_status;
-    mc_misc = bank->mc_misc;
-    if (!(mc_status &  MCi_STATUS_ADDRV) ||
-        !(mc_status & MCi_STATUS_MISCV) ||
-        ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
-    {
-        dprintk(XENLOG_WARNING,
-            "No physical address provided for memory error\n");
-        return;
-    }
-
-    mfn = bank->mc_addr >> PAGE_SHIFT;
-    if (offline_page(mfn, 1, &status))
-    {
-        dprintk(XENLOG_WARNING,
-                "Failed to offline page %lx for MCE error\n", mfn);
-        return;
-    }
-
-    mci_add_pageoff_action(binfo->bank, binfo->mi, mfn, status);
-
-    /* This is free page */
-    if (status & PG_OFFLINE_OFFLINED)
-        *result = MCER_RECOVERED;
-    else if (status & PG_OFFLINE_AGAIN)
-        *result = MCER_CONTINUE;
-    else if (status & PG_OFFLINE_PENDING) {
-        /* This page has owner */
-        if (status & PG_OFFLINE_OWNED) {
-            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
-            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-              " by DOM %d\n", bank->mc_domid);
-            /* XXX: Cannot handle shared pages yet 
-             * (this should identify all domains and gfn mapping to
-             *  the mfn in question) */
-            BUG_ON( bank->mc_domid == DOMID_COW );
-            if ( bank->mc_domid != DOMID_XEN ) {
-                d = get_domain_by_id(bank->mc_domid);
-                ASSERT(d);
-                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-
-                if ( !is_vmce_ready(bank, d) )
-                {
-                    printk("DOM%d not ready for vMCE\n", d->domain_id);
-                    goto vmce_failed;
-                }
-
-                if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
-                {
-                    printk("Unmap broken memory %lx for DOM%d failed\n",
-                            mfn, d->domain_id);
-                    goto vmce_failed;
-                }
-
-                bank->mc_addr =  gfn << PAGE_SHIFT |
-                  (bank->mc_addr & (PAGE_SIZE -1 ));
-                if ( fill_vmsr_data(bank, d,
-                      global->mc_gstatus) == -1 )
-                {
-                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                      "failed\n", bank->mc_domid);
-                    goto vmce_failed;
-                }
-
-                /* We will inject vMCE to DOMU*/
-                if ( inject_vmce(d, VMCE_INJECT_BROADCAST) < 0 )
-                {
-                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                      " failed\n", d->domain_id);
-                    goto vmce_failed;
-                }
-                /* Impacted domain go on with domain's recovery job
-                 * if the domain has its own MCA handler.
-                 * For xen, it has contained the error and finished
-                 * its own recovery job.
-                 */
-                *result = MCER_RECOVERED;
-                put_domain(d);
-
-                return;
-vmce_failed:
-                put_domain(d);
-                domain_crash(d);
-            }
-        }
-    }
+    mc_memerr_dhandler(binfo, result, regs);
 }
 
 static int intel_srar_check(uint64_t status)
@@ -388,6 +272,19 @@ static int intel_srar_check(uint64_t sta
     return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
 }
 
+/* Tell whether the bank holds a valid address of kind addrtype. */
+static int intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
+{
+    /* Without ADDRV and MISCV there is no usable address of any kind. */
+    if (!(status & MCi_STATUS_ADDRV) || !(status & MCi_STATUS_MISCV))
+        return 0;
+    if ((misc & MCi_MISC_ADDRMOD_MASK) == MCi_MISC_PHYSMOD)
+        return (addrtype == MC_ADDR_PHYSICAL);
+
+    /* Any other address mode is treated as a non-physical address. */
+    return (addrtype == MC_ADDR_VIRTUAL);
+}
+
 static void intel_srar_dhandler(
              struct mca_binfo *binfo,
              enum mce_result *result,
@@ -882,6 +779,7 @@ static void intel_init_mce(void)
     x86_mce_vector_register(intel_machine_check);
     mce_recoverable_register(intel_recoverable_scan);
     mce_need_clearbank_register(intel_need_clearbank_scan);
+    mce_register_addrcheck(intel_checkaddr);
 
     mce_dhandlers = intel_mce_dhandlers;
     mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

                 reply	other threads:[~2012-10-05 13:55 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=506EE6B9.2070205@amd.com \
    --to=christoph.egger@amd.com \
    --cc=xen-devel@lists.xen.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.