From: Huang Ying <ying.huang@intel.com>
To: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>,
Anthony Liguori <aliguori@us.ibm.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>
Subject: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
Date: Mon, 07 Sep 2009 16:32:33 +0800 [thread overview]
Message-ID: <1252312353.14648.731.camel@yhuang-dev.sh.intel.com> (raw)
UCR (uncorrected recoverable) MCE is supported in recent Intel CPUs,
where some hardware errors, such as certain memory errors, can be reported
without PCC (processor context corrupt). To recover from such an MCE,
the corresponding memory is unmapped, and all processes accessing
that memory are killed via SIGBUS.
For KVM, if QEMU/KVM is killed, all guest processes are killed
too. So we relay the SIGBUS from the host OS to the guest system via a UCR
MCE injection. The guest OS can then isolate the corresponding memory and
kill only the affected guest processes. A SIGBUS sent to the main thread (not
a VCPU thread) is broadcast to all VCPU threads as a UCR MCE.
Signed-off-by: Huang Ying <ying.huang@intel.com>
---
qemu-kvm.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++----
target-i386/cpu.h | 20 +++++-
2 files changed, 181 insertions(+), 12 deletions(-)
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -27,10 +27,23 @@
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <signal.h>
+#include <sys/signalfd.h>
+#include <sys/prctl.h>
#define false 0
#define true 1
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
#define EXPECTED_KVM_API_VERSION 12
#if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
@@ -702,6 +715,24 @@ int kvm_get_dirty_pages_range(kvm_contex
return 0;
}
+/*
+ * Translate a QEMU userspace address into the physical address recorded
+ * for the memory slot that contains it.
+ *
+ * Walks all KVM_MAX_NUM_MEM_REGIONS entries of the slots table and picks
+ * the first slot with a non-zero length whose range
+ * [slot->userspace_addr, slot->userspace_addr + slot->len) covers
+ * userspace_addr.
+ *
+ * Returns 0 and stores the translated address in *phys_addr on success;
+ * returns -1 and leaves *phys_addr untouched when no slot matches.
+ */
+static int kvm_addr_userspace_to_phys(unsigned long userspace_addr,
+                                      unsigned long *phys_addr)
+{
+    int idx;
+
+    for (idx = 0; idx < KVM_MAX_NUM_MEM_REGIONS; ++idx) {
+        struct slot_info *s = &slots[idx];
+
+        /* Skip slots with zero length. */
+        if (!s->len)
+            continue;
+        /* Address lies below the start of this slot. */
+        if (!(s->userspace_addr <= userspace_addr))
+            continue;
+        /* Address lies at or beyond the end of this slot. */
+        if (!((s->userspace_addr + s->len) > userspace_addr))
+            continue;
+
+        /* Offset within the slot, rebased onto the slot's phys_addr. */
+        *phys_addr = userspace_addr - s->userspace_addr + s->phys_addr;
+        return 0;
+    }
+
+    return -1;
+}
+
#ifdef KVM_CAP_IRQCHIP
int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
@@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
{
}
+/*
+ * SIGBUS handler that receives the signal details as a
+ * struct signalfd_siginfo (only ssi_code and ssi_addr are read).
+ * n and ctx are unused.
+ *
+ * BUS_MCEERR_AO: translate ssi_addr with kvm_addr_userspace_to_phys().
+ *   - If no memory slot covers the address, log it and return without
+ *     terminating.
+ *   - Otherwise inject an uncorrected MCE carrying the translated address
+ *     into bank 9 of first_cpu, and an address-less uncorrected MCE into
+ *     bank 1 of every following CPU in the next_cpu list, then return.
+ *
+ * Any other code: print a message ("Hardware memory error!" for
+ * BUS_MCEERR_AR, "Internal error in QEMU!" otherwise) and exit(1).
+ */
+static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
+{
+    uint64_t bank_status;
+    unsigned long guest_paddr;
+    CPUState *cpu;
+
+    /* Everything except an action-optional error is fatal. */
+    if (siginfo->ssi_code != BUS_MCEERR_AO) {
+        if (siginfo->ssi_code == BUS_MCEERR_AR) {
+            fprintf(stderr, "Hardware memory error!\n");
+        } else {
+            fprintf(stderr, "Internal error in QEMU!\n");
+        }
+        exit(1);
+    }
+
+    /* Hope we are lucky for AO MCE */
+    if (kvm_addr_userspace_to_phys((unsigned long)siginfo->ssi_addr,
+                                   &guest_paddr) != 0) {
+        fprintf(stderr, "Hardware memory error for memory used by "
+                "QEMU itself instead of guest system!: %llx\n",
+                (unsigned long long)siginfo->ssi_addr);
+        return;
+    }
+
+    bank_status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+        | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+        | 0xc0;
+
+    /* The first CPU gets the full record, including the address. */
+    kvm_inject_x86_mce(first_cpu, 9, bank_status,
+                       MCG_STATUS_MCIP | MCG_STATUS_RIPV, guest_paddr,
+                       (MCM_ADDR_PHYS << 6) | 0xc);
+
+    /* The remaining CPUs get a bare uncorrected-error record. */
+    cpu = first_cpu->next_cpu;
+    while (cpu != NULL) {
+        kvm_inject_x86_mce(cpu, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
+        cpu = cpu->next_cpu;
+    }
+}
+
static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
struct qemu_work_item wi;
@@ -1657,29 +1720,102 @@ static void flush_queued_work(CPUState *
pthread_cond_broadcast(&qemu_work_cond);
}
+/*
+ * Handle a SIGBUS described by a regular siginfo_t on behalf of VCPU env.
+ *
+ * When built with KVM_CAP_MCE for TARGET_I386, and env->mcg_cap is
+ * non-zero, si_addr is non-NULL and si_code is BUS_MCEERR_AR or
+ * BUS_MCEERR_AO, the error is turned into a machine check on bank 9 of
+ * this VCPU via kvm_set_mce():
+ *   - BUS_MCEERR_AR: status carries MCI_STATUS_AR, mcg_status carries
+ *     MCG_STATUS_EIPV.
+ *   - BUS_MCEERR_AO: status carries the 0xc0 error code, mcg_status
+ *     carries MCG_STATUS_RIPV.
+ * If si_addr is not covered by any memory slot, the error is reported on
+ * stderr; an AO error is then ignored, an AR error terminates with
+ * exit(1). A failing kvm_set_mce() also terminates with exit(1).
+ *
+ * In every other case an AO error is silently ignored, and anything else
+ * prints a message and terminates with exit(1).
+ */
+static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    struct kvm_x86_mce mce = {
+        .bank = 9,
+    };
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && siginfo->si_addr
+        && (siginfo->si_code == BUS_MCEERR_AR
+            || siginfo->si_code == BUS_MCEERR_AO)) {
+        /* Fields common to both the AR and the AO record. */
+        mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
+        mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+        mce.mcg_status = MCG_STATUS_MCIP;
+
+        if (siginfo->si_code == BUS_MCEERR_AR) {
+            mce.status |= MCI_STATUS_AR;
+            mce.mcg_status |= MCG_STATUS_EIPV;
+        } else {
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status |= 0xc0;
+            mce.mcg_status |= MCG_STATUS_RIPV;
+        }
+
+        if (kvm_addr_userspace_to_phys((unsigned long)siginfo->si_addr,
+                                       &paddr)) {
+            /* Same wording as the message printed by sigbus_handler(). */
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %llx\n",
+                    (unsigned long long)(unsigned long)siginfo->si_addr);
+            /* Hope we are lucky for AO MCE */
+            if (siginfo->si_code == BUS_MCEERR_AO)
+                return;
+            else
+                exit(1);
+        }
+
+        mce.addr = paddr;
+        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            exit(1);
+        }
+    } else
+#endif
+    {
+        /* No MCE relay possible: AO can be ignored, the rest is fatal. */
+        if (siginfo->si_code == BUS_MCEERR_AO)
+            return;
+        if (siginfo->si_code == BUS_MCEERR_AR)
+            fprintf(stderr, "Hardware memory error!\n");
+        else
+            fprintf(stderr, "Internal error in QEMU!\n");
+        exit(1);
+    }
+}
+
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
struct timespec ts;
int r, e;
siginfo_t siginfo;
sigset_t waitset;
-
- pthread_mutex_unlock(&qemu_mutex);
+ sigset_t chkset;
ts.tv_sec = timeout / 1000;
ts.tv_nsec = (timeout % 1000) * 1000000;
sigemptyset(&waitset);
sigaddset(&waitset, SIG_IPI);
+ sigaddset(&waitset, SIGBUS);
- r = sigtimedwait(&waitset, &siginfo, &ts);
- e = errno;
+ do {
+ pthread_mutex_unlock(&qemu_mutex);
- pthread_mutex_lock(&qemu_mutex);
+ r = sigtimedwait(&waitset, &siginfo, &ts);
+ e = errno;
- if (r == -1 && !(e == EAGAIN || e == EINTR)) {
- printf("sigtimedwait: %s\n", strerror(e));
- exit(1);
- }
+ pthread_mutex_lock(&qemu_mutex);
+
+ if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+ printf("sigtimedwait: %s\n", strerror(e));
+ exit(1);
+ }
+
+ switch (r) {
+ case SIGBUS:
+ kvm_on_sigbus(env, &siginfo);
+ break;
+ default:
+ break;
+ }
+
+ r = sigpending(&chkset);
+ if (r == -1) {
+ printf("sigpending: %s\n", strerror(e));
+ exit(1);
+ }
+ } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
cpu_single_env = env;
flush_queued_work(env);
@@ -1760,6 +1896,7 @@ static void setup_kernel_sigmask(CPUStat
sigprocmask(SIG_BLOCK, NULL, &set);
sigdelset(&set, SIG_IPI);
+ sigdelset(&set, SIGBUS);
kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
}
@@ -1885,12 +2022,20 @@ void kvm_hpet_enable_kpit(void)
int kvm_init_ap(void)
{
+ struct sigaction action;
+
#ifdef TARGET_I386
kvm_tpr_opt_setup();
#endif
qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
signal(SIG_IPI, sig_ipi_handler);
+
+ memset(&action, 0, sizeof(action));
+ action.sa_flags = SA_SIGINFO; /* select the three-argument sa_sigaction handler form */
+ action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; /* cast: sigbus_handler takes struct signalfd_siginfo *, not siginfo_t * */
+ sigaction(SIGBUS, &action, NULL); /* NOTE(review): return value is not checked */
+ prctl(PR_MCE_KILL, 1, 1); /* NOTE(review): presumably PR_MCE_KILL_SET + PR_MCE_KILL_EARLY - confirm against prctl(2); return value is not checked */
return 0;
}
@@ -1951,7 +2096,10 @@ static void sigfd_handler(void *opaque)
}
sigaction(info.ssi_signo, NULL, &action);
- if (action.sa_handler)
+ if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
+ action.sa_sigaction(info.ssi_signo,
+ (siginfo_t *)&info, NULL);
+ else if (action.sa_handler)
action.sa_handler(info.ssi_signo);
}
@@ -2001,6 +2149,7 @@ int kvm_main_loop(void)
sigemptyset(&mask);
sigaddset(&mask, SIGIO);
sigaddset(&mask, SIGALRM);
+ sigaddset(&mask, SIGBUS);
sigprocmask(SIG_BLOCK, &mask, NULL);
sigfd = qemu_signalfd(&mask);
@@ -2526,6 +2675,10 @@ void kvm_inject_x86_mce(CPUState *cenv,
.mce = &mce,
};
+ if (!cenv->mcg_cap) {
+ fprintf(stderr, "MCE support is not enabled!\n");
+ return;
+ }
on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
#endif
}
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@
#define PG_ERROR_RSVD_MASK 0x08
#define PG_ERROR_I_D_MASK 0x10
-#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
+#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
-#define MCE_CAP_DEF MCG_CTL_P
+#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
#define MCE_BANKS_DEF 10
+#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
+#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
+#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
+#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
+#define MCI_STATUS_AR (1ULL<<55) /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF 0 /* segment offset */
+#define MCM_ADDR_LINEAR 1 /* linear address */
+#define MCM_ADDR_PHYS 2 /* physical address */
+#define MCM_ADDR_MEM 3 /* memory address */
+#define MCM_ADDR_GENERIC 7 /* generic */
#define MSR_IA32_TSC 0x10
#define MSR_IA32_APICBASE 0x1b
next reply other threads:[~2009-09-07 8:32 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-07 8:32 Huang Ying [this message]
2009-09-07 20:48 ` [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest Anthony Liguori
2009-09-08 5:41 ` Huang Ying
2009-09-08 13:07 ` Anthony Liguori
2009-09-08 6:41 ` Avi Kivity
2009-09-08 6:46 ` Huang Ying
2009-09-08 8:11 ` Andi Kleen
2009-09-09 12:10 ` Avi Kivity
2009-09-10 2:50 ` Huang Ying
2009-09-08 6:44 ` Avi Kivity
2009-09-08 6:43 ` Huang Ying
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1252312353.14648.731.camel@yhuang-dev.sh.intel.com \
--to=ying.huang@intel.com \
--cc=aliguori@us.ibm.com \
--cc=andi@firstfloor.org \
--cc=avi@redhat.com \
--cc=kvm@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox