From: Marcelo Tosatti <mtosatti@redhat.com>
To: Huang Ying <ying.huang@intel.com>
Cc: Avi Kivity <avi@redhat.com>, Andi Kleen <andi@firstfloor.org>,
Anthony Liguori <aliguori@us.ibm.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>
Subject: Re: [PATCH -v3] QEMU-KVM: MCE: Relay UCR MCE to guest
Date: Fri, 18 Sep 2009 14:05:35 -0300 [thread overview]
Message-ID: <20090918170535.GA9100@amt.cnet> (raw)
In-Reply-To: <1253260739.15717.511.camel@yhuang-dev.sh.intel.com>
On Fri, Sep 18, 2009 at 03:58:59PM +0800, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
>
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
>
> v3:
>
> - Re-raise SIGBUS for SIGBUS not from MCE
> - Kill itself for error in kvm_inject_x86_mce
This is broken, non-MCE SIGBUS causes qemu-kvm to call the sigbus
handler in a loop.
BTW, how are you testing this and what guests have been tested?
>
> v2:
>
> - Use qemu_ram_addr_from_host instead of self made one to covert from
> host address to guest RAM address. Thanks Anthony Liguori.
>
> Signed-off-by: Huang Ying <ying.huang@intel.com>
>
> ---
> cpu-common.h | 1
> exec.c | 20 ++++-
> qemu-kvm.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++----
> qemu-kvm.h | 9 ++
> target-i386/cpu.h | 20 +++++
> target-i386/helper.c | 2
> 6 files changed, 208 insertions(+), 22 deletions(-)
>
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
> #include <sys/mman.h>
> #include <sys/ioctl.h>
> #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>
> #define false 0
> #define true 1
>
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
> #define EXPECTED_KVM_API_VERSION 12
>
> #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -1509,6 +1522,49 @@ static void sig_ipi_handler(int n)
> {
> }
>
> +static void hardware_memory_error(void)
> +{
> + fprintf(stderr, "Hardware memory error!\n");
> + exit(1);
> +}
> +
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> + if (first_cpu->mcg_cap && siginfo->ssi_addr
> + && siginfo->ssi_code == BUS_MCEERR_AO) {
> + uint64_t status;
> + unsigned long paddr;
> + CPUState *cenv;
> +
> + /* Hope we are lucky for AO MCE */
> + if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> + fprintf(stderr, "Hardware memory error for memory used by "
> + "QEMU itself instead of guest system!: %llx\n",
> + (unsigned long long)siginfo->ssi_addr);
> + return;
> + }
> + status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | 0xc0;
> + kvm_inject_x86_mce(first_cpu, 9, status,
> + MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> + (MCM_ADDR_PHYS << 6) | 0xc, 1);
> + for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> + kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
> + } else
> +#endif
> + {
> + if (siginfo->ssi_code == BUS_MCEERR_AO)
> + return;
> + else if (siginfo->ssi_code == BUS_MCEERR_AR)
> + hardware_memory_error();
> + else
> + raise(SIGBUS);
> + }
> +}
> +
> static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
> {
> struct qemu_work_item wi;
> @@ -1666,29 +1722,101 @@ static void flush_queued_work(CPUState *
> pthread_cond_broadcast(&qemu_work_cond);
> }
>
> +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> + struct kvm_x86_mce mce = {
> + .bank = 9,
> + };
> + unsigned long paddr;
> + int r;
> +
> + if (env->mcg_cap && siginfo->si_addr
> + && (siginfo->si_code == BUS_MCEERR_AR
> + || siginfo->si_code == BUS_MCEERR_AO)) {
> + if (siginfo->si_code == BUS_MCEERR_AR) {
> + /* Fake an Intel architectural Data Load SRAR UCR */
> + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | MCI_STATUS_AR | 0x134;
> + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> + } else {
> + /* Fake an Intel architectural Memory scrubbing UCR */
> + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | 0xc0;
> + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> + }
> + if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> + fprintf(stderr, "Hardware memory error for memory used by "
> + "QEMU itself instaed of guest system!\n");
> + /* Hope we are lucky for AO MCE */
> + if (siginfo->si_code == BUS_MCEERR_AO)
> + return;
> + else
> + hardware_memory_error();
> + }
> + mce.addr = paddr;
> + r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
> + if (r < 0) {
> + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> + abort();
> + }
> + } else
> +#endif
> + {
> + if (siginfo->si_code == BUS_MCEERR_AO)
> + return;
> + else if (siginfo->si_code == BUS_MCEERR_AR)
> + hardware_memory_error();
> + else
> + raise(SIGBUS);
> + }
> +}
> +
> static void kvm_main_loop_wait(CPUState *env, int timeout)
> {
> struct timespec ts;
> int r, e;
> siginfo_t siginfo;
> sigset_t waitset;
> -
> - pthread_mutex_unlock(&qemu_mutex);
> + sigset_t chkset;
>
> ts.tv_sec = timeout / 1000;
> ts.tv_nsec = (timeout % 1000) * 1000000;
> sigemptyset(&waitset);
> sigaddset(&waitset, SIG_IPI);
> + sigaddset(&waitset, SIGBUS);
>
> - r = sigtimedwait(&waitset, &siginfo, &ts);
> - e = errno;
> + do {
> + pthread_mutex_unlock(&qemu_mutex);
>
> - pthread_mutex_lock(&qemu_mutex);
> + r = sigtimedwait(&waitset, &siginfo, &ts);
> + e = errno;
>
> - if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> - printf("sigtimedwait: %s\n", strerror(e));
> - exit(1);
> - }
> + pthread_mutex_lock(&qemu_mutex);
> +
> + if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> + printf("sigtimedwait: %s\n", strerror(e));
> + exit(1);
> + }
> +
> + switch (r) {
> + case SIGBUS:
> + kvm_on_sigbus(env, &siginfo);
> + break;
> + default:
> + break;
> + }
> +
> + r = sigpending(&chkset);
> + if (r == -1) {
> + printf("sigpending: %s\n", strerror(e));
> + exit(1);
> + }
> + } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
>
> cpu_single_env = env;
> flush_queued_work(env);
> @@ -1769,6 +1897,7 @@ static void setup_kernel_sigmask(CPUStat
>
> sigprocmask(SIG_BLOCK, NULL, &set);
> sigdelset(&set, SIG_IPI);
> + sigdelset(&set, SIGBUS);
>
> kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
> }
> @@ -1896,12 +2025,20 @@ void kvm_hpet_enable_kpit(void)
>
> int kvm_init_ap(void)
> {
> + struct sigaction action;
> +
> #ifdef TARGET_I386
> kvm_tpr_opt_setup();
> #endif
> qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
>
> signal(SIG_IPI, sig_ipi_handler);
> +
> + memset(&action, 0, sizeof(action));
> + action.sa_flags = SA_SIGINFO;
> + action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
> + sigaction(SIGBUS, &action, NULL);
> + prctl(PR_MCE_KILL, 1, 1);
> return 0;
> }
>
> @@ -1962,7 +2099,10 @@ static void sigfd_handler(void *opaque)
> }
>
> sigaction(info.ssi_signo, NULL, &action);
> - if (action.sa_handler)
> + if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
> + action.sa_sigaction(info.ssi_signo,
> + (siginfo_t *)&info, NULL);
> + else if (action.sa_handler)
> action.sa_handler(info.ssi_signo);
>
> }
> @@ -2012,6 +2152,7 @@ int kvm_main_loop(void)
> sigemptyset(&mask);
> sigaddset(&mask, SIGIO);
> sigaddset(&mask, SIGALRM);
> + sigaddset(&mask, SIGBUS);
> sigprocmask(SIG_BLOCK, &mask, NULL);
>
> sigfd = qemu_signalfd(&mask);
> @@ -2508,6 +2649,7 @@ int kvm_set_boot_cpu_id(uint32_t id)
> struct kvm_x86_mce_data {
> CPUState *env;
> struct kvm_x86_mce *mce;
> + int abort_on_error;
> };
>
> static void kvm_do_inject_x86_mce(void *_data)
> @@ -2516,13 +2658,17 @@ static void kvm_do_inject_x86_mce(void *
> int r;
>
> r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
> - if (r < 0)
> + if (r < 0) {
> perror("kvm_set_mce FAILED");
> + if (data->abort_on_error)
> + abort();
> + }
> }
> #endif
>
> void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> - uint64_t mcg_status, uint64_t addr, uint64_t misc)
> + uint64_t mcg_status, uint64_t addr, uint64_t misc,
> + int abort_on_error)
> {
> #ifdef KVM_CAP_MCE
> struct kvm_x86_mce mce = {
> @@ -2535,9 +2681,17 @@ void kvm_inject_x86_mce(CPUState *cenv,
> struct kvm_x86_mce_data data = {
> .env = cenv,
> .mce = &mce,
> + .abort_on_error = abort_on_error,
> };
>
> + if (!cenv->mcg_cap) {
> + fprintf(stderr, "MCE support is not enabled!\n");
> + return;
> + }
> on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
> +#else
> + if (abort_on_error)
> + abort();
> #endif
> }
> #endif
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -250,16 +250,32 @@
> #define PG_ERROR_RSVD_MASK 0x08
> #define PG_ERROR_I_D_MASK 0x10
>
> -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
> +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
> +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
>
> -#define MCE_CAP_DEF MCG_CTL_P
> +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> #define MCE_BANKS_DEF 10
>
> +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
> +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
> #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
>
> #define MCI_STATUS_VAL (1ULL<<63) /* valid error */
> #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
> #define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
> +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
> +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
> +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
> +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
> +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
> +#define MCI_STATUS_AR (1ULL<<55) /* Action required */
> +
> +/* MISC register defines */
> +#define MCM_ADDR_SEGOFF 0 /* segment offset */
> +#define MCM_ADDR_LINEAR 1 /* linear address */
> +#define MCM_ADDR_PHYS 2 /* physical address */
> +#define MCM_ADDR_MEM 3 /* memory address */
> +#define MCM_ADDR_GENERIC 7 /* generic */
>
> #define MSR_IA32_TSC 0x10
> #define MSR_IA32_APICBASE 0x1b
> --- a/cpu-common.h
> +++ b/cpu-common.h
> @@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr);
> /* This should only be used for ram local to a device. */
> void *qemu_get_ram_ptr(ram_addr_t addr);
> /* This should not be used by devices. */
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
> ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
> int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> --- a/exec.c
> +++ b/exec.c
> @@ -2600,9 +2600,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
> return block->host + (addr - block->offset);
> }
>
> -/* Some of the softmmu routines need to translate from a host pointer
> - (typically a TLB entry) back to a ram offset. */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
> {
> RAMBlock *prev;
> RAMBlock **prevp;
> @@ -2619,11 +2617,23 @@ ram_addr_t qemu_ram_addr_from_host(void
> prev = block;
> block = block->next;
> }
> - if (!block) {
> + if (!block)
> + return -1;
> + *ram_addr = block->offset + (host - block->host);
> + return 0;
> +}
> +
> +/* Some of the softmmu routines need to translate from a host pointer
> + (typically a TLB entry) back to a ram offset. */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> + ram_addr_t ram_addr;
> +
> + if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
> fprintf(stderr, "Bad ram pointer %p\n", ptr);
> abort();
> }
> - return block->offset + (host - block->host);
> + return ram_addr;
> }
>
> static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
> --- a/qemu-kvm.h
> +++ b/qemu-kvm.h
> @@ -625,9 +625,11 @@ int kvm_inject_nmi(kvm_vcpu_context_t vc
> * \param mcg_status MSR_MCG_STATUS
> * \param addr MSR_MCI_ADDR
> * \param misc MSR_MCI_MISC
> + * \param abort_on_error abort on error
> */
> void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> - uint64_t mcg_status, uint64_t addr, uint64_t misc);
> + uint64_t mcg_status, uint64_t addr, uint64_t misc,
> + int abort_on_error);
>
> /*!
> * \brief Query wheather in kernel pit is used
> @@ -943,8 +945,11 @@ static inline int kvm_init(int smp_cpus)
>
> static inline void kvm_inject_x86_mce(CPUState *cenv, int bank,
> uint64_t status, uint64_t mcg_status,
> - uint64_t addr, uint64_t misc)
> + uint64_t addr, uint64_t misc,
> + int abort_on_error)
> {
> + if (abort_on_error)
> + abort();
> }
>
>
> --- a/target-i386/helper.c
> +++ b/target-i386/helper.c
> @@ -1547,7 +1547,7 @@ void cpu_inject_x86_mce(CPUState *cenv,
> uint64_t *banks = cenv->mce_banks;
>
> if (kvm_enabled()) {
> - kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
> + kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
> return;
> }
>
>
next prev parent reply other threads:[~2009-09-18 17:06 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-18 7:58 [PATCH -v3] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
2009-09-18 17:05 ` Marcelo Tosatti [this message]
2009-09-21 2:23 ` Huang Ying
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090918170535.GA9100@amt.cnet \
--to=mtosatti@redhat.com \
--cc=aliguori@us.ibm.com \
--cc=andi@firstfloor.org \
--cc=avi@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=ying.huang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox