[PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
@ 2009-09-07  8:32 Huang Ying
  2009-09-07 20:48 ` Anthony Liguori
  2009-09-08  6:44 ` Avi Kivity
  0 siblings, 2 replies; 11+ messages in thread
From: Huang Ying @ 2009-09-07  8:32 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm@vger.kernel.org

UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
where some hardware error such as some memory error can be reported
without PCC (processor context corrupted). To recover from such MCE,
the corresponding memory will be unmapped, and all processes accessing
the memory will be killed via SIGBUS.

For KVM, if QEMU/KVM is killed, all guest processes will be killed
too. So we relay SIGBUS from host OS to guest system via a UCR MCE
injection. Then guest OS can isolate corresponding memory and kill
necessary guest processes only. SIGBUS sent to main thread (not VCPU
threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Huang Ying <ying.huang@intel.com>

---
 qemu-kvm.c        |  173 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 target-i386/cpu.h |   20 +++++-
 2 files changed, 181 insertions(+), 12 deletions(-)

--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -27,10 +27,23 @@
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <signal.h>
+#include <sys/signalfd.h>
+#include <sys/prctl.h>
 
 #define false 0
 #define true 1
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #define EXPECTED_KVM_API_VERSION 12
 
 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
@@ -702,6 +715,24 @@ int kvm_get_dirty_pages_range(kvm_contex
     return 0;
 }
 
+static int kvm_addr_userspace_to_phys(unsigned long userspace_addr,
+                               unsigned long *phys_addr)
+{
+	int i;
+	struct slot_info *slot;
+
+	for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
+		slot = &slots[i];
+		if (slot->len && slot->userspace_addr <= userspace_addr &&
+		    (slot->userspace_addr + slot->len) > userspace_addr) {
+			*phys_addr = userspace_addr - slot->userspace_addr +
+				slot->phys_addr;
+			return 0;
+		}
+	}
+	return -1;
+}
+
 #ifdef KVM_CAP_IRQCHIP
 
 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
@@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
 {
 }
 
+static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
+{
+    if (siginfo->ssi_code == BUS_MCEERR_AO) {
+        uint64_t status;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        if (kvm_addr_userspace_to_phys((unsigned long)siginfo->ssi_addr,
+                                       &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %llx\n",
+                    (unsigned long long)siginfo->ssi_addr);
+            return;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
+        return;
+    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
+        fprintf(stderr, "Hardware memory error!\n");
+    else
+        fprintf(stderr, "Internal error in QEMU!\n");
+    exit(1);
+}
+
 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
 {
     struct qemu_work_item wi;
@@ -1657,29 +1720,102 @@ static void flush_queued_work(CPUState *
     pthread_cond_broadcast(&qemu_work_cond);
 }
 
+static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && siginfo->si_addr
+        && (siginfo->si_code == BUS_MCEERR_AR
+            || siginfo->si_code == BUS_MCEERR_AO)) {
+        if (siginfo->si_code == BUS_MCEERR_AR) {
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        if (kvm_addr_userspace_to_phys((unsigned long)siginfo->si_addr,
+                                       &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (siginfo->si_code == BUS_MCEERR_AO)
+                return;
+            else
+                exit(1);
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            exit(1);
+        }
+    } else
+#endif
+    {
+        if (siginfo->si_code == BUS_MCEERR_AO)
+            return;
+        if (siginfo->si_code == BUS_MCEERR_AR)
+            fprintf(stderr, "Hardware memory error!\n");
+        else
+            fprintf(stderr, "Internal error in QEMU!\n");
+        exit(1);
+    }
+}
+
 static void kvm_main_loop_wait(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
-
-    pthread_mutex_unlock(&qemu_mutex);
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
+    do {
+        pthread_mutex_unlock(&qemu_mutex);
 
-    pthread_mutex_lock(&qemu_mutex);
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        printf("sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        pthread_mutex_lock(&qemu_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            printf("sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+            kvm_on_sigbus(env, &siginfo);
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            printf("sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 
     cpu_single_env = env;
     flush_queued_work(env);
@@ -1760,6 +1896,7 @@ static void setup_kernel_sigmask(CPUStat
 
     sigprocmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
 
     kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
 }
@@ -1885,12 +2022,20 @@ void kvm_hpet_enable_kpit(void)
 
 int kvm_init_ap(void)
 {
+    struct sigaction action;
+
 #ifdef TARGET_I386
     kvm_tpr_opt_setup();
 #endif
     qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
 
     signal(SIG_IPI, sig_ipi_handler);
+
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1);
     return 0;
 }
 
@@ -1951,7 +2096,10 @@ static void sigfd_handler(void *opaque)
         }
 
         sigaction(info.ssi_signo, NULL, &action);
-        if (action.sa_handler)
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+	else if (action.sa_handler)
             action.sa_handler(info.ssi_signo);
 
     }
@@ -2001,6 +2149,7 @@ int kvm_main_loop(void)
     sigemptyset(&mask);
     sigaddset(&mask, SIGIO);
     sigaddset(&mask, SIGALRM);
+    sigaddset(&mask, SIGBUS);
     sigprocmask(SIG_BLOCK, &mask, NULL);
 
     sigfd = qemu_signalfd(&mask);
@@ -2526,6 +2675,10 @@ void kvm_inject_x86_mce(CPUState *cenv, 
         .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
     on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
 #endif
 }
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-07  8:32 [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
@ 2009-09-07 20:48 ` Anthony Liguori
  2009-09-08  5:41   ` Huang Ying
                     ` (2 more replies)
  2009-09-08  6:44 ` Avi Kivity
  1 sibling, 3 replies; 11+ messages in thread
From: Anthony Liguori @ 2009-09-07 20:48 UTC (permalink / raw)
  To: Huang Ying; +Cc: Avi Kivity, Andi Kleen, kvm@vger.kernel.org

Hi Huang,

Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
>
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
>
> Signed-off-by: Huang Ying <ying.huang@intel.com>
>
> ---
>  qemu-kvm.c        |  173 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  target-i386/cpu.h |   20 +++++-
>  2 files changed, 181 insertions(+), 12 deletions(-)
>
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
>  #include <sys/mman.h>
>  #include <sys/ioctl.h>
>  #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>
>  #define false 0
>  #define true 1
>
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
>  #define EXPECTED_KVM_API_VERSION 12
>
>  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -702,6 +715,24 @@ int kvm_get_dirty_pages_range(kvm_contex
>      return 0;
>  }
>
> +static int kvm_addr_userspace_to_phys(unsigned long userspace_addr,
> +                               unsigned long *phys_addr)
> +{
> +	int i;
> +	struct slot_info *slot;
> +
> +	for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
> +		slot = &slots[i];
> +		if (slot->len && slot->userspace_addr <= userspace_addr &&
> +		    (slot->userspace_addr + slot->len) > userspace_addr) {
> +			*phys_addr = userspace_addr - slot->userspace_addr +
> +				slot->phys_addr;
> +			return 0;
> +		}
> +	}
> +	return -1;
> +}
> +
>   

The slot mapping is actually a copy of the qemu's ram_blocks structure 
(see exec.c).  If you base your check on that, it will Just Work for 
QEMU too.

>  #ifdef KVM_CAP_IRQCHIP
>
>  int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
> @@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
>  {
>  }
>
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        unsigned long paddr;
> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
>   

Even if the error was limited to guest memory, it could have been 
generated by either the kernel or userspace reading guest memory, no?

Does this potentially open a security hole for us?  Consider the following:

1) We happen to read guest memory and that causes an MCE.  For instance, 
say we're in virtio.c and we read the virtio ring.
2) That should trigger the kernel to generate a sigbus.
3) We catch sigbus, and queue an MCE for delivery.
4) After sigbus handler completes, we're back in virtio.c, what was the 
value of the memory operation we just completed?

If the instruction gets skipped, we may be leaking host memory because 
the access never happened.

-- 
Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-07 20:48 ` Anthony Liguori
@ 2009-09-08  5:41   ` Huang Ying
  2009-09-08 13:07     ` Anthony Liguori
  2009-09-08  6:41   ` Avi Kivity
  2009-09-08  8:11   ` Andi Kleen
  2 siblings, 1 reply; 11+ messages in thread
From: Huang Ying @ 2009-09-08  5:41 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Avi Kivity, Andi Kleen, kvm@vger.kernel.org

On Tue, 2009-09-08 at 04:48 +0800, Anthony Liguori wrote: 
> Hi Huang,
> 
> Huang Ying wrote:
> > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> > where some hardware error such as some memory error can be reported
> > without PCC (processor context corrupted). To recover from such MCE,
> > the corresponding memory will be unmapped, and all processes accessing
> > the memory will be killed via SIGBUS.
> >
> > For KVM, if QEMU/KVM is killed, all guest processes will be killed
> > too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> > injection. Then guest OS can isolate corresponding memory and kill
> > necessary guest processes only. SIGBUS sent to main thread (not VCPU
> > threads) will be broadcast to all VCPU threads as UCR MCE.
> >
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> >
> > ---
> >  qemu-kvm.c        |  173 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> >  target-i386/cpu.h |   20 +++++-
> >  2 files changed, 181 insertions(+), 12 deletions(-)
> >
> > --- a/qemu-kvm.c
> > +++ b/qemu-kvm.c
> > @@ -27,10 +27,23 @@
> >  #include <sys/mman.h>
> >  #include <sys/ioctl.h>
> >  #include <signal.h>
> > +#include <sys/signalfd.h>
> > +#include <sys/prctl.h>
> >
> >  #define false 0
> >  #define true 1
> >
> > +#ifndef PR_MCE_KILL
> > +#define PR_MCE_KILL 33
> > +#endif
> > +
> > +#ifndef BUS_MCEERR_AR
> > +#define BUS_MCEERR_AR 4
> > +#endif
> > +#ifndef BUS_MCEERR_AO
> > +#define BUS_MCEERR_AO 5
> > +#endif
> > +
> >  #define EXPECTED_KVM_API_VERSION 12
> >
> >  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> > @@ -702,6 +715,24 @@ int kvm_get_dirty_pages_range(kvm_contex
> >      return 0;
> >  }
> >
> > +static int kvm_addr_userspace_to_phys(unsigned long userspace_addr,
> > +                               unsigned long *phys_addr)
> > +{
> > +	int i;
> > +	struct slot_info *slot;
> > +
> > +	for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
> > +		slot = &slots[i];
> > +		if (slot->len && slot->userspace_addr <= userspace_addr &&
> > +		    (slot->userspace_addr + slot->len) > userspace_addr) {
> > +			*phys_addr = userspace_addr - slot->userspace_addr +
> > +				slot->phys_addr;
> > +			return 0;
> > +		}
> > +	}
> > +	return -1;
> > +}
> > +
> >   
> 
> The slot mapping is actually a copy of the qemu's ram_blocks structure 
> (see exec.c).  If you base your check on that, it will Just Work for 
> QEMU too.

I find there is already a function named qemu_ram_addr_from_host which
translate from user space virtual address into qemu RAM address. But I
need function to return a error code instead of abort in case of no RAM
address corresponding specified user space virtual address. So I plan to
use following code to deal with that.

int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
ram_addr_t qemu_ram_addr_from_host(void *ptr);

Does this follow the coding style of qemu?

> >  #ifdef KVM_CAP_IRQCHIP
> >
> >  int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
> > @@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
> >  {
> >  }
> >
> > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> > +{
> > +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> > +        uint64_t status;
> > +        unsigned long paddr;
> > +        CPUState *cenv;
> > +
> > +        /* Hope we are lucky for AO MCE */
> >   
> 
> Even if the error was limited to guest memory, it could have been 
> generated by either the kernel or userspace reading guest memory, no?
> 
> Does this potentially open a security hole for us?  Consider the following:
> 
> 1) We happen to read guest memory and that causes an MCE.  For instance, 
> say we're in virtio.c and we read the virtio ring.
> 2) That should trigger the kernel to generate a sigbus.
> 3) We catch sigbus, and queue an MCE for delivery.
> 4) After sigbus handler completes, we're back in virtio.c, what was the 
> value of the memory operation we just completed?
> 
> If the instruction gets skipped, we may be leaking host memory because 
> the access never happened.

There are two kinds of recoverable MCE named SRAO (Software Recoverable
Action Optional) and SRAR (Software Recoverable Action Required). For
your example, it is a SRAR error. Where kernel will munmap the error
page and send SIGBUS to qemu via force_sig_info, which will unblock
SIGBUS and reset its action to SIG_DFL, so qemu will be terminated.

If the guest mode is interrupted, because signal mask processing of KVM
kernel part, SIGBUS can be captured by qemu.

For more details of recoverable MCE (SRAO and SRAR), you can refer to
latest Intel software developer's manual.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-07 20:48 ` Anthony Liguori
  2009-09-08  5:41   ` Huang Ying
@ 2009-09-08  6:41   ` Avi Kivity
  2009-09-08  6:46     ` Huang Ying
  2009-09-08  8:11   ` Andi Kleen
  2 siblings, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2009-09-08  6:41 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Huang Ying, Andi Kleen, kvm@vger.kernel.org

On 09/07/2009 11:48 PM, Anthony Liguori wrote:
>
>>  #ifdef KVM_CAP_IRQCHIP
>>
>>  int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int 
>> *status)
>> @@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
>>  {
>>  }
>>
>> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, 
>> void *ctx)
>> +{
>> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
>> +        uint64_t status;
>> +        unsigned long paddr;
>> +        CPUState *cenv;
>> +
>> +        /* Hope we are lucky for AO MCE */
>
> Even if the error was limited to guest memory, it could have been 
> generated by either the kernel or userspace reading guest memory, no?
>
> Does this potentially open a security hole for us?  Consider the 
> following:
>
> 1) We happen to read guest memory and that causes an MCE.  For 
> instance, say we're in virtio.c and we read the virtio ring.
> 2) That should trigger the kernel to generate a sigbus.
> 3) We catch sigbus, and queue an MCE for delivery.
> 4) After sigbus handler completes, we're back in virtio.c, what was 
> the value of the memory operation we just completed?
>
> If the instruction gets skipped, we may be leaking host memory because 
> the access never happened.
>

I think it's a lot safer to only report guest mode accesses to the 
guest, and let user mode accesses terminate qemu.  The guest wouldn't 
expect 100% recovery; for example if an uncorrectable error hit a vital 
kernel data structure.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-08  6:44 ` Avi Kivity
@ 2009-09-08  6:43   ` Huang Ying
  0 siblings, 0 replies; 11+ messages in thread
From: Huang Ying @ 2009-09-08  6:43 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm@vger.kernel.org

On Tue, 2009-09-08 at 14:44 +0800, Avi Kivity wrote: 
> On 09/07/2009 11:32 AM, Huang Ying wrote:
> > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> > where some hardware error such as some memory error can be reported
> > without PCC (processor context corrupted). To recover from such MCE,
> > the corresponding memory will be unmapped, and all processes accessing
> > the memory will be killed via SIGBUS.
> >
> > For KVM, if QEMU/KVM is killed, all guest processes will be killed
> > too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> > injection. Then guest OS can isolate corresponding memory and kill
> > necessary guest processes only. SIGBUS sent to main thread (not VCPU
> > threads) will be broadcast to all VCPU threads as UCR MCE.
> >
> >    
> Won't the guest be confused by the broadcast?  How does real hardware work?

We do broadcasting to follow the hardware behavior.

> > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> > +{
> > +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> > +        uint64_t status;
> > +        unsigned long paddr;
> > +        CPUState *cenv;
> > +
> > +        /* Hope we are lucky for AO MCE */
> > +        if (kvm_addr_userspace_to_phys((unsigned long)siginfo->ssi_addr,
> > +&paddr)) {
> > +            fprintf(stderr, "Hardware memory error for memory used by "
> > +                    "QEMU itself instead of guest system!: %llx\n",
> > +                    (unsigned long long)siginfo->ssi_addr);
> > +            return;
> > +        }
> > +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> > +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> > +            | 0xc0;
> > +        kvm_inject_x86_mce(first_cpu, 9, status,
> > +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> > +                           (MCM_ADDR_PHYS<<  6) | 0xc);
> >    
> 
> This is a vcpu ioctl, yes?  if so it must be called from the vcpu thread.

No. kvm_inject_x86_mce will call on_vcpu to do the real vcpu ioctl.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-07  8:32 [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
  2009-09-07 20:48 ` Anthony Liguori
@ 2009-09-08  6:44 ` Avi Kivity
  2009-09-08  6:43   ` Huang Ying
  1 sibling, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2009-09-08  6:44 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm@vger.kernel.org

On 09/07/2009 11:32 AM, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
>
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
>
>    

Won't the guest be confused by the broadcast?  How does real hardware work?
>
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        unsigned long paddr;
> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
> +        if (kvm_addr_userspace_to_phys((unsigned long)siginfo->ssi_addr,
> +&paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!: %llx\n",
> +                    (unsigned long long)siginfo->ssi_addr);
> +            return;
> +        }
> +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +            | 0xc0;
> +        kvm_inject_x86_mce(first_cpu, 9, status,
> +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> +                           (MCM_ADDR_PHYS<<  6) | 0xc);
>    

This is a vcpu ioctl, yes?  if so it must be called from the vcpu thread.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-08  6:41   ` Avi Kivity
@ 2009-09-08  6:46     ` Huang Ying
  0 siblings, 0 replies; 11+ messages in thread
From: Huang Ying @ 2009-09-08  6:46 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Anthony Liguori, Andi Kleen, kvm@vger.kernel.org

On Tue, 2009-09-08 at 14:41 +0800, Avi Kivity wrote: 
> On 09/07/2009 11:48 PM, Anthony Liguori wrote:
> >
> >>  #ifdef KVM_CAP_IRQCHIP
> >>
> >>  int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int 
> >> *status)
> >> @@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
> >>  {
> >>  }
> >>
> >> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, 
> >> void *ctx)
> >> +{
> >> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> >> +        uint64_t status;
> >> +        unsigned long paddr;
> >> +        CPUState *cenv;
> >> +
> >> +        /* Hope we are lucky for AO MCE */
> >
> > Even if the error was limited to guest memory, it could have been 
> > generated by either the kernel or userspace reading guest memory, no?
> >
> > Does this potentially open a security hole for us?  Consider the 
> > following:
> >
> > 1) We happen to read guest memory and that causes an MCE.  For 
> > instance, say we're in virtio.c and we read the virtio ring.
> > 2) That should trigger the kernel to generate a sigbus.
> > 3) We catch sigbus, and queue an MCE for delivery.
> > 4) After sigbus handler completes, we're back in virtio.c, what was 
> > the value of the memory operation we just completed?
> >
> > If the instruction gets skipped, we may be leaking host memory because 
> > the access never happened.
> >
> 
> I think it's a lot safer to only report guest mode accesses to the 
> guest, and let user mode accesses terminate qemu.  The guest wouldn't 
> expect 100% recovery; for example if an uncorrectable error hit a vital 
> kernel data structure.

Yes, this is the current behavior. If MCE is caused by user mode
accessing, the KVM will be killed by force_sig_info, only MCE caused by
guest mode accessing will be captured by SIGBUS signal handler.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-07 20:48 ` Anthony Liguori
  2009-09-08  5:41   ` Huang Ying
  2009-09-08  6:41   ` Avi Kivity
@ 2009-09-08  8:11   ` Andi Kleen
  2009-09-09 12:10     ` Avi Kivity
  2 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2009-09-08  8:11 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Huang Ying, Avi Kivity, Andi Kleen, kvm@vger.kernel.org

On Mon, Sep 07, 2009 at 03:48:07PM -0500, Anthony Liguori wrote:
>>
>>  int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
>> @@ -1515,6 +1546,38 @@ static void sig_ipi_handler(int n)
>>  {
>>  }
>>
>> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
>> +{
>> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
>> +        uint64_t status;
>> +        unsigned long paddr;
>> +        CPUState *cenv;
>> +
>> +        /* Hope we are lucky for AO MCE */
>>   
>
> Even if the error was limited to guest memory, it could have been generated 
> by either the kernel or userspace reading guest memory, no?

Only user space reads or asynchronously detected errors
(e.g. patrol scrubbing) are reported this way. Kernel reading
corrupted memory always leads to panic currently.

>
> Does this potentially open a security hole for us?  Consider the following:
>
> 1) We happen to read guest memory and that causes an MCE.  For instance, 
> say we're in virtio.c and we read the virtio ring.
> 2) That should trigger the kernel to generate a sigbus.
> 3) We catch sigbus, and queue an MCE for delivery.
> 4) After sigbus handler completes, we're back in virtio.c, what was the 
> value of the memory operation we just completed?

Yes for any errors on accessing qemu internal memory that is not
owned by the guest image you should abort. I thought Ying's patch
did that already though, by aborting if there's no slot match.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-08  5:41   ` Huang Ying
@ 2009-09-08 13:07     ` Anthony Liguori
  0 siblings, 0 replies; 11+ messages in thread
From: Anthony Liguori @ 2009-09-08 13:07 UTC (permalink / raw)
  To: Huang Ying; +Cc: Avi Kivity, Andi Kleen, kvm@vger.kernel.org

Huang Ying wrote:
> I find there is already a function named qemu_ram_addr_from_host which
> translate from user space virtual address into qemu RAM address. But I
> need function to return a error code instead of abort in case of no RAM
> address corresponding specified user space virtual address. So I plan to
> use following code to deal with that.
>
> int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
> ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
> Does this follow the coding style of qemu?
>   

I don't like the do_ prefix much but I don't have a better suggestion.

>> If the instruction gets skipped, we may be leaking host memory because 
>> the access never happened.
>>     
>
> There are two kinds of recoverable MCE named SRAO (Software Recoverable
> Action Optional) and SRAR (Software Recoverable Action Required). For
> your example, it is a SRAR error. Where kernel will munmap the error
> page and send SIGBUS to qemu via force_sig_info, which will unblock
> SIGBUS and reset its action to SIG_DFL, so qemu will be terminated.
>
> If the guest mode is interrupted, because signal mask processing of KVM
> kernel part, SIGBUS can be captured by qemu.
>   
Ah, I didn't realize this path just worked.

-- 
Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-08  8:11   ` Andi Kleen
@ 2009-09-09 12:10     ` Avi Kivity
  2009-09-10  2:50       ` Huang Ying
  0 siblings, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2009-09-09 12:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Anthony Liguori, Huang Ying, kvm@vger.kernel.org

On 09/08/2009 11:11 AM, Andi Kleen wrote:
>
>> Does this potentially open a security hole for us?  Consider the following:
>>
>> 1) We happen to read guest memory and that causes an MCE.  For instance,
>> say we're in virtio.c and we read the virtio ring.
>> 2) That should trigger the kernel to generate a sigbus.
>> 3) We catch sigbus, and queue an MCE for delivery.
>> 4) After sigbus handler completes, we're back in virtio.c, what was the
>> value of the memory operation we just completed?
>>      
> Yes for any errors on accessing qemu internal memory that is not
> owned by the guest image you should abort. I thought Ying's patch
> did that already though, by aborting if there's no slot match.
>    

User-mode qemu access should abort even if accessing guest memory, since 
there no way to recover the thread of execution (need a kernel-style 
exception table for each instruction that accesses guest memory, which 
would be a total overkill).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09 12:10     ` Avi Kivity
@ 2009-09-10  2:50       ` Huang Ying
  0 siblings, 0 replies; 11+ messages in thread
From: Huang Ying @ 2009-09-10  2:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm@vger.kernel.org

On Wed, 2009-09-09 at 20:10 +0800, Avi Kivity wrote: 
> On 09/08/2009 11:11 AM, Andi Kleen wrote:
> >
> >> Does this potentially open a security hole for us?  Consider the following:
> >>
> >> 1) We happen to read guest memory and that causes an MCE.  For instance,
> >> say we're in virtio.c and we read the virtio ring.
> >> 2) That should trigger the kernel to generate a sigbus.
> >> 3) We catch sigbus, and queue an MCE for delivery.
> >> 4) After sigbus handler completes, we're back in virtio.c, what was the
> >> value of the memory operation we just completed?
> >>      
> > Yes for any errors on accessing qemu internal memory that is not
> > owned by the guest image you should abort. I thought Ying's patch
> > did that already though, by aborting if there's no slot match.
> >    
> 
> User-mode qemu access should abort even if accessing guest memory, since 
> there no way to recover the thread of execution (need a kernel-style 
> exception table for each instruction that accesses guest memory, which 
> would be a total overkill).

For UCR MCE caused by user space read/write, SIGBUS will be sent via
force_sig_info. For guest mode qemu, SIGBUS will be captured, and for
user mode qemu, SIGBUS will kill qemu.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-09-10  2:50 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-07  8:32 [PATCH] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
2009-09-07 20:48 ` Anthony Liguori
2009-09-08  5:41   ` Huang Ying
2009-09-08 13:07     ` Anthony Liguori
2009-09-08  6:41   ` Avi Kivity
2009-09-08  6:46     ` Huang Ying
2009-09-08  8:11   ` Andi Kleen
2009-09-09 12:10     ` Avi Kivity
2009-09-10  2:50       ` Huang Ying
2009-09-08  6:44 ` Avi Kivity
2009-09-08  6:43   ` Huang Ying

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox