From: Avi Kivity <avi@redhat.com>
To: Huang Ying <ying.huang@intel.com>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
Andi Kleen <andi@firstfloor.org>
Subject: Re: [PATCH] Add MCE support to KVM
Date: Thu, 09 Apr 2009 18:50:53 +0300 [thread overview]
Message-ID: <49DE195D.1020303@redhat.com> (raw)
In-Reply-To: <1239155601.6384.3.camel@yhuang-dev.sh.intel.com>
Huang Ying wrote:
> Add MCE support to KVM. The related MSRs are emulated. A new vcpu
> ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation such as
> the mcg_cap. MCE is injected via vcpu ioctl command
> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> not implemented.
>
> Signed-off-by: Huang Ying <ying.huang@intel.com>
>
> ---
> arch/x86/include/asm/kvm_host.h | 5
> arch/x86/include/asm/mce.h | 1
> arch/x86/kvm/x86.c | 202 +++++++++++++++++++++++++++++++++++-----
> include/linux/kvm.h | 15 ++
> 4 files changed, 199 insertions(+), 24 deletions(-)
>
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -42,6 +42,7 @@
> #include <asm/msr.h>
> #include <asm/desc.h>
> #include <asm/mtrr.h>
> +#include <asm/mce.h>
>
> #define MAX_IO_MSRS 256
> #define CR0_RESERVED_BITS \
> @@ -734,23 +735,43 @@ static int set_msr_mtrr(struct kvm_vcpu
> return 0;
> }
>
> -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> {
> + u64 mcg_cap = vcpu->arch.mcg_cap;
> + unsigned bank_num = mcg_cap & 0xff;
> +
> switch (msr) {
> - case MSR_EFER:
> - set_efer(vcpu, data);
> - break;
> - case MSR_IA32_MC0_STATUS:
> - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
> - __func__, data);
> - break;
> case MSR_IA32_MCG_STATUS:
> - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
> - __func__, data);
> + vcpu->arch.mcg_status = data;
> break;
> case MSR_IA32_MCG_CTL:
> - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
> - __func__, data);
> + if (!(mcg_cap & MCG_CTL_P))
> + return 1;
> + if (data != 0 && data != ~(u64)0)
> + return -1;
> + vcpu->arch.mcg_ctl = data;
> + break;
> + default:
> + if (msr >= MSR_IA32_MC0_CTL &&
> + msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> + u32 offset = msr - MSR_IA32_MC0_CTL;
> + /* only 0 or all 1s can be written to IA32_MCi_CTL */
> + if ((offset & 0x3) == 0 &&
> + data != 0 && data != ~(u64)0)
> + return -1;
> + vcpu->arch.mce_banks[offset] = data;
> + break;
> + }
> + return 1;
> + }
> + return 0;
> +}
> +
> +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +{
> + switch (msr) {
> + case MSR_EFER:
> + set_efer(vcpu, data);
> break;
> case MSR_IA32_DEBUGCTLMSR:
> if (!data) {
> @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
> break;
> }
> default:
> + if (!set_msr_mce(vcpu, msr, data))
> + break;
> pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
> return 1;
> }
>
Is there any reason you split kvm_set_msr_common() into two functions?
>
> -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> {
> u64 data;
> + u64 mcg_cap = vcpu->arch.mcg_cap;
> + unsigned bank_num = mcg_cap & 0xff;
>
> switch (msr) {
> - case 0xc0010010: /* SYSCFG */
> - case 0xc0010015: /* HWCR */
> - case MSR_IA32_PLATFORM_ID:
> case MSR_IA32_P5_MC_ADDR:
> case MSR_IA32_P5_MC_TYPE:
> - case MSR_IA32_MC0_CTL:
> - case MSR_IA32_MCG_STATUS:
> + data = 0;
> + break;
> case MSR_IA32_MCG_CAP:
> + data = vcpu->arch.mcg_cap;
> + break;
> case MSR_IA32_MCG_CTL:
> - case MSR_IA32_MC0_MISC:
> - case MSR_IA32_MC0_MISC+4:
> - case MSR_IA32_MC0_MISC+8:
> - case MSR_IA32_MC0_MISC+12:
> - case MSR_IA32_MC0_MISC+16:
> - case MSR_IA32_MC0_MISC+20:
> + if (!(mcg_cap & MCG_CTL_P))
> + return 1;
> + data = vcpu->arch.mcg_ctl;
> + break;
> + case MSR_IA32_MCG_STATUS:
> + data = vcpu->arch.mcg_status;
> + break;
> + default:
> + if (msr >= MSR_IA32_MC0_CTL &&
> + msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> + u32 offset = msr - MSR_IA32_MC0_CTL;
> + data = vcpu->arch.mce_banks[offset];
> + break;
> + }
> + return 1;
> + }
> + *pdata = data;
> + return 0;
> +}
> +
> +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +{
> + u64 data;
> +
> + switch (msr) {
> + case 0xc0010010: /* SYSCFG */
> + case 0xc0010015: /* HWCR */
>
Please use MSR_ constants (add them if they don't exist yet).
> + case MSR_IA32_PLATFORM_ID:
> case MSR_IA32_UCODE_REV:
> case MSR_IA32_EBL_CR_POWERON:
> case MSR_IA32_DEBUGCTLMSR:
> @@ -921,6 +967,8 @@ int kvm_get_msr_common(struct kvm_vcpu *
> data = vcpu->arch.time;
> break;
> default:
> + if (!get_msr_mce(vcpu, msr, &data))
> + break;
> pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
> return 1;
> }
> @@ -1443,6 +1491,87 @@ static int vcpu_ioctl_tpr_access_reporti
> return 0;
> }
>
Again, why split?
>
> +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> + u64 mcg_cap)
> +{
> + int r;
> + unsigned bank_num = mcg_cap & 0xff, bank;
> + u64 *banks;
> +
> + r = -EINVAL;
> + if (!bank_num)
> + goto out;
> + r = -ENOMEM;
> + banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
>
If banks is already allocated, you'll leak it.
Why not always allocate it on vcpu setup?
> +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> + struct kvm_x86_mce *mce)
> +{
> + u64 mcg_cap = vcpu->arch.mcg_cap;
> + unsigned bank_num = mcg_cap & 0xff;
> + u64 *banks = vcpu->arch.mce_banks;
> +
> + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> + return -EINVAL;
> + /*
> + * if IA32_MCG_CTL is not all 1s, the uncorrected error
> + * reporting is disabled
> + */
> + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
> + vcpu->arch.mcg_ctl != ~(u64)0)
> + return 0;
> + banks += 4 * mce->bank;
> + /*
> + * if IA32_MCi_CTL is not all 1s, the uncorrected error
> + * reporting is disabled for the bank
> + */
> + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
> + return 0;
> + if (mce->status & MCI_STATUS_UC) {
> + u64 status = mce->status;
> + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
> + !(vcpu->arch.cr4 & X86_CR4_MCE)) {
> + printk(KERN_DEBUG "kvm: set_mce: "
> + "injects mce exception while "
> + "previous one is in progress!\n");
> + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
> + return 0;
> + }
> + if (banks[1] & MCI_STATUS_VAL)
> + status |= MCI_STATUS_OVER;
> + banks[1] = mce->status;
> + banks[2] = mce->addr;
> + banks[3] = mce->misc;
> + vcpu->arch.mcg_status = mce->mcg_status;
> + kvm_queue_exception(vcpu, MC_VECTOR);
> + } else if (!(banks[1] & MCI_STATUS_VAL) ||
> + (!(banks[1] & MCI_STATUS_UC) &&
> + !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
> + u64 status = mce->status;
> + if (banks[1] & MCI_STATUS_VAL)
> + status |= MCI_STATUS_OVER;
> + banks[1] = mce->status;
> + banks[2] = mce->addr;
> + banks[3] = mce->misc;
> + } else
> + banks[1] |= MCI_STATUS_OVER;
> + return 0;
> +}
>
Can userspace just use KVM_SET_MSR for this?
> + case KVM_X86_SETUP_MCE: {
> + u64 mcg_cap;
> +
> + r = -EFAULT;
> + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
> + goto out;
> + /*
> + * extended machine-check state registers and CMCI are
> + * not supported.
> + */
> + mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
>
Instead of silently dropping, should return an error.
> + if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
> + goto out;
>
And not copy.
> #define KVM_TRC_SHIFT 16
> /*
> * kvm trace categories
> @@ -528,6 +540,9 @@ struct kvm_irq_routing {
> #define KVM_NMI _IO(KVMIO, 0x9a)
> /* Available with KVM_CAP_SET_GUEST_DEBUG */
> #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
> +/* MCE for x86 */
> +#define KVM_X86_SETUP_MCE _IOWR(KVMIO, 0x9a, __u64)
> +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9b, struct kvm_x86_mce)
>
Please add a KVM_CAP_ to advertise this capability.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
next prev parent reply other threads:[~2009-04-09 15:51 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-04-08 1:53 [PATCH] Add MCE support to KVM Huang Ying
2009-04-09 15:50 ` Avi Kivity [this message]
2009-04-10 3:00 ` Huang Ying
2009-04-11 12:04 ` Avi Kivity
2009-04-11 12:19 ` Andi Kleen
2009-04-11 12:25 ` Avi Kivity
2009-04-13 8:26 ` Andi Kleen
2009-04-13 2:41 ` Huang Ying
2009-04-13 13:02 ` Avi Kivity
2009-04-14 2:04 ` Huang Ying
2009-04-14 10:45 ` Avi Kivity
2009-04-15 7:24 ` Huang Ying
2009-04-18 22:17 ` Anthony Liguori
2009-04-19 8:33 ` Avi Kivity
2009-04-20 7:52 ` Gerd Hoffmann
2009-04-20 8:26 ` Avi Kivity
2009-04-20 8:59 ` Gerd Hoffmann
2009-04-20 9:05 ` Avi Kivity
2009-04-20 10:04 ` Andi Kleen
2009-04-20 11:02 ` Avi Kivity
2009-04-20 11:23 ` Gerd Hoffmann
2009-04-20 11:27 ` Avi Kivity
2009-04-20 12:20 ` Gerd Hoffmann
2009-04-20 12:43 ` Avi Kivity
2009-04-20 13:24 ` Gerd Hoffmann
2009-04-20 13:45 ` Avi Kivity
2009-04-21 9:14 ` Xenner design and kvm msr handling Gerd Hoffmann
2009-04-21 10:14 ` Avi Kivity
2009-04-21 11:41 ` Gerd Hoffmann
2009-04-21 12:27 ` Avi Kivity
2009-04-21 12:47 ` Gerd Hoffmann
2009-04-21 13:33 ` Avi Kivity
2009-04-21 16:00 ` Gerd Hoffmann
2009-04-21 16:15 ` Avi Kivity
2009-04-21 16:15 ` Avi Kivity
2009-04-22 10:02 ` Gerd Hoffmann
2009-04-22 10:02 ` Gerd Hoffmann
2009-04-21 16:04 ` [PATCH] Add MCE support to KVM Anthony Liguori
2009-04-21 16:17 ` Avi Kivity
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=49DE195D.1020303@redhat.com \
--to=avi@redhat.com \
--cc=andi@firstfloor.org \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ying.huang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.