* [PATCH] KVM: x86: Add support for CMCI and UCNA.
@ 2022-03-23 18:28 Jue Wang
2022-04-10 16:56 ` Jue Wang
2022-04-11 19:08 ` Sean Christopherson
0 siblings, 2 replies; 10+ messages in thread
From: Jue Wang @ 2022-03-23 18:28 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: kvm, Jue Wang
CMCI is supported since Nehalem. UCNA (uncorrectable no action required)
errors signaled via CMCI allow a guest to be notified as soon as
uncorrectable memory errors get detected by some background threads,
e.g., threads that migrate guest memory across hosts.
Upon receiving UCNAs, guest kernel isolates the poisoned pages from
future accesses much earlier than a potentially fatal Machine Check
Exception due to accesses from a guest thread.
Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.
Signed-off-by: Jue Wang <juew@google.com>
---
arch/x86/include/asm/kvm_host.h | 11 +++
arch/x86/kvm/lapic.c | 65 ++++++++++++++----
arch/x86/kvm/lapic.h | 2 +-
arch/x86/kvm/vmx/vmx.c | 1 +
arch/x86/kvm/x86.c | 115 +++++++++++++++++++++++++++++---
5 files changed, 171 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ec9830d2aabf..d57f3d1284a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
unsigned long evtchn_pending_sel;
};
+#define KVM_MCA_REG_PER_BANK 5
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
u64 mcg_status;
u64 mcg_ctl;
u64 mcg_ext_ctl;
+ /*
+ * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
+ * Register order within each bank:
+ * mce_banks[5 * bank] - IA32_MCi_CTL
+ * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
+ * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
+ * mce_banks[5 * bank + 3] - IA32_MCi_MISC
+ * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
+ */
u64 *mce_banks;
/* Cache MMIO info */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9322e6340a74..b388eb82308a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/math64.h>
#include <linux/slab.h>
#include <asm/processor.h>
+#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -53,8 +54,6 @@
#define PRIu64 "u"
#define PRIo64 "o"
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
#define LAPIC_MMIO_LENGTH (1 << 12)
/* followed define is not in apicdef.h */
#define MAX_APIC_VECTOR 256
@@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 v = APIC_VERSION;
+ int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
+ KVM_APIC_LVT_NUM - 1;
+ /* 14 is the version for Xeon and Pentium 8.4.8*/
+ u32 v = 0x14UL | ((lvt_num - 1) << 16);
if (!lapic_in_kernel(vcpu))
return;
@@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
LVT_MASK | APIC_MODE_MASK, /* LVTPC */
LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
+ LVT_MASK, /* LVTERR */
+ LVT_MASK | APIC_MODE_MASK /* LVTCMCI */
};
static int find_highest_vector(void *bitmap)
@@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REG_MASK(APIC_TMCCT) |
APIC_REG_MASK(APIC_TDCR);
+ if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
+ valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
/* ARBPRI is not valid on x2APIC */
if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
@@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
+static int kvm_lvt_reg_by_index(int i)
+{
+ if (i < 0 || i >= KVM_APIC_LVT_NUM) {
+ pr_warn("lvt register index out of bound: %i\n", i);
+ return 0;
+ }
+
+ if (i < KVM_APIC_LVT_NUM - 1)
+ return APIC_LVTT + 0x10 * i;
+ return APIC_LVTCMCI;
+}
+
int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2038,12 +2056,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
-
- for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
- lvt_val = kvm_lapic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
+ int lvt_reg;
+ int lvt_num = apic->vcpu->arch.mcg_cap & MCG_CMCI_P ?
+ KVM_APIC_LVT_NUM : KVM_APIC_LVT_NUM - 1;
+
+ for (i = 0; i < lvt_num; i++) {
+ lvt_reg = kvm_lvt_reg_by_index(i);
+ if (lvt_reg) {
+ lvt_val = kvm_lapic_get_reg(apic, lvt_reg);
+ kvm_lapic_set_reg(apic, lvt_reg,
+ lvt_val | APIC_LVT_MASKED);
+ }
}
apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2093,6 +2116,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
apic_update_lvtt(apic);
break;
+ case APIC_LVTCMCI:
+ if (!(apic->vcpu->arch.mcg_cap & MCG_CMCI_P)) {
+ ret = 1;
+ break;
+ }
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
+ kvm_lapic_set_reg(apic, APIC_LVTCMCI, val);
+ break;
+
case APIC_TMICT:
if (apic_lvtt_tscdeadline(apic))
break;
@@ -2322,6 +2356,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 msr_val;
int i;
+ int lvt_num;
if (!init_event) {
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
@@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
- for (i = 0; i < KVM_APIC_LVT_NUM; i++)
- kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
+ KVM_APIC_LVT_NUM - 1;
+ for (i = 0; i < lvt_num; i++) {
+ int lvt_reg = kvm_lvt_reg_by_index(i);
+
+ if (lvt_reg)
+ kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
+ }
apic_update_lvtt(apic);
if (kvm_vcpu_is_reset_bsp(vcpu) &&
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2b44e533fc8d..e2ae097613ca 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -10,7 +10,7 @@
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
-#define KVM_APIC_LVT_NUM 6
+#define KVM_APIC_LVT_NUM 7
#define APIC_SHORT_MASK 0xc0000
#define APIC_DEST_NOSHORT 0x0
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b730d799c26e..63aa2b3d30ca 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
}
kvm_mce_cap_supported |= MCG_LMCE_P;
+ kvm_mce_cap_supported |= MCG_CMCI_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eb4029660bd9..6626723bf51b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.mcg_ctl = data;
break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ {
+ u32 offset;
+ /* BIT[30] - CMCI_ENABLE */
+ /* BIT[0:14] - CMCI_THRESHOLD */
+ u64 mask = (1 << 30) | 0x7fff;
+
+ if (!(mcg_cap & MCG_CMCI_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ /* An attempt to write a 1 to a reserved bit raises #GP*/
+ if (data & ~mask)
+ return 1;
+ offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL2,
+ MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
+ vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
+ }
+ break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
@@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return -1;
}
- vcpu->arch.mce_banks[offset] = data;
+ /* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
+ * per bank.
+ * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
+ * register layout details in kvm_host.h.
+ * MSR_IA32_MCi_CTL is the first register in each bank
+ * within kvm_vcpu_arch.mce_banks.
+ */
+ vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
break;
}
return 1;
@@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case 0x200 ... 0x2ff:
+ case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+ case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
@@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
@@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
case MSR_IA32_MCG_STATUS:
data = vcpu->arch.mcg_status;
break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ {
+ u32 offset;
+
+ if (!(mcg_cap & MCG_CMCI_P) && !host)
+ return 1;
+ offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL2,
+ MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
+ data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
+ }
+ break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
@@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
msr - MSR_IA32_MC0_CTL,
MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
- data = vcpu->arch.mce_banks[offset];
+ data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
break;
}
return 1;
@@ -3873,7 +3913,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
}
case MSR_MTRRcap:
- case 0x200 ... 0x2ff:
+ case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+ case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
msr_info->data = 3;
@@ -3989,6 +4030,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
case MSR_IA32_XSS:
@@ -4740,15 +4782,64 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
/* Init IA32_MCG_CTL to all 1s */
if (mcg_cap & MCG_CTL_P)
vcpu->arch.mcg_ctl = ~(u64)0;
- /* Init IA32_MCi_CTL to all 1s */
- for (bank = 0; bank < bank_num; bank++)
- vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+ for (bank = 0; bank < bank_num; bank++) {
+ vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK] = ~(u64)0;
+ if (mcg_cap & MCG_CMCI_P)
+ vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK + 4] = 0;
+ }
static_call(kvm_x86_setup_mce)(vcpu);
out:
return r;
}
+static bool is_ucna(u64 mcg_status, u64 mci_status)
+{
+ return !mcg_status &&
+ !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
+}
+
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned int bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ /* Check for legal bank number in guest */
+ if (mce->bank >= bank_num)
+ return -EINVAL;
+
+ /* Disallow bits that are used for machine check signalling */
+ if (mce->mcg_status ||
+ (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
+ return -EINVAL;
+
+ /* UCNA must have VAL and UC bits set */
+ if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
+ (MCI_STATUS_VAL|MCI_STATUS_UC))
+ return -EINVAL;
+
+ banks += KVM_MCA_REG_PER_BANK * mce->bank;
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+
+ /*
+ * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
+ * is disabled for the bank
+ */
+ if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
+ return 0;
+
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+ return 0;
+}
+
static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
struct kvm_x86_mce *mce)
{
@@ -4758,14 +4849,18 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
return -EINVAL;
- /*
+
+ if (is_ucna(mce->mcg_status, mce->status))
+ return kvm_vcpu_x86_set_ucna(vcpu, mce);
+
+ /*
* if IA32_MCG_CTL is not all 1s, the uncorrected error
* reporting is disabled
*/
if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
vcpu->arch.mcg_ctl != ~(u64)0)
return 0;
- banks += 4 * mce->bank;
+ banks += KVM_MCA_REG_PER_BANK * mce->bank;
/*
* if IA32_MCi_CTL is not all 1s, the uncorrected error
* reporting is disabled for the bank
@@ -11126,7 +11221,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * KVM_MCA_REG_PER_BANK,
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks)
goto fail_free_pio_data;
--
2.35.1.1021.g381101b075-goog
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-03-23 18:28 [PATCH] KVM: x86: Add support for CMCI and UCNA Jue Wang
@ 2022-04-10 16:56 ` Jue Wang
2022-04-11 19:08 ` Sean Christopherson
1 sibling, 0 replies; 10+ messages in thread
From: Jue Wang @ 2022-04-10 16:56 UTC (permalink / raw)
To: Sean Christopherson, vkuznets, wanpengli, joro, Jim Mattson
Cc: kvm, Paolo Bonzini
Actually sending to KVM/x86 maintainers.
Thanks,
-Jue
On Wed, Mar 23, 2022 at 11:28 AM Jue Wang <juew@google.com> wrote:
>
> CMCI is supported since Nehalem. UCNA (uncorrectable no action required)
> errors signaled via CMCI allows a guest to be notified as soon as
> uncorrectable memory errors get detected by some background threads,
> e.g., threads that migrate guest memory across hosts.
>
> Upon receiving UCNAs, guest kernel isolates the poisoned pages from
> future accesses much earlier than a potential fatal Machine Check
> Exception due to accesses from a guest thread.
>
> Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.
>
> Signed-off-by: Jue Wang <juew@google.com>
> ---
> arch/x86/include/asm/kvm_host.h | 11 +++
> arch/x86/kvm/lapic.c | 65 ++++++++++++++----
> arch/x86/kvm/lapic.h | 2 +-
> arch/x86/kvm/vmx/vmx.c | 1 +
> arch/x86/kvm/x86.c | 115 +++++++++++++++++++++++++++++---
> 5 files changed, 171 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ec9830d2aabf..d57f3d1284a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
> unsigned long evtchn_pending_sel;
> };
>
> +#define KVM_MCA_REG_PER_BANK 5
> +
> struct kvm_vcpu_arch {
> /*
> * rip and regs accesses must go through
> @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
> u64 mcg_status;
> u64 mcg_ctl;
> u64 mcg_ext_ctl;
> + /*
> + * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> + * Register order within each bank:
> + * mce_banks[5 * bank] - IA32_MCi_CTL
> + * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> + * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> + * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> + * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> + */
> u64 *mce_banks;
>
> /* Cache MMIO info */
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9322e6340a74..b388eb82308a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -27,6 +27,7 @@
> #include <linux/math64.h>
> #include <linux/slab.h>
> #include <asm/processor.h>
> +#include <asm/mce.h>
> #include <asm/msr.h>
> #include <asm/page.h>
> #include <asm/current.h>
> @@ -53,8 +54,6 @@
> #define PRIu64 "u"
> #define PRIo64 "o"
>
> -/* 14 is the version for Xeon and Pentium 8.4.8*/
> -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
> #define LAPIC_MMIO_LENGTH (1 << 12)
> /* followed define is not in apicdef.h */
> #define MAX_APIC_VECTOR 256
> @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
> - u32 v = APIC_VERSION;
> + int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> + KVM_APIC_LVT_NUM - 1;
> + /* 14 is the version for Xeon and Pentium 8.4.8*/
> + u32 v = 0x14UL | ((lvt_num - 1) << 16);
>
> if (!lapic_in_kernel(vcpu))
> return;
> @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> LVT_MASK | APIC_MODE_MASK, /* LVTPC */
> LINT_MASK, LINT_MASK, /* LVT0-1 */
> - LVT_MASK /* LVTERR */
> + LVT_MASK, /* LVTERR */
> + LVT_MASK | APIC_MODE_MASK /* LVTCMCI */
> };
>
> static int find_highest_vector(void *bitmap)
> @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> APIC_REG_MASK(APIC_TMCCT) |
> APIC_REG_MASK(APIC_TDCR);
>
> + if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
> + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> +
> /* ARBPRI is not valid on x2APIC */
> if (!apic_x2apic_mode(apic))
> valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
> }
> }
>
> +static int kvm_lvt_reg_by_index(int i)
> +{
> + if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> + pr_warn("lvt register index out of bound: %i\n", i);
> + return 0;
> + }
> +
> + if (i < KVM_APIC_LVT_NUM - 1)
> + return APIC_LVTT + 0x10 * i;
> + return APIC_LVTCMCI;
> +}
> +
> int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> {
> int ret = 0;
> @@ -2038,12 +2056,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> if (!(val & APIC_SPIV_APIC_ENABLED)) {
> int i;
> u32 lvt_val;
> -
> - for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
> - lvt_val = kvm_lapic_get_reg(apic,
> - APIC_LVTT + 0x10 * i);
> - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
> - lvt_val | APIC_LVT_MASKED);
> + int lvt_reg;
> + int lvt_num = apic->vcpu->arch.mcg_cap & MCG_CMCI_P ?
> + KVM_APIC_LVT_NUM : KVM_APIC_LVT_NUM - 1;
> +
> + for (i = 0; i < lvt_num; i++) {
> + lvt_reg = kvm_lvt_reg_by_index(i);
> + if (lvt_reg) {
> + lvt_val = kvm_lapic_get_reg(apic, lvt_reg);
> + kvm_lapic_set_reg(apic, lvt_reg,
> + lvt_val | APIC_LVT_MASKED);
> + }
> }
> apic_update_lvtt(apic);
> atomic_set(&apic->lapic_timer.pending, 0);
> @@ -2093,6 +2116,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
> apic_update_lvtt(apic);
> break;
>
> + case APIC_LVTCMCI:
> + if (!(apic->vcpu->arch.mcg_cap & MCG_CMCI_P)) {
> + ret = 1;
> + break;
> + }
> + if (!kvm_apic_sw_enabled(apic))
> + val |= APIC_LVT_MASKED;
> + val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
> + kvm_lapic_set_reg(apic, APIC_LVTCMCI, val);
> + break;
> +
> case APIC_TMICT:
> if (apic_lvtt_tscdeadline(apic))
> break;
> @@ -2322,6 +2356,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> struct kvm_lapic *apic = vcpu->arch.apic;
> u64 msr_val;
> int i;
> + int lvt_num;
>
> if (!init_event) {
> msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
> @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
> kvm_apic_set_version(apic->vcpu);
>
> - for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> + lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> + KVM_APIC_LVT_NUM - 1;
> + for (i = 0; i < lvt_num; i++) {
> + int lvt_reg = kvm_lvt_reg_by_index(i);
> +
> + if (lvt_reg)
> + kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> + }
> apic_update_lvtt(apic);
> if (kvm_vcpu_is_reset_bsp(vcpu) &&
> kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2b44e533fc8d..e2ae097613ca 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -10,7 +10,7 @@
>
> #define KVM_APIC_INIT 0
> #define KVM_APIC_SIPI 1
> -#define KVM_APIC_LVT_NUM 6
> +#define KVM_APIC_LVT_NUM 7
>
> #define APIC_SHORT_MASK 0xc0000
> #define APIC_DEST_NOSHORT 0x0
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b730d799c26e..63aa2b3d30ca 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
> }
>
> kvm_mce_cap_supported |= MCG_LMCE_P;
> + kvm_mce_cap_supported |= MCG_CMCI_P;
>
> if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
> return -EINVAL;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eb4029660bd9..6626723bf51b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 1;
> vcpu->arch.mcg_ctl = data;
> break;
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> + {
> + u32 offset;
> + /* BIT[30] - CMCI_ENABLE */
> + /* BIT[0:14] - CMCI_THRESHOLD */
> + u64 mask = (1 << 30) | 0x7fff;
> +
> + if (!(mcg_cap & MCG_CMCI_P) &&
> + (data || !msr_info->host_initiated))
> + return 1;
> + /* An attempt to write a 1 to a reserved bit raises #GP*/
> + if (data & ~mask)
> + return 1;
> + offset = array_index_nospec(
> + msr - MSR_IA32_MC0_CTL2,
> + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
> + }
> + break;
> default:
> if (msr >= MSR_IA32_MC0_CTL &&
> msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return -1;
> }
>
> - vcpu->arch.mce_banks[offset] = data;
> + /* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
> + * per bank.
> + * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> + * register layout details in kvm_host.h.
> + * MSR_IA32_MCi_CTL is the first register in each bank
> + * within kvm_vcpu_arch.mce_banks.
> + */
> + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
> break;
> }
> return 1;
> @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 1;
> }
> break;
> - case 0x200 ... 0x2ff:
> + case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> return kvm_mtrr_set_msr(vcpu, msr, data);
> case MSR_IA32_APICBASE:
> return kvm_set_apic_base(vcpu, msr_info);
> @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_MCG_CTL:
> case MSR_IA32_MCG_STATUS:
> case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> return set_msr_mce(vcpu, msr_info);
>
> case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> case MSR_IA32_MCG_STATUS:
> data = vcpu->arch.mcg_status;
> break;
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> + {
> + u32 offset;
> +
> + if (!(mcg_cap & MCG_CMCI_P) && !host)
> + return 1;
> + offset = array_index_nospec(
> + msr - MSR_IA32_MC0_CTL2,
> + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
> + }
> + break;
> default:
> if (msr >= MSR_IA32_MC0_CTL &&
> msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> msr - MSR_IA32_MC0_CTL,
> MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>
> - data = vcpu->arch.mce_banks[offset];
> + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
> break;
> }
> return 1;
> @@ -3873,7 +3913,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> break;
> }
> case MSR_MTRRcap:
> - case 0x200 ... 0x2ff:
> + case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
> case 0xcd: /* fsb frequency */
> msr_info->data = 3;
> @@ -3989,6 +4030,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_MCG_CTL:
> case MSR_IA32_MCG_STATUS:
> case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
> msr_info->host_initiated);
> case MSR_IA32_XSS:
> @@ -4740,15 +4782,64 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> /* Init IA32_MCG_CTL to all 1s */
> if (mcg_cap & MCG_CTL_P)
> vcpu->arch.mcg_ctl = ~(u64)0;
> - /* Init IA32_MCi_CTL to all 1s */
> - for (bank = 0; bank < bank_num; bank++)
> - vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
> + for (bank = 0; bank < bank_num; bank++) {
> + vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK] = ~(u64)0;
> + if (mcg_cap & MCG_CMCI_P)
> + vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK + 4] = 0;
> + }
>
> static_call(kvm_x86_setup_mce)(vcpu);
> out:
> return r;
> }
>
> +static bool is_ucna(u64 mcg_status, u64 mci_status)
> +{
> + return !mcg_status &&
> + !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
> +}
> +
> +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> + struct kvm_x86_mce *mce)
> +{
> + u64 mcg_cap = vcpu->arch.mcg_cap;
> + unsigned int bank_num = mcg_cap & 0xff;
> + u64 *banks = vcpu->arch.mce_banks;
> +
> + /* Check for legal bank number in guest */
> + if (mce->bank >= bank_num)
> + return -EINVAL;
> +
> + /* Disallow bits that are used for machine check signalling */
> + if (mce->mcg_status ||
> + (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> + return -EINVAL;
> +
> + /* UCNA must have VAL and UC bits set */
> + if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> + (MCI_STATUS_VAL|MCI_STATUS_UC))
> + return -EINVAL;
> +
> + banks += KVM_MCA_REG_PER_BANK * mce->bank;
> + banks[1] = mce->status;
> + banks[2] = mce->addr;
> + banks[3] = mce->misc;
> + vcpu->arch.mcg_status = mce->mcg_status;
> +
> + /*
> + * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
> + * is disabled for the bank
> + */
> + if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
> + return 0;
> +
> + if (lapic_in_kernel(vcpu))
> + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> +
> + return 0;
> +}
> +
> static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> struct kvm_x86_mce *mce)
> {
> @@ -4758,14 +4849,18 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>
> if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> return -EINVAL;
> - /*
> +
> + if (is_ucna(mce->mcg_status, mce->status))
> + return kvm_vcpu_x86_set_ucna(vcpu, mce);
> +
> + /*
> * if IA32_MCG_CTL is not all 1s, the uncorrected error
> * reporting is disabled
> */
> if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
> vcpu->arch.mcg_ctl != ~(u64)0)
> return 0;
> - banks += 4 * mce->bank;
> + banks += KVM_MCA_REG_PER_BANK * mce->bank;
> /*
> * if IA32_MCi_CTL is not all 1s, the uncorrected error
> * reporting is disabled for the bank
> @@ -11126,7 +11221,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> goto fail_free_lapic;
> vcpu->arch.pio_data = page_address(page);
>
> - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
> + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * KVM_MCA_REG_PER_BANK,
> GFP_KERNEL_ACCOUNT);
> if (!vcpu->arch.mce_banks)
> goto fail_free_pio_data;
> --
> 2.35.1.1021.g381101b075-goog
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-03-23 18:28 [PATCH] KVM: x86: Add support for CMCI and UCNA Jue Wang
2022-04-10 16:56 ` Jue Wang
@ 2022-04-11 19:08 ` Sean Christopherson
2022-04-12 14:22 ` Jue Wang
2022-04-12 17:44 ` Paolo Bonzini
1 sibling, 2 replies; 10+ messages in thread
From: Sean Christopherson @ 2022-04-11 19:08 UTC (permalink / raw)
To: Jue Wang; +Cc: Paolo Bonzini, kvm
On Wed, Mar 23, 2022, Jue Wang wrote:
> CMCI
Please write Corrected Machine Check Interrupt at least once.
> is supported since Nehalem.
While possibly interesting, this info is definitely not the most interesting tidbit
in the changelog, i.e. shouldn't be the opening line.
> UCNA (uncorrectable no action required) errors signaled via CMCI allows a
> guest to be notified as soon as uncorrectable memory errors get detected by
> some background threads, e.g., threads that migrate guest memory across
> hosts.
>
> Upon receiving UCNAs, guest kernel isolates the poisoned pages from
> future accesses much earlier than a potential fatal Machine Check
> Exception due to accesses from a guest thread.
>
> Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.
This changelog needs a much longer explanation of what exactly is being added, e.g. I
had to read the code to find out that this introduces new userspace functionality
to allow injecting UCNA #MCs and thus CMCI IRQs.
That's also a symptom of this patch needing to be split into a proper series, e.g.
exposing the UCNA injection point to userspace needs to be a separate patch.
Looking through this, 5 or 6 patches is probably appropriate:
1. Replace existing magic numbers with #defines
2. Clean up the existing LVT mess
3. Add in-kernel LVTCMCI support (unreachable until #5, but easier to review)
4. Add support for MSR_IA32_MCx_CTL2 MSRs
5. Add CMCI support
6. Add UCNA injection support
I can't tell if #4 is necessary as a separate patch, it might belong with #3.
> Signed-off-by: Jue Wang <juew@google.com>
> ---
> arch/x86/include/asm/kvm_host.h | 11 +++
> arch/x86/kvm/lapic.c | 65 ++++++++++++++----
> arch/x86/kvm/lapic.h | 2 +-
> arch/x86/kvm/vmx/vmx.c | 1 +
> arch/x86/kvm/x86.c | 115 +++++++++++++++++++++++++++++---
> 5 files changed, 171 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ec9830d2aabf..d57f3d1284a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
> unsigned long evtchn_pending_sel;
> };
>
> +#define KVM_MCA_REG_PER_BANK 5
> +
> struct kvm_vcpu_arch {
> /*
> * rip and regs accesses must go through
> @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
> u64 mcg_status;
> u64 mcg_ctl;
> u64 mcg_ext_ctl;
> + /*
> + * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> + * Register order within each bank:
> + * mce_banks[5 * bank] - IA32_MCi_CTL
> + * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> + * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> + * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> + * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> + */
> u64 *mce_banks;
Why shove CTL2 into mce_banks? AFAICT, it just makes everything harder. Adding
a new "u64 *mce_ctl2_banks" or whatever would simplify everything except the
memory allocation.
> /* Cache MMIO info */
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9322e6340a74..b388eb82308a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -27,6 +27,7 @@
> #include <linux/math64.h>
> #include <linux/slab.h>
> #include <asm/processor.h>
> +#include <asm/mce.h>
> #include <asm/msr.h>
> #include <asm/page.h>
> #include <asm/current.h>
> @@ -53,8 +54,6 @@
> #define PRIu64 "u"
> #define PRIo64 "o"
>
> -/* 14 is the version for Xeon and Pentium 8.4.8*/
> -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
Eh, probably worth keeping the APIC_VERSION #define and just move out the LVT crud.
> #define LAPIC_MMIO_LENGTH (1 << 12)
> /* followed define is not in apicdef.h */
> #define MAX_APIC_VECTOR 256
> @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
> - u32 v = APIC_VERSION;
> + int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> + KVM_APIC_LVT_NUM - 1;
Retrieving the number of LVT entries belongs in a helper, and this is rather gross
abuse of KVM_APIC_LVT_NUM as there's zero indication that it's pseudo-dynamic.
The code that handles accesses to APIC_LVTCMCI is even worse:
val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
The easiest and best way to handle this is to define an enum, especially since
the LVT entries aren't exactly intuitive (e.g. LVT_LINT0 isn't entry 0)
enum lapic_lvt_entry {
LVT_TIMER,
LVT_THERMAL_MONITOR,
LVT_PERFORMANCE_COUNTER,
LVT_LINT0,
LVT_LINT1,
LVT_ERROR,
LVT_CMCI,
KVM_APIC_MAX_NR_LVT_ENTRIES,
};
And use those for the initialization of apic_lvt_mask[] and drop the comments:
static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
[LVT_TIMER] = LVT_MASK , /* timer mode mask added at runtime */
[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
and so on and so forth
};
Then there's no need for the ugly KVM_APIC_LVT_NUM - 1 shenanigans to access the
CMCI entry, and the only place that needs to be aware at all is the helper to
query the number of LVT entries. Heh, and if we wanted to be clever/stupid...
static inline kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
{
return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
}
> + /* 14 is the version for Xeon and Pentium 8.4.8*/
> + u32 v = 0x14UL | ((lvt_num - 1) << 16);
>
> if (!lapic_in_kernel(vcpu))
> return;
> @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> LVT_MASK | APIC_MODE_MASK, /* LVTPC */
> LINT_MASK, LINT_MASK, /* LVT0-1 */
> - LVT_MASK /* LVTERR */
> + LVT_MASK, /* LVTERR */
> + LVT_MASK | APIC_MODE_MASK /* LVTCMCI */
> };
>
> static int find_highest_vector(void *bitmap)
> @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> APIC_REG_MASK(APIC_TMCCT) |
> APIC_REG_MASK(APIC_TDCR);
>
> + if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
As alluded to above, this belongs in a helper too.
> + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> +
> /* ARBPRI is not valid on x2APIC */
> if (!apic_x2apic_mode(apic))
> valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
> }
> }
>
> +static int kvm_lvt_reg_by_index(int i)
> +{
> + if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> + pr_warn("lvt register index out of bound: %i\n", i);
This sanity check is unnecessary, @i is fully KVM controlled. And a pr_warn() in
that case is nowhere near strong enough, e.g. at minimum this should be WARN_ON,
though again, I don't think that's necessary.
Actually, if we really wanted to sanity check @i, we could make this __always_inline
and turn it into a BUILD_BUG_ON(), though I bet there's a config option that will
result in the compiler not unrolling the callers and ruining that idea.
> + return 0;
> + }
> +
> + if (i < KVM_APIC_LVT_NUM - 1)
Far better is
if (i == LVT_CMCI)
return APIC_LVTCMCI;
return APIC_LVTT + 0x10 * i;
Though given the nature of the usage, it might actually be better to bury this in
a macro (or a helper function masquerading as a macro by having a weird name).
#define APIC_LVTx(x) \
({ \
int __apic_reg; \
\
if ((x) == LVT_CMCI) \
__apic_reg = APIC_LVTCMCI; \
else \
__apic_reg = APIC_LVTT + 0x10 * (x); \
__apic_reg; \
})
Then the usage stays quite readable and doesn't need temp variables, e.g.
for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
kvm_lapic_set_reg(apic, APIC_LVTx(i),
lvt_val | APIC_LVT_MASKED);
}
> + return APIC_LVTT + 0x10 * i;
> + return APIC_LVTCMCI;
> +}
> +
...
> @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
> kvm_apic_set_version(apic->vcpu);
>
> - for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> + lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> + KVM_APIC_LVT_NUM - 1;
> + for (i = 0; i < lvt_num; i++) {
> + int lvt_reg = kvm_lvt_reg_by_index(i);
> +
> + if (lvt_reg)
> + kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> + }
> apic_update_lvtt(apic);
> if (kvm_vcpu_is_reset_bsp(vcpu) &&
> kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2b44e533fc8d..e2ae097613ca 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -10,7 +10,7 @@
>
> #define KVM_APIC_INIT 0
> #define KVM_APIC_SIPI 1
> -#define KVM_APIC_LVT_NUM 6
> +#define KVM_APIC_LVT_NUM 7
>
> #define APIC_SHORT_MASK 0xc0000
> #define APIC_DEST_NOSHORT 0x0
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b730d799c26e..63aa2b3d30ca 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
> }
>
> kvm_mce_cap_supported |= MCG_LMCE_P;
> + kvm_mce_cap_supported |= MCG_CMCI_P;
>
> if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
> return -EINVAL;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eb4029660bd9..6626723bf51b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 1;
> vcpu->arch.mcg_ctl = data;
> break;
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> + {
Kernel style when curly braces is needed for a case statement is to put the opening
braces with the case and not indent. Though my vote is to hoist "offset" to be
declared at the function level so that each statement doesn't need curly braces
just to define a fairly common variable.
> + u32 offset;
> + /* BIT[30] - CMCI_ENABLE */
> + /* BIT[0:14] - CMCI_THRESHOLD */
> + u64 mask = (1 << 30) | 0x7fff;
Add proper #defines, not comments.
> +
> + if (!(mcg_cap & MCG_CMCI_P) &&
> + (data || !msr_info->host_initiated))
This looks wrong, userspace should either be able to write the MSR or not, '0'
isn't special. Unless there's a danger to KVM, which I don't think there is,
userspace should be allowed to ignore architectural restrictions, i.e. bypass
the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
ioctls. I.e. this should be:
if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
return 1;
> + return 1;
> + /* An attempt to write a 1 to a reserved bit raises #GP*/
> + if (data & ~mask)
> + return 1;
> + offset = array_index_nospec(
> + msr - MSR_IA32_MC0_CTL2,
> + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
The existing code is gross, don't copy it :-) I'd rather this run over the 80 char
soft limit.
> + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
The AND with the mask is pointless, @data has already been verified.
With all of the above, this becomes:
case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
return 1;
if (data & ~(MCx_CTL2_CMCI_ENABLE | MCx_CTL2_CMCI_THRESHOLD))
return 1;
offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
vcpu->arch.mce_ctl2_banks[offset] = data;
break;
> + }
> + break;
> default:
> if (msr >= MSR_IA32_MC0_CTL &&
> msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return -1;
> }
>
> - vcpu->arch.mce_banks[offset] = data;
> + /* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
> + * per bank.
> + * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> + * register layout details in kvm_host.h.
> + * MSR_IA32_MCi_CTL is the first register in each bank
> + * within kvm_vcpu_arch.mce_banks.
> + */
> + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
This mess goes away if CTL2 gets a separate array.
> break;
> }
> return 1;
> @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 1;
> }
> break;
> - case 0x200 ... 0x2ff:
> + case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> return kvm_mtrr_set_msr(vcpu, msr, data);
> case MSR_IA32_APICBASE:
> return kvm_set_apic_base(vcpu, msr_info);
> @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_IA32_MCG_CTL:
> case MSR_IA32_MCG_STATUS:
> case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> return set_msr_mce(vcpu, msr_info);
>
> case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> case MSR_IA32_MCG_STATUS:
> data = vcpu->arch.mcg_status;
> break;
> + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> + {
> + u32 offset;
> +
> + if (!(mcg_cap & MCG_CMCI_P) && !host)
> + return 1;
> + offset = array_index_nospec(
> + msr - MSR_IA32_MC0_CTL2,
> + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
Same comments as the write path.
> + }
> + break;
> default:
> if (msr >= MSR_IA32_MC0_CTL &&
> msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> msr - MSR_IA32_MC0_CTL,
> MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>
> - data = vcpu->arch.mce_banks[offset];
> + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
> break;
> }
> return 1;
...
> +static bool is_ucna(u64 mcg_status, u64 mci_status)
> +{
> + return !mcg_status &&
> + !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
Spaces around the '|'.
> +}
> +
> +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> + struct kvm_x86_mce *mce)
> +{
> + u64 mcg_cap = vcpu->arch.mcg_cap;
> + unsigned int bank_num = mcg_cap & 0xff;
> + u64 *banks = vcpu->arch.mce_banks;
> +
> + /* Check for legal bank number in guest */
> + if (mce->bank >= bank_num)
> + return -EINVAL;
> +
> + /* Disallow bits that are used for machine check signalling */
This needs a more verbose comment/explanation. I can kinda sorta piece things
together, but the intent is unclear.
> + if (mce->mcg_status ||
> + (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> + return -EINVAL;
> +
> + /* UCNA must have VAL and UC bits set */
> + if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> + (MCI_STATUS_VAL|MCI_STATUS_UC))
Spaces again, though my personal preference would be:
if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
> + return -EINVAL;
> +
> + banks += KVM_MCA_REG_PER_BANK * mce->bank;
> + banks[1] = mce->status;
> + banks[2] = mce->addr;
> + banks[3] = mce->misc;
> + vcpu->arch.mcg_status = mce->mcg_status;
> +
> + /*
> + * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
> + * is disabled for the bank
> + */
> + if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
#defines, not comments for the BIT 30 thing.
> + return 0;
> +
> + if (lapic_in_kernel(vcpu))
Any reason to support UCNA injection without an in-kernel APIC?
> + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> +
> + return 0;
> +}
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-11 19:08 ` Sean Christopherson
@ 2022-04-12 14:22 ` Jue Wang
2022-04-12 14:29 ` Jue Wang
2022-04-12 16:54 ` Sean Christopherson
2022-04-12 17:44 ` Paolo Bonzini
1 sibling, 2 replies; 10+ messages in thread
From: Jue Wang @ 2022-04-12 14:22 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Paolo Bonzini, kvm
Thanks a lot for the review feedback, Sean!
I will break the current patch into a multi-patch series and
incorporate this feedback in the V2 to be sent out.
Thanks,
-Jue
On Mon, Apr 11, 2022 at 12:08 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Mar 23, 2022, Jue Wang wrote:
> > CMCI
>
> Please write Corrected Machine Check Interrupt at least once.
>
> > is supported since Nehalem.
>
> While possibly interesting, this info is definitely not the most interesting tidbit
> in the changelog, i.e. shouldn't be the opening line.
>
> > UCNA (uncorrectable no action required) errors signaled via CMCI allows a
> > guest to be notified as soon as uncorrectable memory errors get detected by
> > some background threads, e.g., threads that migrate guest memory across
> > hosts.
> >
> > Upon receiving UCNAs, guest kernel isolates the poisoned pages from
> > future accesses much earlier than a potential fatal Machine Check
> > Exception due to accesses from a guest thread.
> >
> > Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.
>
> This changelog needs much longer explanation of what exactly is being added, e.g. I
> had to read the code to find out that this introduces new userspace functionality
> to allow injecting UCNA #MCs and thus CMCI IRQs.
>
> That's also a symptom of this patch needing to be split into a proper series, e.g.
> exposing the UCNA injection point to userspace needs to be a separate patch.
>
> Looking through this, 5 or 6 patches is probably appropriate:
>
> 1. Replace existing magic numbers with #defines
> 2. Clean up the existing LVT mess
> 3. Add in-kernel LVTCMCI support (unreachable until #, but easier to review)
> 4. Add support for MSR_IA32_MCx_CTL2 MSRs
> 5. Add CMCI support
> 6. Add UCNA injection support
>
> I can't tell if #4 is necessary as a separate patch, it might belong with #3.
>
> > Signed-off-by: Jue Wang <juew@google.com>
> > ---
> > arch/x86/include/asm/kvm_host.h | 11 +++
> > arch/x86/kvm/lapic.c | 65 ++++++++++++++----
> > arch/x86/kvm/lapic.h | 2 +-
> > arch/x86/kvm/vmx/vmx.c | 1 +
> > arch/x86/kvm/x86.c | 115 +++++++++++++++++++++++++++++---
> > 5 files changed, 171 insertions(+), 23 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index ec9830d2aabf..d57f3d1284a3 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
> > unsigned long evtchn_pending_sel;
> > };
> >
> > +#define KVM_MCA_REG_PER_BANK 5
> > +
> > struct kvm_vcpu_arch {
> > /*
> > * rip and regs accesses must go through
> > @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
> > u64 mcg_status;
> > u64 mcg_ctl;
> > u64 mcg_ext_ctl;
> > + /*
> > + * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> > + * Register order within each bank:
> > + * mce_banks[5 * bank] - IA32_MCi_CTL
> > + * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> > + * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> > + * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> > + * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> > + */
> > u64 *mce_banks;
>
> Why shove CTL2 into mce_banks? AFAICT, it just makes everything harder. Adding
> a new "u64 *mce_ctl2_banks" or whatever would simplify everything except the
> memory allocation.
>
> > /* Cache MMIO info */
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 9322e6340a74..b388eb82308a 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -27,6 +27,7 @@
> > #include <linux/math64.h>
> > #include <linux/slab.h>
> > #include <asm/processor.h>
> > +#include <asm/mce.h>
> > #include <asm/msr.h>
> > #include <asm/page.h>
> > #include <asm/current.h>
> > @@ -53,8 +54,6 @@
> > #define PRIu64 "u"
> > #define PRIo64 "o"
> >
> > -/* 14 is the version for Xeon and Pentium 8.4.8*/
> > -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
>
> Eh, probably worth keeping the APIC_VERSION #define and just move out the LVT crud.
>
> > #define LAPIC_MMIO_LENGTH (1 << 12)
> > /* followed define is not in apicdef.h */
> > #define MAX_APIC_VECTOR 256
> > @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> > void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> > {
> > struct kvm_lapic *apic = vcpu->arch.apic;
> > - u32 v = APIC_VERSION;
> > + int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> > + KVM_APIC_LVT_NUM - 1;
>
> Retrieving the number of LVT entries belongs in a helper, and this is rather gross
> abuse of KVM_APIC_LVT_NUM as there's zero indication that it's pseudo-dynamic.
> The code that handles accesses to APIC_LVTCMCI is even worse:
>
> val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
>
> The easiest and best way to handle this is to define an enum, especially since
> the LVT entries aren't exactly intuitive (e.g. LVT_LINT0 isn't entry 0)
>
> enum lapic_lvt_entry {
> LVT_TIMER,
> LVT_THERMAL_MONITOR,
> LVT_PERFORMANCE_COUNTER,
> LVT_LINT0,
> LVT_LINT1,
> LVT_ERROR,
> LVT_CMCI,
>
> KVM_APIC_MAX_NR_LVT_ENTRIES,
> };
>
> And use those for the initialization of apic_lvt_mask[] and drop the comments:
>
> static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
> [LVT_TIMER] = LVT_MASK , /* timer mode mask added at runtime */
> [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
>
> and so on and so forth
> };
>
> Then there's no need for the ugly KVM_APIC_LVT_NUM - 1 shenanigans to access the
> CMCI entry, and the only place that needs to be aware at all is the helper to
> query the number of LVT entries. Heh, and if we wanted to be clever/stupid...
>
> static inline kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
> {
> return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
> }
>
> > + /* 14 is the version for Xeon and Pentium 8.4.8*/
> > + u32 v = 0x14UL | ((lvt_num - 1) << 16);
> >
> > if (!lapic_in_kernel(vcpu))
> > return;
> > @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> > LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> > LVT_MASK | APIC_MODE_MASK, /* LVTPC */
> > LINT_MASK, LINT_MASK, /* LVT0-1 */
> > - LVT_MASK /* LVTERR */
> > + LVT_MASK, /* LVTERR */
> > + LVT_MASK | APIC_MODE_MASK /* LVTCMCI */
> > };
> >
> > static int find_highest_vector(void *bitmap)
> > @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> > APIC_REG_MASK(APIC_TMCCT) |
> > APIC_REG_MASK(APIC_TDCR);
> >
> > + if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
>
> As alluded to above, this belongs in a helper too.
>
> > + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> > +
> > /* ARBPRI is not valid on x2APIC */
> > if (!apic_x2apic_mode(apic))
> > valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> > @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
> > }
> > }
> >
> > +static int kvm_lvt_reg_by_index(int i)
> > +{
> > + if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> > + pr_warn("lvt register index out of bound: %i\n", i);
>
> This sanity check is unnecessary, @i is fully KVM controlled. And a pr_warn() in
> that case is nowhere near strong enough, e.g. at minimum this should be WARN_ON,
> though again, I don't think that's necessary.
>
> Actually, if we really wanted to sanity check @i, we could make this __always_inline
> and turn it into a BUILD_BUG_ON(), though I bet there's a config option that will
> result in the compiler not unrolling the callers and ruining that idea.
>
> > + return 0;
> > + }
> > +
> > + if (i < KVM_APIC_LVT_NUM - 1)
>
> Far better is
>
> if (i == LVT_CMCI)
> return APIC_LVTCMCI;
>
> return APIC_LVTT + 0x10 * i;
>
> Though given the nature of the usage, it might actually be better to bury this in
> a macro (or a helper function masquerading as a macro by having a weird name).
>
> #define APIC_LVTx(x) \
> ({ \
> int __apic_reg; \
> \
> if ((x) == LVT_CMCI) \
> __apic_reg = APIC_LVTCMCI; \
> else \
> __apic_reg = APIC_LVTT + 0x10 * (x); \
> __apic_reg; \
> })
>
>
> Then the usage stays quite readable and doesn't need temp variables, e.g.
>
> for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
> lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
> kvm_lapic_set_reg(apic, APIC_LVTx(i),
> lvt_val | APIC_LVT_MASKED);
> }
>
> > + return APIC_LVTT + 0x10 * i;
> > + return APIC_LVTCMCI;
> > +}
> > +
>
> ...
>
> > @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> > kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
> > kvm_apic_set_version(apic->vcpu);
> >
> > - for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> > - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> > + lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> > + KVM_APIC_LVT_NUM - 1;
> > + for (i = 0; i < lvt_num; i++) {
> > + int lvt_reg = kvm_lvt_reg_by_index(i);
> > +
> > + if (lvt_reg)
> > + kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> > + }
> > apic_update_lvtt(apic);
> > if (kvm_vcpu_is_reset_bsp(vcpu) &&
> > kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 2b44e533fc8d..e2ae097613ca 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -10,7 +10,7 @@
> >
> > #define KVM_APIC_INIT 0
> > #define KVM_APIC_SIPI 1
> > -#define KVM_APIC_LVT_NUM 6
> > +#define KVM_APIC_LVT_NUM 7
> >
> > #define APIC_SHORT_MASK 0xc0000
> > #define APIC_DEST_NOSHORT 0x0
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index b730d799c26e..63aa2b3d30ca 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
> > }
> >
> > kvm_mce_cap_supported |= MCG_LMCE_P;
> > + kvm_mce_cap_supported |= MCG_CMCI_P;
> >
> > if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
> > return -EINVAL;
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index eb4029660bd9..6626723bf51b 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > return 1;
> > vcpu->arch.mcg_ctl = data;
> > break;
> > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > + {
>
> Kernel style when curly braces is needed for a case statement is to put the opening
> braces with the case and not indent. Though my vote is to hoist "offset" to be
> declared at the function level so that each statement doesn't need curly braces
> just to define a fairly common variable.
>
> > + u32 offset;
> > + /* BIT[30] - CMCI_ENABLE */
> > + /* BIT[0:14] - CMCI_THRESHOLD */
> > + u64 mask = (1 << 30) | 0x7fff;
>
> Add proper #defines, not comments.
>
> > +
> > + if (!(mcg_cap & MCG_CMCI_P) &&
> > + (data || !msr_info->host_initiated))
>
> This looks wrong, userspace should either be able to write the MSR or not, '0'
> isn't special. Unless there's a danger to KVM, which I don't think there is,
> userspace should be allowed to ignore architectural restrictions, i.e. bypass
> the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> ioctls. I.e. this should be:
I think the idea was writing unsupported values into an MSR should
cause #GP. The code is consistent (copied) from the code that handles
MSR_IA32_MCG_CTL:
https://elixir.bootlin.com/linux/v5.17-rc8/source/arch/x86/kvm/x86.c#L3177
Can you elaborate what unnecessary dependency between ioctls this may cause?
>
> if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> return 1;
>
> > + return 1;
> > + /* An attempt to write a 1 to a reserved bit raises #GP*/
> > + if (data & ~mask)
> > + return 1;
> > + offset = array_index_nospec(
> > + msr - MSR_IA32_MC0_CTL2,
> > + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
>
> The existing code is gross, don't copy it :-) I'd rather this run over the 80 char
> soft limit.
>
> > + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
>
> The AND with the mask is pointless, @data has already been verified.
>
> With all of the above, this becomes:
>
> case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> return 1;
>
> if (data & ~(MCx_CTL2_CMCI_ENABLE | MCx_CTL2_CMCI_THRESHOLD))
> return 1;
>
> offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
> MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> vcpu->arch.mce_ctl2_banks[offset] = data;
> break;
>
>
> > + }
> > + break;
> > default:
> > if (msr >= MSR_IA32_MC0_CTL &&
> > msr < MSR_IA32_MCx_CTL(bank_num)) {
> > @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > return -1;
> > }
> >
> > - vcpu->arch.mce_banks[offset] = data;
> > + /* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
> > + * per bank.
> > + * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> > + * register layout details in kvm_host.h.
> > + * MSR_IA32_MCi_CTL is the first register in each bank
> > + * within kvm_vcpu_arch.mce_banks.
> > + */
> > + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
>
> This mess goes away if CTL2 gets a separate array.
>
> > break;
> > }
> > return 1;
> > @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > return 1;
> > }
> > break;
> > - case 0x200 ... 0x2ff:
> > + case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> > + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> > return kvm_mtrr_set_msr(vcpu, msr, data);
> > case MSR_IA32_APICBASE:
> > return kvm_set_apic_base(vcpu, msr_info);
> > @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > case MSR_IA32_MCG_CTL:
> > case MSR_IA32_MCG_STATUS:
> > case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > return set_msr_mce(vcpu, msr_info);
> >
> > case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> > @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> > case MSR_IA32_MCG_STATUS:
> > data = vcpu->arch.mcg_status;
> > break;
> > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > + {
> > + u32 offset;
> > +
> > + if (!(mcg_cap & MCG_CMCI_P) && !host)
> > + return 1;
> > + offset = array_index_nospec(
> > + msr - MSR_IA32_MC0_CTL2,
> > + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> > + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
>
> Same comments as the write path.
>
> > + }
> > + break;
> > default:
> > if (msr >= MSR_IA32_MC0_CTL &&
> > msr < MSR_IA32_MCx_CTL(bank_num)) {
> > @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> > msr - MSR_IA32_MC0_CTL,
> > MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> >
> > - data = vcpu->arch.mce_banks[offset];
> > + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
> > break;
> > }
> > return 1;
>
> ...
>
> > +static bool is_ucna(u64 mcg_status, u64 mci_status)
> > +{
> > + return !mcg_status &&
> > + !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
>
> Spaces around the '|'.
>
> > +}
> > +
> > +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> > + struct kvm_x86_mce *mce)
> > +{
> > + u64 mcg_cap = vcpu->arch.mcg_cap;
> > + unsigned int bank_num = mcg_cap & 0xff;
> > + u64 *banks = vcpu->arch.mce_banks;
> > +
> > + /* Check for legal bank number in guest */
> > + if (mce->bank >= bank_num)
> > + return -EINVAL;
> > +
> > + /* Disallow bits that are used for machine check signalling */
>
> This needs a more verbose comment/explanation. I can kinda sorta piece things
> together, but the intent is unclear.
>
> > + if (mce->mcg_status ||
> > + (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> > + return -EINVAL;
> > +
> > + /* UCNA must have VAL and UC bits set */
> > + if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> > + (MCI_STATUS_VAL|MCI_STATUS_UC))
>
> Spaces again, though my personal preference would be:
>
> if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
>
> > + return -EINVAL;
> > +
> > + banks += KVM_MCA_REG_PER_BANK * mce->bank;
> > + banks[1] = mce->status;
> > + banks[2] = mce->addr;
> > + banks[3] = mce->misc;
> > + vcpu->arch.mcg_status = mce->mcg_status;
> > +
> > + /*
> > + * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
> > + * is disabled for the bank
> > + */
> > + if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
>
> #defines, not comments for the BIT 30 thing.
>
> > + return 0;
> > +
> > + if (lapic_in_kernel(vcpu))
>
> Any reason to support UCNA injection without an in-kernel APIC?
>
> > + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> > +
> > + return 0;
> > +}
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-12 14:22 ` Jue Wang
@ 2022-04-12 14:29 ` Jue Wang
2022-04-12 16:57 ` Sean Christopherson
2022-04-12 16:54 ` Sean Christopherson
1 sibling, 1 reply; 10+ messages in thread
From: Jue Wang @ 2022-04-12 14:29 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Paolo Bonzini, kvm
On Tue, Apr 12, 2022 at 7:22 AM Jue Wang <juew@google.com> wrote:
>
> Thanks a lot for the review feedback, Sean!
>
> I will break the current patch into a several-patch series and
> incorporate this feedback in the V2 to be sent out.
>
> Thanks,
> -Jue
>
> On Mon, Apr 11, 2022 at 12:08 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Wed, Mar 23, 2022, Jue Wang wrote:
> > > CMCI
> >
> > Please write Corrected Machine Check Interrupt at least once.
> >
> > > is supported since Nehalem.
> >
> > While possibly interesting, this info is definitely not the most interesting tidbit
> > in the changelog, i.e. shouldn't be the opening line.
> >
> > > UCNA (uncorrectable no action required) errors signaled via CMCI allows a
> > > guest to be notified as soon as uncorrectable memory errors get detected by
> > > some background threads, e.g., threads that migrate guest memory across
> > > hosts.
> > >
> > > Upon receiving UCNAs, guest kernel isolates the poisoned pages from
> > > future accesses much earlier than a potential fatal Machine Check
> > > Exception due to accesses from a guest thread.
> > >
> > > Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.
> >
> > This changelog needs much longer explanation of what exactly is being added, e.g. I
> > had to read the code to find out that this introduces new userspace functionality
> > to allow injecting UCNA #MCs and thus CMCI IRQs.
> >
> > That's also a symptom of this patch needing to be split into a proper series, e.g.
> > exposing the UCNA injection point to userspace needs to be a separate patch.
> >
> > Looking through this, 5 or 6 patches is probably appropriate:
> >
> > 1. Replace existing magic numbers with #defines
> > 2. Clean up the existing LVT mess
> > 3. Add in-kernel LVTCMCI support (unreachable until #, but easier to review)
> > 4. Add support for MSR_IA32_MCx_CTL2 MSRs
> > 5. Add CMCI support
> > 6. Add UCNA injection support
> >
> > I can't tell if #4 is necessary as a separate patch, it might belong with #3.
> >
> > > Signed-off-by: Jue Wang <juew@google.com>
> > > ---
> > > arch/x86/include/asm/kvm_host.h | 11 +++
> > > arch/x86/kvm/lapic.c | 65 ++++++++++++++----
> > > arch/x86/kvm/lapic.h | 2 +-
> > > arch/x86/kvm/vmx/vmx.c | 1 +
> > > arch/x86/kvm/x86.c | 115 +++++++++++++++++++++++++++++---
> > > 5 files changed, 171 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index ec9830d2aabf..d57f3d1284a3 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
> > > unsigned long evtchn_pending_sel;
> > > };
> > >
> > > +#define KVM_MCA_REG_PER_BANK 5
> > > +
> > > struct kvm_vcpu_arch {
> > > /*
> > > * rip and regs accesses must go through
> > > @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
> > > u64 mcg_status;
> > > u64 mcg_ctl;
> > > u64 mcg_ext_ctl;
> > > + /*
> > > + * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> > > + * Register order within each bank:
> > > + * mce_banks[5 * bank] - IA32_MCi_CTL
> > > + * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> > > + * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> > > + * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> > > + * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> > > + */
> > > u64 *mce_banks;
> >
> > Why shove CTL2 into mce_banks? AFAICT, it just makes everything harder. Adding
> > a new "u64 *mce_ctl2_banks" or whatever would simplify everything except the
> > memory allocation.
> >
> > > /* Cache MMIO info */
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index 9322e6340a74..b388eb82308a 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -27,6 +27,7 @@
> > > #include <linux/math64.h>
> > > #include <linux/slab.h>
> > > #include <asm/processor.h>
> > > +#include <asm/mce.h>
> > > #include <asm/msr.h>
> > > #include <asm/page.h>
> > > #include <asm/current.h>
> > > @@ -53,8 +54,6 @@
> > > #define PRIu64 "u"
> > > #define PRIo64 "o"
> > >
> > > -/* 14 is the version for Xeon and Pentium 8.4.8*/
> > > -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
> >
> > Eh, probably worth keeping the APIC_VERSION #define and just move out the LVT crud.
> >
> > > #define LAPIC_MMIO_LENGTH (1 << 12)
> > > /* followed define is not in apicdef.h */
> > > #define MAX_APIC_VECTOR 256
> > > @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
> > > void kvm_apic_set_version(struct kvm_vcpu *vcpu)
> > > {
> > > struct kvm_lapic *apic = vcpu->arch.apic;
> > > - u32 v = APIC_VERSION;
> > > + int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> > > + KVM_APIC_LVT_NUM - 1;
> >
> > Retrieving the number of LVT entries belongs in a helper, and this is rather gross
> > absuse of KVM_APIC_LVT_NUM as there's zero indication that it's pseudo-dynamic.
> > The code that handles accesses to APIC_LVTCMCI is even worse:
> >
> > val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
> >
> > The easiest and best way to handle this is to define an enum, especially since
> > the LVT entries aren't exactly intuitive (e.g. LVT_LINT0 isn't entry 0)
> >
> > enum lapic_lvt_entry {
> > LVT_TIMER,
> > LVT_THERMAL_MONITOR,
> > LVT_PERFORMANCE_COUNTER,
> > LVT_LINT0,
> > LVT_LINT1,
> > LVT_ERROR,
> > LVT_CMCI,
> >
> > KVM_APIC_MAX_NR_LVT_ENTRIES,
> > }
> >
> > And use those for the initialization of apic_lvt_mask[] and drop the comments:
> >
> > static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> > [LVT_TIMER] = LVT_MASK , /* timer mode mask added at runtime */
> > [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
> >
> > and so on and so forth
> > };
> >
> > Then there's no need for the ugly KVM_APIC_LVT_NUM - 1 shenanigans to access the
> > CMCI entry, and the only place that needs to be aware at all is the helper to
> > query the number of LVT entries. Heh, and if we wanted to be clever/supid...
> >
> > static inline int kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
> > {
> > return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
> > }
> >
> > > + /* 14 is the version for Xeon and Pentium 8.4.8*/
> > > + u32 v = 0x14UL | ((lvt_num - 1) << 16);
> > >
> > > if (!lapic_in_kernel(vcpu))
> > > return;
> > > @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
> > > LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
> > > LVT_MASK | APIC_MODE_MASK, /* LVTPC */
> > > LINT_MASK, LINT_MASK, /* LVT0-1 */
> > > - LVT_MASK /* LVTERR */
> > > + LVT_MASK, /* LVTERR */
> > > + LVT_MASK | APIC_MODE_MASK /* LVTCMCI */
> > > };
> > >
> > > static int find_highest_vector(void *bitmap)
> > > @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
> > > APIC_REG_MASK(APIC_TMCCT) |
> > > APIC_REG_MASK(APIC_TDCR);
> > >
> > > + if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
> >
> > As alluded to above, this belongs in a helper too.
> >
> > > + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> > > +
> > > /* ARBPRI is not valid on x2APIC */
> > > if (!apic_x2apic_mode(apic))
> > > valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> > > @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
> > > }
> > > }
> > >
> > > +static int kvm_lvt_reg_by_index(int i)
> > > +{
> > > + if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> > > + pr_warn("lvt register index out of bound: %i\n", i);
> >
> > This sanity check is unnecessary, @i is fully KVM controlled. And a pr_warn() in
> > that case is nowhere near strong enough, e.g. at minimum this should be WARN_ON,
> > though again, I don't think that's necessary.
> >
> > Actually, if we really wanted to sanity check @i, we could make this __always_inline
> > and turn it into a BUILD_BUG_ON(), though I bet there's a config option that will
> > result in the compiler not unrolling the callers and ruining that idea.
> >
> > > + return 0;
> > > + }
> > > +
> > > + if (i < KVM_APIC_LVT_NUM - 1)
> >
> > Far better is
> >
> > if (i == LVT_CMCI)
> > return APIC_LVTCMCI;
> >
> > return APIC_LVTT + 0x10 * i;
> >
> > Though given the nature of the usage, it might actually be better to bury this in
> > a macro (or a helper function masquerading as a macro by having a weird name).
> >
> > #define APIC_LVTx(x) \
> > ({ \
> > int __apic_reg; \
> > \
> > if ((x) == LVT_CMCI) \
> > __apic_reg = APIC_LVTCMCI; \
> > else \
> > __apic_reg = APIC_LVTT + 0x10 * (x); \
> > __apic_reg; \
> > })
> >
> >
> > Then the usage stays quite readable and doesn't need temp variables, e.g.
> >
> > for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
> > lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
> > kvm_lapic_set_reg(apic, APIC_LVTx(i),
> > lvt_val | APIC_LVT_MASKED);
> > }
> >
> > > + return APIC_LVTT + 0x10 * i;
> > > + return APIC_LVTCMCI;
> > > +}
> > > +
> >
> > ...
> >
> > > @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
> > > kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
> > > kvm_apic_set_version(apic->vcpu);
> > >
> > > - for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> > > - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> > > + lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> > > + KVM_APIC_LVT_NUM - 1;
> > > + for (i = 0; i < lvt_num; i++) {
> > > + int lvt_reg = kvm_lvt_reg_by_index(i);
> > > +
> > > + if (lvt_reg)
> > > + kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> > > + }
> > > apic_update_lvtt(apic);
> > > if (kvm_vcpu_is_reset_bsp(vcpu) &&
> > > kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> > > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > > index 2b44e533fc8d..e2ae097613ca 100644
> > > --- a/arch/x86/kvm/lapic.h
> > > +++ b/arch/x86/kvm/lapic.h
> > > @@ -10,7 +10,7 @@
> > >
> > > #define KVM_APIC_INIT 0
> > > #define KVM_APIC_SIPI 1
> > > -#define KVM_APIC_LVT_NUM 6
> > > +#define KVM_APIC_LVT_NUM 7
> > >
> > > #define APIC_SHORT_MASK 0xc0000
> > > #define APIC_DEST_NOSHORT 0x0
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index b730d799c26e..63aa2b3d30ca 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
> > > }
> > >
> > > kvm_mce_cap_supported |= MCG_LMCE_P;
> > > + kvm_mce_cap_supported |= MCG_CMCI_P;
> > >
> > > if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
> > > return -EINVAL;
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index eb4029660bd9..6626723bf51b 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > return 1;
> > > vcpu->arch.mcg_ctl = data;
> > > break;
> > > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > > + {
> >
> > Kernel style when curly braces are needed for a case statement is to put the opening
> > braces with the case and not indent. Though my vote is to hoist "offset" to be
> > declared at the function level so that each statement doesn't need curly braces
> > just to define a fairly common variable.
> >
> > > + u32 offset;
> > > + /* BIT[30] - CMCI_ENABLE */
> > > + /* BIT[0:14] - CMCI_THRESHOLD */
> > > + u64 mask = (1 << 30) | 0x7fff;
> >
> > Add proper #defines, not comments.
> >
> > > +
> > > + if (!(mcg_cap & MCG_CMCI_P) &&
> > > + (data || !msr_info->host_initiated))
> >
> > This looks wrong, userspace should either be able to write the MSR or not, '0'
> > isn't special. Unless there's a danger to KVM, which I don't think there is,
> > userspace should be allowed to ignore architectural restrictions, i.e. bypass
> > the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> > ioctls. I.e. this should be:
>
> I think the idea was writing unsupported values into an MSR should
> cause #GP. The code is consistent (copied) from the code that handles
> MSR_IA32_MCG_CTL:
>
> https://elixir.bootlin.com/linux/v5.17-rc8/source/arch/x86/kvm/x86.c#L3177
>
> Can you elaborate what unnecessary dependency between ioctls this may cause?
>
> >
> > if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> > return 1;
> >
> > > + return 1;
> > > + /* An attempt to write a 1 to a reserved bit raises #GP*/
> > > + if (data & ~mask)
> > > + return 1;
> > > + offset = array_index_nospec(
> > > + msr - MSR_IA32_MC0_CTL2,
> > > + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> >
> > The existing code is gross, don't copy it :-) I'd rather this run over the 80 char
> > soft limit.
> >
> > > + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
> >
> > The AND with the mask is pointless, @data has already been verified.
> >
> > With all of the above, this becomes:
> >
> > case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> > return 1;
> >
> > if (data & ~(MCx_CTL2_CMCI_ENABLE | MCx_CTL2_CMCI_THRESHOLD))
> > return 1;
> >
> > offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
> > MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> > vcpu->arch.mce_ctl2_banks[offset] = data;
> > break;
> >
> >
> > > + }
> > > + break;
> > > default:
> > > if (msr >= MSR_IA32_MC0_CTL &&
> > > msr < MSR_IA32_MCx_CTL(bank_num)) {
> > > @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > return -1;
> > > }
> > >
> > > - vcpu->arch.mce_banks[offset] = data;
> > > + /* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
> > > + * per bank.
> > > + * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> > > + * register layout details in kvm_host.h.
> > > + * MSR_IA32_MCi_CTL is the first register in each bank
> > > + * within kvm_vcpu_arch.mce_banks.
> > > + */
> > > + vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
> >
> > This mess goes away if CTL2 gets a separate array.
> >
> > > break;
> > > }
> > > return 1;
> > > @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > return 1;
> > > }
> > > break;
> > > - case 0x200 ... 0x2ff:
> > > + case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> > > + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
> > > return kvm_mtrr_set_msr(vcpu, msr, data);
> > > case MSR_IA32_APICBASE:
> > > return kvm_set_apic_base(vcpu, msr_info);
> > > @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > case MSR_IA32_MCG_CTL:
> > > case MSR_IA32_MCG_STATUS:
> > > case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> > > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > > return set_msr_mce(vcpu, msr_info);
> > >
> > > case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> > > @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> > > case MSR_IA32_MCG_STATUS:
> > > data = vcpu->arch.mcg_status;
> > > break;
> > > + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> > > + {
> > > + u32 offset;
> > > +
> > > + if (!(mcg_cap & MCG_CMCI_P) && !host)
> > > + return 1;
> > > + offset = array_index_nospec(
> > > + msr - MSR_IA32_MC0_CTL2,
> > > + MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> > > + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
> >
> > Same comments as the write path.
> >
> > > + }
> > > + break;
> > > default:
> > > if (msr >= MSR_IA32_MC0_CTL &&
> > > msr < MSR_IA32_MCx_CTL(bank_num)) {
> > > @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
> > > msr - MSR_IA32_MC0_CTL,
> > > MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> > >
> > > - data = vcpu->arch.mce_banks[offset];
> > > + data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
> > > break;
> > > }
> > > return 1;
> >
> > ...
> >
> > > +static bool is_ucna(u64 mcg_status, u64 mci_status)
> > > +{
> > > + return !mcg_status &&
> > > + !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
> >
> > Spaces around the '|'.
> >
> > > +}
> > > +
> > > +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> > > + struct kvm_x86_mce *mce)
> > > +{
> > > + u64 mcg_cap = vcpu->arch.mcg_cap;
> > > + unsigned int bank_num = mcg_cap & 0xff;
> > > + u64 *banks = vcpu->arch.mce_banks;
> > > +
> > > + /* Check for legal bank number in guest */
> > > + if (mce->bank >= bank_num)
> > > + return -EINVAL;
> > > +
> > > + /* Disallow bits that are used for machine check signalling */
> >
> > This needs a more verbose comment/explanation. I can kinda sorta piece things
> > together, but the intent is unclear.
> >
> > > + if (mce->mcg_status ||
> > > + (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> > > + return -EINVAL;
> > > +
> > > + /* UCNA must have VAL and UC bits set */
> > > + if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> > > + (MCI_STATUS_VAL|MCI_STATUS_UC))
> >
> > Spaces again, though my personal preference would be:
> >
> > if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUS_UC))
> >
> > > + return -EINVAL;
> > > +
> > > + banks += KVM_MCA_REG_PER_BANK * mce->bank;
> > > + banks[1] = mce->status;
> > > + banks[2] = mce->addr;
> > > + banks[3] = mce->misc;
> > > + vcpu->arch.mcg_status = mce->mcg_status;
> > > +
> > > + /*
> > > + * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
> > > + * is disabled for the bank
> > > + */
> > > + if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
> >
> > #defines, not comments for the BIT 30 thing.
> >
> > > + return 0;
> > > +
> > > + if (lapic_in_kernel(vcpu))
> >
> > > Any reason to support UCNA injection without an in-kernel APIC?
Even without a viable path for CMCI signaling, the UCNA errors should
still be logged in registers so it gives consistent semantics to
hardware for e.g., MCE overflow checks on an MCE signaled later.
> >
> > > + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> > > +
> > > + return 0;
> > > +}
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-12 14:22 ` Jue Wang
2022-04-12 14:29 ` Jue Wang
@ 2022-04-12 16:54 ` Sean Christopherson
1 sibling, 0 replies; 10+ messages in thread
From: Sean Christopherson @ 2022-04-12 16:54 UTC (permalink / raw)
To: Jue Wang; +Cc: Paolo Bonzini, kvm
On Tue, Apr 12, 2022, Jue Wang wrote:
> > > +
> > > + if (!(mcg_cap & MCG_CMCI_P) &&
> > > + (data || !msr_info->host_initiated))
> >
> > This looks wrong, userspace should either be able to write the MSR or not, '0'
> > isn't special. Unless there's a danger to KVM, which I don't think there is,
> > userspace should be allowed to ignore architectural restrictions, i.e. bypass
> > the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> > ioctls. I.e. this should be:
>
> I think the idea was writing unsupported values into an MSR should
> cause #GP.
Right, but when userspace is stuffing MSR values KVM only needs to prevent userspace
from writing values that are completely bogus, i.e. not supported by KVM at all.
Userspace is allowed to violate the guest vCPU model.
> The code is consistent (copied) from the code that handles MSR_IA32_MCG_CTL:
>
> https://elixir.bootlin.com/linux/v5.17-rc8/source/arch/x86/kvm/x86.c#L3177
Ah. Sadly, KVM has these sorts of bad examples lurking in dark corners :-/
> Can you elaborate what unnecessary dependency between ioctls this may cause?
Enforcing the vCPU model checks on userspace write means userspace must do
KVM_X86_SETUP_MCE before KVM_SET_MSRS, otherwise stuffing the MSRs will fail.
It's not a big deal since userspace is likely using this ordering given that no
one has complained about MSR_IA32_MCG_CTL, but it's easy to remedy and it makes
the code simpler.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-12 14:29 ` Jue Wang
@ 2022-04-12 16:57 ` Sean Christopherson
0 siblings, 0 replies; 10+ messages in thread
From: Sean Christopherson @ 2022-04-12 16:57 UTC (permalink / raw)
To: Jue Wang; +Cc: Paolo Bonzini, kvm
On Tue, Apr 12, 2022, Jue Wang wrote:
> On Tue, Apr 12, 2022 at 7:22 AM Jue Wang <juew@google.com> wrote:
> > > > + return 0;
> > > > +
> > > > + if (lapic_in_kernel(vcpu))
> > >
> > > Any reason to support UCNA injection without an in-kernel APIC?
>
> Even without a viable path for CMCI signaling, the UCNA errors should
> still be logged in registers so it gives consistent semantics to
> hardware for e.g., MCE overflow checks on an MCE signaled later.
Right, what I was suggesting is that KVM reject the ioctl() if the VM doesn't
have an in-kernel APIC, i.e. force userspace to use KVM's local APIC if userspace
wants to also support UCNA injection.
Side topic, please trim your replies, i.e. delete all the unnecessary quoted context.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-11 19:08 ` Sean Christopherson
2022-04-12 14:22 ` Jue Wang
@ 2022-04-12 17:44 ` Paolo Bonzini
2022-04-12 18:19 ` Sean Christopherson
1 sibling, 1 reply; 10+ messages in thread
From: Paolo Bonzini @ 2022-04-12 17:44 UTC (permalink / raw)
To: Sean Christopherson, Jue Wang; +Cc: kvm
On 4/11/22 21:08, Sean Christopherson wrote:
>> + if (!(mcg_cap & MCG_CMCI_P) &&
>> + (data || !msr_info->host_initiated))
> This looks wrong, userspace should either be able to write the MSR or not, '0'
> isn't special. Unless there's a danger to KVM, which I don't think there is,
> userspace should be allowed to ignore architectural restrictions, i.e. bypass
> the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> ioctls. I.e. this should be:
>
> if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> return 1;
>
This is somewhat dangerous as it complicates (or removes) the invariants
that other code can rely on. Thus, usually, only the default value is
allowed for KVM_SET_MSR.
See commit b1e34d325397 ("KVM: x86: Forbid VMM to set SYNIC/STIMER MSRs
when SynIC wasn't activated", 2022-03-29) for a case where this practice
avoids a NULL pointer dereference. Though in there, Vitaly wrote:
Note, it would've been better to forbid writing anything to
SYNIC/STIMER MSRs, including zeroes, however, at least QEMU tries
clearing HV_X64_MSR_STIMER0_CONFIG without SynIC.
and I don't really agree with him, in that writing the default value of
an MSR is safe and should always be allowed for userspace.
Paolo
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-12 17:44 ` Paolo Bonzini
@ 2022-04-12 18:19 ` Sean Christopherson
2022-04-12 19:52 ` Jue Wang
0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2022-04-12 18:19 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: Jue Wang, kvm
On Tue, Apr 12, 2022, Paolo Bonzini wrote:
> On 4/11/22 21:08, Sean Christopherson wrote:
> > > + if (!(mcg_cap & MCG_CMCI_P) &&
> > > + (data || !msr_info->host_initiated))
> > This looks wrong, userspace should either be able to write the MSR or not, '0'
> > isn't special. Unless there's a danger to KVM, which I don't think there is,
> > userspace should be allowed to ignore architectural restrictions, i.e. bypass
> > the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> > ioctls. I.e. this should be:
> >
> > if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> > return 1;
> >
>
> This is somewhat dangerous as it complicates (or removes) the invariants
> that other code can rely on. Thus, usually, only the default value is
> allowed for KVM_SET_MSR.
Heh, I don't know if "usually" is the right word, that implies KVM is consistent
enough to have a simple majority for any behavior, whatever that behavior may be :-)
Anyways, on second look, I agree that KVM should require that userspace first enable
CMCI via mcg_cap. I thought that vcpu->arch.mcg_cap could be written via the MSR
interface, i.e. via userspace writes to MSR_IA32_MCG_CAP, and could create dependencies
within KVM_SET_MSRS. But KVM only allows reading the MSR.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.
2022-04-12 18:19 ` Sean Christopherson
@ 2022-04-12 19:52 ` Jue Wang
0 siblings, 0 replies; 10+ messages in thread
From: Jue Wang @ 2022-04-12 19:52 UTC (permalink / raw)
To: Sean Christopherson; +Cc: Paolo Bonzini, kvm
On Tue, Apr 12, 2022 at 11:19 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 12, 2022, Paolo Bonzini wrote:
> > On 4/11/22 21:08, Sean Christopherson wrote:
> > > > + if (!(mcg_cap & MCG_CMCI_P) &&
> > > > + (data || !msr_info->host_initiated))
> > > This looks wrong, userspace should either be able to write the MSR or not, '0'
> > > isn't special. Unless there's a danger to KVM, which I don't think there is,
> > > userspace should be allowed to ignore architectural restrictions, i.e. bypass
> > > the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
> > > ioctls. I.e. this should be:
> > >
> > > if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
> > > return 1;
> > >
> >
> > This is somewhat dangerous as it complicates (or removes) the invariants
> > that other code can rely on. Thus, usually, only the default value is
> > allowed for KVM_SET_MSR.
>
> Heh, I don't know if "usually" is the right word, that implies KVM is consistent
> enough to have a simple majority for any behavior, whatever that behavior may be :-)
>
> Anyways, on second look, I agree that KVM should require that userspace first enable
> CMCI via mcg_cap. I thought that vcpu->arch.mcg_cap could be written via the MSR
> interface, i.e. via userspace writes to MSR_IA32_MCG_CAP, and could create dependencies
> within KVM_SET_MSRS. But KVM only allows reading the MSR.
vcpu->arch.mcg_cap is written via kvm_vcpu_ioctl_x86_setup_mce and it
requires explicit enablement in kvm_mce_cap_supported for MCG_CMCI_P.
The pattern of KVM_SET/GET_MSRS depending on bits in
vcpu->arch.mcg_cap seems common so I will keep the check of MCG_CMCI_P
in V2.
As only allowing 0 to be set to MCi_CTL2 in set_msr_mce, it is more
consistent to hardware behavior and the invariants that other code
depends on as Paolo pointed out. I will keep this implementation in V2
as well.
Thanks,
-Jue
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-04-12 19:58 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-03-23 18:28 [PATCH] KVM: x86: Add support for CMCI and UCNA Jue Wang
2022-04-10 16:56 ` Jue Wang
2022-04-11 19:08 ` Sean Christopherson
2022-04-12 14:22 ` Jue Wang
2022-04-12 14:29 ` Jue Wang
2022-04-12 16:57 ` Sean Christopherson
2022-04-12 16:54 ` Sean Christopherson
2022-04-12 17:44 ` Paolo Bonzini
2022-04-12 18:19 ` Sean Christopherson
2022-04-12 19:52 ` Jue Wang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox