* [PATCH v9 00/10] x86emul: misc additions
From: Jan Beulich @ 2025-11-24 14:56 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

01: x86emul: support LKGS
02: x86emul+VMX: support {RD,WR}MSRLIST
03: x86emul: support USER_MSR instructions
04: x86/cpu-policy: re-arrange no-VMX logic
05: VMX: support USER-MSR
06: x86emul: support MSR_IMM instructions
07: VMX: support MSR-IMM
08: x86emul: support non-SIMD MOVRS
09: x86: use / "support" UDB
10: x86emul: support AVX512-BMM

Jan



* [PATCH v9 01/10] x86emul: support LKGS
From: Jan Beulich @ 2025-11-24 14:57 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

Provide support for this insn, which is a prerequisite to FRED. CPUID-wise,
while the enumerators for it and for FRED were already introduced, the
dependency between them still needs adding.

While adding a testcase, also add a SWAPGS one. In order not to affect
the behavior of pre-existing tests, install the write_{segment,msr} hooks
only transiently.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Instead of ->read_segment() we could of course also use ->read_msr() to
fetch the original GS base. I don't see a clear advantage to either
approach; the way it's done here matches how we handle SWAPGS.

For PV save_segments() would need adjustment, but the insn being
restricted to ring 0 means PV guests can't use it anyway (unless we
wanted to emulate it as another privileged insn).
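
For reference, a rough pseudo-C sketch of the insn's effect as emulated
below (hook names are the emulator's; error handling and permission
checks are omitted):

    /*
     * LKGS loads a GS selector much like MOV-to-GS would, but routes the
     * resulting base into MSR_SHADOW_GS_BASE, leaving the active GS base
     * unchanged.
     */
    old_base = gs.base;                       /* via ->read_segment()  */
    protmode_load_seg(x86_seg_gs, sel, ...);  /* selector checks, load */
    write_msr(MSR_SHADOW_GS_BASE, gs.base);   /* via ->write_msr()     */
    gs.base = old_base;                       /* via ->write_segment() */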
---
v9: Re-base.
v8: Re-base.
v6: Use MSR constants in test harness. S->s in cpufeatureset.h. Add
    NMI_SRC feature bits. Re-base.
v5: Re-base.
v3: Add dependency on LM. Re-base.
v2: Use X86_EXC_*. Add comments.

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -326,6 +326,7 @@ static const struct {
     { { 0x00, 0x18 }, { 2, 2 }, T, R }, /* ltr */
     { { 0x00, 0x20 }, { 2, 2 }, T, R }, /* verr */
     { { 0x00, 0x28 }, { 2, 2 }, T, R }, /* verw */
+    { { 0x00, 0x30 }, { 0, 2 }, T, R, pfx_f2 }, /* lkgs */
     { { 0x01, 0x00 }, { 2, 2 }, F, W }, /* sgdt */
     { { 0x01, 0x08 }, { 2, 2 }, F, W }, /* sidt */
     { { 0x01, 0x10 }, { 2, 2 }, F, R }, /* lgdt */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -672,6 +672,10 @@ static int blk(
     return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt);
 }
 
+#ifdef __x86_64__
+static unsigned long gs_base, gs_base_shadow;
+#endif
+
 static int read_segment(
     enum x86_segment seg,
     struct segment_register *reg,
@@ -681,8 +685,30 @@ static int read_segment(
         return X86EMUL_UNHANDLEABLE;
     memset(reg, 0, sizeof(*reg));
     reg->p = 1;
+
+#ifdef __x86_64__
+    if ( seg == x86_seg_gs )
+        reg->base = gs_base;
+#endif
+
+    return X86EMUL_OKAY;
+}
+
+#ifdef __x86_64__
+static int write_segment(
+    enum x86_segment seg,
+    const struct segment_register *reg,
+    struct x86_emulate_ctxt *ctxt)
+{
+    if ( !is_x86_user_segment(seg) )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( seg == x86_seg_gs )
+        gs_base = reg->base;
+
     return X86EMUL_OKAY;
 }
+#endif
 
 static int read_msr(
     unsigned int reg,
@@ -695,6 +721,20 @@ static int read_msr(
         *val = ctxt->addr_size > 32 ? EFER_LME | EFER_LMA : 0;
         return X86EMUL_OKAY;
 
+#ifdef __x86_64__
+    case MSR_GS_BASE:
+        if ( ctxt->addr_size < 64 )
+            break;
+        *val = gs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( ctxt->addr_size < 64 )
+            break;
+        *val = gs_base_shadow;
+        return X86EMUL_OKAY;
+#endif
+
     case MSR_TSC_AUX:
 #define TSC_AUX_VALUE 0xCACACACA
         *val = TSC_AUX_VALUE;
@@ -704,6 +744,31 @@ static int read_msr(
     return X86EMUL_UNHANDLEABLE;
 }
 
+#ifdef __x86_64__
+static int write_msr(
+    unsigned int reg,
+    uint64_t val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    switch ( reg )
+    {
+    case MSR_GS_BASE:
+        if ( ctxt->addr_size < 64 || !is_canonical_address(val) )
+            break;
+        gs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( ctxt->addr_size < 64 || !is_canonical_address(val) )
+            break;
+        gs_base_shadow = val;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+#endif
+
 #define INVPCID_ADDR 0x12345678
 #define INVPCID_PCID 0x123
 
@@ -1338,6 +1403,41 @@ int main(int argc, char **argv)
         printf("%u bytes read - ", bytes_read);
         goto fail;
     }
+    printf("okay\n");
+
+    emulops.write_segment = write_segment;
+    emulops.write_msr     = write_msr;
+
+    printf("%-40s", "Testing swapgs...");
+    instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xf8;
+    regs.eip = (unsigned long)&instr[0];
+    gs_base = 0xffffeeeecccc8888UL;
+    gs_base_shadow = 0x0000111122224444UL;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.eip != (unsigned long)&instr[3]) ||
+         (gs_base != 0x0000111122224444UL) ||
+         (gs_base_shadow != 0xffffeeeecccc8888UL) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing lkgs 2(%rdx)...");
+    instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x00; instr[3] = 0x72; instr[4] = 0x02;
+    regs.eip = (unsigned long)&instr[0];
+    regs.edx = (unsigned long)res;
+    res[0]   = 0x00004444;
+    res[1]   = 0x8888cccc;
+    i = cpu_policy.extd.nscb; cpu_policy.extd.nscb = true; /* for AMD */
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.eip != (unsigned long)&instr[5]) ||
+         (gs_base != 0x0000111122224444UL) ||
+         gs_base_shadow )
+        goto fail;
+
+    cpu_policy.extd.nscb = i;
+    emulops.write_segment = NULL;
+    emulops.write_msr     = NULL;
 #endif
     printf("okay\n");
 
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -85,6 +85,7 @@ bool emul_test_init(void)
     cpu_policy.feat.invpcid = true;
     cpu_policy.feat.adx = true;
     cpu_policy.feat.rdpid = true;
+    cpu_policy.feat.lkgs = true;
     cpu_policy.feat.wrmsrns = true;
     cpu_policy.extd.clzero = true;
 
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -744,8 +744,12 @@ decode_twobyte(struct x86_emulate_state
         case 0:
             s->desc |= DstMem | SrcImplicit | Mov;
             break;
+        case 6:
+            if ( !(s->modrm_reg & 1) && mode_64bit() )
+            {
         case 2: case 4:
-            s->desc |= SrcMem16;
+                s->desc |= SrcMem16;
+            }
             break;
         }
         break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -608,6 +608,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx_vnni()    (ctxt->cpuid->feat.avx_vnni)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
 #define vcpu_has_cmpccxadd()   (ctxt->cpuid->feat.cmpccxadd)
+#define vcpu_has_lkgs()        (ctxt->cpuid->feat.lkgs)
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2899,8 +2899,35 @@ x86_emulate(
                 break;
             }
             break;
-        default:
-            generate_exception_if(true, X86_EXC_UD);
+        case 6: /* lkgs */
+            generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2,
+                                  X86_EXC_UD);
+            generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD);
+            vcpu_must_have(lkgs);
+            fail_if(!ops->read_segment || !ops->read_msr ||
+                    !ops->write_segment || !ops->write_msr);
+            if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
+                                     ctxt)) != X86EMUL_OKAY ||
+                 (rc = ops->read_segment(x86_seg_gs, &sreg,
+                                         ctxt)) != X86EMUL_OKAY )
+                goto done;
+            dst.orig_val = sreg.base; /* Preserve full GS Base. */
+            if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg,
+                                         ctxt, ops)) != X86EMUL_OKAY ||
+                 /* Write (32-bit) base into SHADOW_GS. */
+                 (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
+                                      ctxt)) != X86EMUL_OKAY )
+                goto done;
+            sreg.base = dst.orig_val; /* Reinstate full GS Base. */
+            if ( (rc = ops->write_segment(x86_seg_gs, &sreg,
+                                          ctxt)) != X86EMUL_OKAY )
+            {
+                /* Best effort unwind (i.e. no real error checking). */
+                if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val,
+                                    ctxt) == X86EMUL_EXCEPTION )
+                    x86_emul_reset_event(ctxt);
+                goto done;
+            }
             break;
         }
         break;
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -282,7 +282,8 @@ def crunch_numbers(state):
         # superpages, PCID and PKU are only available in 4 level paging.
         # NO_LMSL indicates the absense of Long Mode Segment Limits, which
         # have been dropped in hardware.
-        LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD],
+        LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
+             LKGS],
 
         # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
         # standard 3DNow in the earlier K6 processors.
@@ -351,6 +352,9 @@ def crunch_numbers(state):
         # computational instructions.  All further AMX features are built on top
         # of AMX-TILE.
         AMX_TILE: [AMX_BF16, AMX_INT8, AMX_FP16, AMX_COMPLEX],
+
+        # FRED builds on the LKGS instruction.
+        LKGS: [FRED],
     }
 
     deep_features = tuple(sorted(deps.keys()))




* [PATCH v9 02/10] x86emul+VMX: support {RD,WR}MSRLIST
From: Jan Beulich @ 2025-11-24 14:58 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

These are "compound" instructions to issue a series of RDMSR / WRMSR
respectively. In the emulator we can therefore implement them by using
the existing msr_{read,write}() hooks. The memory accesses utilize that
the HVM ->read() / ->write() hooks are already linear-address
(x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
this case).

Preemption is checked for in WRMSRLIST handling only, as only MSR
writes are expected to potentially take long.
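
For reference, the register interface, as reflected by the emulation
loops further down (a rough sketch; ffs64(), read64() and write64() are
placeholder helpers, and error/preemption handling is omitted):

    /*
     * RSI: linear address of a table of 8-byte MSR indices
     * RDI: linear address of a matching table of 8-byte data slots
     * RCX: bitmap of still-to-be-processed entries; bit n selects
     *      entry n in both tables
     */
    while ( rcx )
    {
        n   = ffs64(rcx) - 1;          /* lowest pending entry */
        idx = read64(rsi + n * 8);     /* upper 32 bits must be zero */
        /* WRMSRLIST: wrmsr(idx, read64(rdi + n * 8)); */
        /* RDMSRLIST: write64(rdi + n * 8, rdmsr(idx)); */
        rcx &= ~(1UL << n);            /* progress is visible in RCX */
    }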

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: In vmx_vmexit_handler(), handling is forwarded to the emulator
     blindly. Alternatively we could consult the exit qualification and
     process just a single MSR at a time (without involving the
     emulator), exiting back to the guest after every iteration. (I
     don't think a mix of both models makes a lot of sense.)

The precise behavior of MSR_BARRIER is still not spelled out in ISE 050,
so the (minimal) implementation continues to be a guess for now.

Wouldn't calculate_hvm_max_policy() better treat MPX the same way as is
done here, at least from an abstract perspective (assuming that AMD
won't add such functionality now that Intel have deprecated it)?
---
v8: Re-base.
v6: Use MSR constants in test harness. Re-base.
v5: Add missing vmx_init_vmcs_config() and construct_vmcs() adjustments.
    Avoid unnecessary uses of r(). Re-base.
v3: Add dependency on LM. Limit exposure to HVM. Utilize new info from
    ISE 050. Re-base.
v2: Use X86_EXC_*. Add preemption checking to WRMSRLIST handling. Remove
    the feature from "max" when the VMX counterpart isn't available.

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -342,6 +342,8 @@ static const struct {
     { { 0x01, 0xc4 }, { 2, 2 }, F, N }, /* vmxoff */
     { { 0x01, 0xc5 }, { 2, 2 }, F, N }, /* pconfig */
     { { 0x01, 0xc6 }, { 2, 2 }, F, N }, /* wrmsrns */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */
     { { 0x01, 0xc8 }, { 2, 2 }, F, N }, /* monitor */
     { { 0x01, 0xc9 }, { 2, 2 }, F, N }, /* mwait */
     { { 0x01, 0xca }, { 2, 2 }, F, N }, /* clac */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -625,7 +625,7 @@ static int write(
     if ( verbose )
         printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes);
 
-    if ( !is_x86_user_segment(seg) )
+    if ( !is_x86_user_segment(seg) && seg != x86_seg_none )
         return X86EMUL_UNHANDLEABLE;
     memcpy((void *)offset, p_data, bytes);
     return X86EMUL_OKAY;
@@ -717,6 +717,10 @@ static int read_msr(
 {
     switch ( reg )
     {
+    case MSR_BARRIER:
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_EFER:
         *val = ctxt->addr_size > 32 ? EFER_LME | EFER_LMA : 0;
         return X86EMUL_OKAY;
@@ -1434,9 +1438,53 @@ int main(int argc, char **argv)
          (gs_base != 0x0000111122224444UL) ||
          gs_base_shadow )
         goto fail;
+    printf("okay\n");
 
     cpu_policy.extd.nscb = i;
     emulops.write_segment = NULL;
+
+    printf("%-40s", "Testing rdmsrlist...");
+    instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = (unsigned long)(res + 0x80);
+    regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2);
+    regs.rcx = 0x0002000100008000UL;
+    gs_base_shadow = 0x0000222244446666UL;
+    memset(res + 0x80, ~0, 0x40 * 8 * 2);
+    res[0x80 + 0x0f * 2] = MSR_GS_BASE;
+    res[0x80 + 0x0f * 2 + 1] = 0;
+    res[0x80 + 0x20 * 2] = MSR_SHADOW_GS_BASE;
+    res[0x80 + 0x20 * 2 + 1] = 0;
+    res[0x80 + 0x31 * 2] = MSR_BARRIER;
+    res[0x80 + 0x31 * 2 + 1] = 0;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[4]) ||
+         regs.rcx ||
+         (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) ||
+         (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) ||
+         (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) ||
+         (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) ||
+         res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing wrmsrlist...");
+    instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+    regs.eip = (unsigned long)&instr[0];
+    regs.rsi -= 0x11 * 8;
+    regs.rdi -= 0x11 * 8;
+    regs.rcx = 0x0002000100000000UL;
+    res[0x80 + 0x0f * 2] = MSR_SHADOW_GS_BASE;
+    res[0x80 + 0x20 * 2] = MSR_GS_BASE;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[4]) ||
+         regs.rcx ||
+         (gs_base != 0x0000222244446666UL) ||
+         (gs_base_shadow != 0x0000111122224444UL) )
+        goto fail;
+
     emulops.write_msr     = NULL;
 #endif
     printf("okay\n");
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -87,6 +87,7 @@ bool emul_test_init(void)
     cpu_policy.feat.rdpid = true;
     cpu_policy.feat.lkgs = true;
     cpu_policy.feat.wrmsrns = true;
+    cpu_policy.feat.msrlist = true;
     cpu_policy.extd.clzero = true;
 
     if ( cpu_has_xsave )
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -813,6 +813,9 @@ static void __init calculate_hvm_max_pol
             __clear_bit(X86_FEATURE_XSAVES, fs);
     }
 
+    if ( !cpu_has_vmx_msrlist )
+        __clear_bit(X86_FEATURE_MSRLIST, fs);
+
     /*
      * Xen doesn't use PKS, so the guest support for it has opted to not use
      * the VMCS load/save controls for efficiency reasons.  This depends on
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -361,8 +361,9 @@ static int vmx_init_vmcs_config(bool bsp
 
     if ( caps.cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS )
     {
-        uint64_t opt = (TERTIARY_EXEC_VIRT_SPEC_CTRL |
-                        TERTIARY_EXEC_EPT_PAGING_WRITE);
+        uint64_t opt = TERTIARY_EXEC_EPT_PAGING_WRITE |
+                       TERTIARY_EXEC_ENABLE_MSRLIST |
+                       TERTIARY_EXEC_VIRT_SPEC_CTRL;
 
         caps.tertiary_exec_control = adjust_vmx_controls2(
             "Tertiary Exec Control", 0, opt,
@@ -1119,7 +1120,8 @@ static int construct_vmcs(struct vcpu *v
         v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
 
     v->arch.hvm.vmx.secondary_exec_control = vmx_caps.secondary_exec_control;
-    v->arch.hvm.vmx.tertiary_exec_control  = vmx_caps.tertiary_exec_control;
+    v->arch.hvm.vmx.tertiary_exec_control  = vmx_caps.tertiary_exec_control &
+                                             ~TERTIARY_EXEC_ENABLE_MSRLIST;
 
     /*
      * Disable features which we don't want active by default:
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -891,6 +891,20 @@ static void cf_check vmx_cpuid_policy_ch
     else
         vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW);
 
+    if ( cp->feat.msrlist )
+    {
+        vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        v->arch.hvm.vmx.tertiary_exec_control |= TERTIARY_EXEC_ENABLE_MSRLIST;
+        vmx_update_tertiary_exec_control(v);
+    }
+    else if ( v->arch.hvm.vmx.tertiary_exec_control &
+              TERTIARY_EXEC_ENABLE_MSRLIST )
+    {
+        vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        v->arch.hvm.vmx.tertiary_exec_control &= ~TERTIARY_EXEC_ENABLE_MSRLIST;
+        vmx_update_tertiary_exec_control(v);
+    }
+
  out:
     vmx_vmcs_exit(v);
 
@@ -3897,6 +3911,22 @@ gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
+static bool cf_check is_msrlist(
+    const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt)
+{
+
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) )
+    {
+        unsigned int rm, reg;
+        int mode = x86_insn_modrm(state, &rm, &reg);
+
+        /* This also includes WRMSRNS; should be okay. */
+        return mode == 3 && rm == 6 && !reg;
+    }
+
+    return false;
+}
+
 static void vmx_do_extint(struct cpu_user_regs *regs)
 {
     unsigned long vector;
@@ -4704,6 +4734,17 @@ void asmlinkage vmx_vmexit_handler(struc
         }
         break;
 
+    case EXIT_REASON_RDMSRLIST:
+    case EXIT_REASON_WRMSRLIST:
+        if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist )
+        {
+            ASSERT_UNREACHABLE();
+            hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
+        }
+        else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") )
+            hvm_inject_hw_exception(X86_EXC_GP, 0);
+        break;
+
     case EXIT_REASON_VMXOFF:
     case EXIT_REASON_VMXON:
     case EXIT_REASON_VMCLEAR:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -275,8 +275,13 @@ void vmx_vmcs_reload(struct vcpu *v);
 #define TERTIARY_EXEC_EPT_PAGING_WRITE          BIT(2, UL)
 #define TERTIARY_EXEC_GUEST_PAGING_VERIFY       BIT(3, UL)
 #define TERTIARY_EXEC_IPI_VIRT                  BIT(4, UL)
+#define TERTIARY_EXEC_ENABLE_MSRLIST            BIT(6, UL)
 #define TERTIARY_EXEC_VIRT_SPEC_CTRL            BIT(7, UL)
 
+#define cpu_has_vmx_msrlist \
+    (IS_ENABLED(CONFIG_INTEL_VMX) && \
+     (vmx_caps.tertiary_exec_control & TERTIARY_EXEC_ENABLE_MSRLIST))
+
 #define cpu_has_vmx_virt_spec_ctrl \
      (vmx_caps.tertiary_exec_control & TERTIARY_EXEC_VIRT_SPEC_CTRL)
 
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -201,6 +201,8 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_BUS_LOCK            74
 #define EXIT_REASON_NOTIFY              75
+#define EXIT_REASON_RDMSRLIST           78
+#define EXIT_REASON_WRMSRLIST           79
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -24,6 +24,8 @@
 #define  APIC_BASE_ENABLE                   (_AC(1, ULL) << 11)
 #define  APIC_BASE_ADDR_MASK                _AC(0x000ffffffffff000, ULL)
 
+#define MSR_BARRIER                         0x0000002f
+
 #define MSR_TEST_CTRL                       0x00000033
 #define  TEST_CTRL_SPLITLOCK_DETECT         (_AC(1, ULL) << 29)
 #define  TEST_CTRL_SPLITLOCK_DISABLE        (_AC(1, ULL) << 31)
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 76
+#define VMX_PERF_EXIT_REASON_SIZE 80
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -148,6 +148,12 @@ int guest_rdmsr(struct vcpu *v, uint32_t
     case MSR_AMD_PPIN:
         goto gp_fault;
 
+    case MSR_BARRIER:
+        if ( !cp->feat.msrlist )
+            goto gp_fault;
+        *val = 0;
+        break;
+
     case MSR_IA32_FEATURE_CONTROL:
         /*
          * Architecturally, availability of this MSR is enumerated by the
@@ -429,6 +435,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t
         uint64_t rsvd;
 
         /* Read-only */
+    case MSR_BARRIER:
     case MSR_IA32_PLATFORM_ID:
     case MSR_CORE_CAPABILITIES:
     case MSR_INTEL_CORE_THREAD_COUNT:
--- a/xen/arch/x86/x86_emulate/0f01.c
+++ b/xen/arch/x86/x86_emulate/0f01.c
@@ -11,6 +11,7 @@
 #include "private.h"
 
 #ifdef __XEN__
+#include <xen/event.h>
 #include <asm/prot-key.h>
 #endif
 
@@ -28,6 +29,7 @@ int x86emul_0f01(struct x86_emulate_stat
     switch ( s->modrm )
     {
         unsigned long base, limit, cr0, cr0w, cr4;
+        unsigned int n;
         struct segment_register sreg;
         uint64_t msr_val;
 
@@ -42,6 +44,64 @@ int x86emul_0f01(struct x86_emulate_stat
                                 ((uint64_t)regs->r(dx) << 32) | regs->eax,
                                 ctxt);
             goto done;
+
+        case vex_f3: /* wrmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->esi & 7) ||
+                                  (regs->edi & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->write_msr);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                base = msr_val;
+                if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+
+#ifdef __XEN__
+                if ( regs->r(cx) && local_events_need_delivery() )
+                {
+                    rc = X86EMUL_RETRY;
+                    break;
+                }
+#endif
+            }
+            goto done;
+
+        case vex_f2: /* rdmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->esi & 7) ||
+                                  (regs->edi & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->read_msr || !ops->write);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                if ( (rc = ops->read_msr(msr_val, &msr_val,
+                                         ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write(x86_seg_none, regs->r(di) + n * 8,
+                                      &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+            }
+            if ( rc != X86EMUL_OKAY )
+                ctxt->regs->r(cx) = regs->r(cx);
+            goto done;
         }
         generate_exception(X86_EXC_UD);
 
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -611,6 +611,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_lkgs()        (ctxt->cpuid->feat.lkgs)
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
+#define vcpu_has_msrlist()     (ctxt->cpuid->feat.msrlist)
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
 #define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16)
--- a/xen/arch/x86/x86_emulate/util.c
+++ b/xen/arch/x86/x86_emulate/util.c
@@ -100,6 +100,9 @@ bool cf_check x86_insn_is_mem_access(con
         break;
 
     case X86EMUL_OPC(0x0f, 0x01):
+        /* {RD,WR}MSRLIST */
+        if ( mode_64bit() && s->modrm == 0xc6 )
+            return s->vex.pfx >= vex_f3;
         /* Cover CLZERO. */
         return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
     }
@@ -160,7 +163,11 @@ bool cf_check x86_insn_is_mem_write(cons
         case 0xff: /* Grp5 */
             break;
 
-        case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */
+        case X86EMUL_OPC(0x0f, 0x01):
+            /* RDMSRLIST */
+            if ( mode_64bit() && s->modrm == 0xc6 )
+                return s->vex.pfx == vex_f2;
+            /* CLZERO is another odd one. */
             return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
 
         default:
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -283,7 +283,7 @@ def crunch_numbers(state):
         # NO_LMSL indicates the absense of Long Mode Segment Limits, which
         # have been dropped in hardware.
         LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
-             LKGS],
+             LKGS, MSRLIST],
 
         # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
         # standard 3DNow in the earlier K6 processors.




* [PATCH v9 03/10] x86emul: support USER_MSR instructions
From: Jan Beulich @ 2025-11-24 14:58 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

While UWRMSR probably isn't of much use as long as we don't support
UINTR, URDMSR may well be useful to guests even without that (depending
on which MSRs OSes are willing to permit access to).

Since the two VEX encodings introduce a lonely opcode point in map 7,
for now don't bother introducing a full 256-entry table.
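
For reference, the VEX byte patterns as exercised by the test harness
additions below (map 7, opcode 0xf8; the register operand lives in
ModRM.rm, with ModRM.reg clear and VEX.vvvv unused):

    c4 e7 7b f8 /r imm32    /* urdmsr $imm32, %r64  (F2 prefix, W0, L0) */
    c4 e7 7a f8 /r imm32    /* uwrmsr %r64, $imm32  (F3 prefix, W0, L0) */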

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
The retaining of (possible) #PF from the bitmap access is "speculative"
(the spec doesn't mention #PF as a possible exception; conceivably this
might also need converting to #GP).
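
For reference, a sketch of the bitmap lookup as implemented here (bit 0
of MSR_USER_MSR_CTL is the enable bit, with the 4k-aligned bitmap
address in the upper bits; read8() is a placeholder helper):

    base = user_msr_ctl & ~0xfffUL;    /* page-aligned bitmap */
    off  = (is_write ? 0x800 : 0)      /* write half starts at 2k */
           + (msr >> 3);               /* MSR index capped at 0x3fff */
    if ( !(read8(base + off) & (1 << (msr & 7))) )
        /* ... #GP(0) ... */;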

I'm a little wary of the "MSRs Writeable by UWRMSR" table that the spec
has, and that our code thus also enforces: as new MSRs are added to that
table, we'll need piecemeal updates to the switch() statement.

The forced setting of cpu_policy.feat.utmr could likely be done
globally, i.e. early in main(). Limiting its scope is merely "just in
case". Thoughts?
---
v8: Switch to using fallthrough pseudo-keyword. Re-base.
v7.1: Add MSR-specific feature checks for UWRMSR (incl the UTMR feature
      bit and its overriding in the test harness).
v7: Add missing vcpu_must_have() and override in emul_test_init(). Use
    MSR constants even more.
v6: Add MSR_UINTR_TIMER to header. Use MSR constants in test harness.
    Re-base.
v5: Correct ModR/M.reg check for VEX-encoded forms. Cosmetic test
    harness adjustment. Re-base.
v4: MSR index input regs are 64-bit (albeit only the APX spec has it
    this way for now).
v3: New.

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -864,7 +864,9 @@ static const struct {
     { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */
     { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */
     { { 0xf8 }, { 2, 2 }, F, W, pfx_66 }, /* movdir64b */
+    { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f3 }, /* uwrmsr */
     { { 0xf8 }, { 2, 2 }, F, W, pfx_f3 }, /* enqcmds */
+    { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f2 }, /* urdmsr */
     { { 0xf8 }, { 2, 2 }, F, W, pfx_f2 }, /* enqcmd */
     { { 0xf9 }, { 2, 2 }, F, W }, /* movdiri */
 };
@@ -1516,6 +1518,9 @@ static const struct vex {
     { { 0xde }, 3, T, R, pfx_66, W0, L0 }, /* vsm3rnds2 */
     { { 0xdf }, 3, T, R, pfx_66, WIG, Ln }, /* vaeskeygenassist */
     { { 0xf0 }, 3, T, R, pfx_f2, Wn, L0 }, /* rorx */
+}, vex_map7[] = {
+    { { 0xf8, 0xc0 }, 6, F, N, pfx_f3, W0, L0 }, /* uwrmsr */
+    { { 0xf8, 0xc0 }, 6, F, N, pfx_f2, W0, L0 }, /* urdmsr */
 };
 
 static const struct {
@@ -1525,6 +1530,10 @@ static const struct {
     { vex_0f,   ARRAY_SIZE(vex_0f) },
     { vex_0f38, ARRAY_SIZE(vex_0f38) },
     { vex_0f3a, ARRAY_SIZE(vex_0f3a) },
+    { NULL,     0 }, /* map 4 */
+    { NULL,     0 }, /* map 5 */
+    { NULL,     0 }, /* map 6 */
+    { vex_map7, ARRAY_SIZE(vex_map7) },
 };
 
 static const struct xop {
@@ -2420,7 +2429,8 @@ void predicates_test(void *instr, struct
 
                 if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W0) )
                 {
-                    memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len);
+                    memcpy(ptr, vex[x].tbl[t].opc,
+                           MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc)));
 
                     if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) )
                         do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr),
@@ -2430,7 +2440,8 @@ void predicates_test(void *instr, struct
                     if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) )
                     {
                         ptr[-1] |= 4;
-                        memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len);
+                        memcpy(ptr, vex[x].tbl[t].opc,
+                               MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc)));
 
                         do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr),
                                 vex[x].tbl[t].modrm ? (void *)ptr - instr + 1 : 0,
@@ -2441,7 +2452,8 @@ void predicates_test(void *instr, struct
                 if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W1) )
                 {
                     ptr[-1] = 0xf8 | vex[x].tbl[t].pfx;
-                    memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len);
+                    memcpy(ptr, vex[x].tbl[t].opc,
+                           MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc)));
 
                     if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) )
                         do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr),
@@ -2451,7 +2463,8 @@ void predicates_test(void *instr, struct
                     if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) )
                     {
                         ptr[-1] |= 4;
-                        memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len);
+                        memcpy(ptr, vex[x].tbl[t].opc,
+                               MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc)));
 
                         do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr),
                                 vex[x].tbl[t].modrm ? (void *)ptr - instr + 1 : 0,
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -674,6 +674,7 @@ static int blk(
 
 #ifdef __x86_64__
 static unsigned long gs_base, gs_base_shadow;
+static unsigned long uintr_timer;
 #endif
 
 static int read_segment(
@@ -708,6 +709,15 @@ static int write_segment(
 
     return X86EMUL_OKAY;
 }
+
+static const uint8_t __attribute__((aligned(0x1000))) umsr_bitmap[0x1000] = {
+#define RD(msr) [(msr) >> 3] = 1 << ((msr) & 7)
+#define WR(msr) [0x800 + ((msr) >> 3)] = 1 << ((msr) & 7)
+    RD(MSR_IA32_APERF),
+    WR(MSR_UINTR_TIMER),
+#undef WR
+#undef RD
+};
 #endif
 
 static int read_msr(
@@ -717,10 +727,22 @@ static int read_msr(
 {
     switch ( reg )
     {
+#ifdef __x86_64__
+    case MSR_USER_MSR_CTL:
+        *val = (unsigned long)umsr_bitmap | 1;
+        return X86EMUL_OKAY;
+#endif
+
     case MSR_BARRIER:
         *val = 0;
         return X86EMUL_OKAY;
 
+    case MSR_IA32_APERF:
+#define APERF_LO_VALUE 0xAEAEAEAE
+#define APERF_HI_VALUE 0xEAEAEAEA
+        *val = ((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE;
+        return X86EMUL_OKAY;
+
     case MSR_EFER:
         *val = ctxt->addr_size > 32 ? EFER_LME | EFER_LMA : 0;
         return X86EMUL_OKAY;
@@ -756,6 +778,12 @@ static int write_msr(
 {
     switch ( reg )
     {
+    case MSR_UINTR_TIMER:
+        if ( ctxt->addr_size < 64 )
+            break;
+        uintr_timer = val;
+        return X86EMUL_OKAY;
+
     case MSR_GS_BASE:
         if ( ctxt->addr_size < 64 || !is_canonical_address(val) )
             break;
@@ -1484,6 +1512,68 @@ int main(int argc, char **argv)
          (gs_base != 0x0000222244446666UL) ||
          (gs_base_shadow != 0x0000111122224444UL) )
         goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing urdmsr %rdx,%rcx...");
+    instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xd1;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rdx = MSR_IA32_APERF;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[5]) ||
+         (regs.rcx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing urdmsr $MSR_IA32_APERF,%rdx...");
+    instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7b; instr[3] = 0xf8; instr[4] = 0xc2;
+    *(uint32_t *)&instr[5] = MSR_IA32_APERF;
+    regs.rip = (unsigned long)&instr[0];
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[9]) ||
+         (regs.rdx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) )
+        goto fail;
+    printf("okay\n");
+
+    /* Our write_msr() knows of MSR_UINTR_TIMER. */
+    i = cpu_policy.feat.utmr;
+    cpu_policy.feat.utmr = true;
+
+    printf("%-40s", "Testing uwrmsr %rdi,%rsi...");
+    instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xf7;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = MSR_UINTR_TIMER;
+    regs.rdi = 0x0011223344556677UL;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[5]) ||
+         (uintr_timer != 0x0011223344556677UL) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing uwrmsr %rsi,$MSR_UINTR_TIMER...");
+    instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7a; instr[3] = 0xf8; instr[4] = 0xc6;
+    *(uint32_t *)&instr[5] = MSR_UINTR_TIMER;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = 0x8877665544332211UL;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[9]) ||
+         (uintr_timer != 0x8877665544332211UL) )
+        goto fail;
+    printf("okay\n");
+
+    cpu_policy.feat.utmr = i;
+
+    printf("%-40s", "Testing uwrmsr %rsi,$MSR_UARCH_MISC_CTRL...");
+    *(uint32_t *)&instr[5] = MSR_UARCH_MISC_CTRL;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = 0;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_EXCEPTION) ||
+         (regs.rip != (unsigned long)&instr[0]) )
+        goto fail;
 
     emulops.write_msr     = NULL;
 #endif
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -88,6 +88,7 @@ bool emul_test_init(void)
     cpu_policy.feat.lkgs = true;
     cpu_policy.feat.wrmsrns = true;
     cpu_policy.feat.msrlist = true;
+    cpu_policy.feat.user_msr = true;
     cpu_policy.extd.clzero = true;
 
     if ( cpu_has_xsave )
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -24,6 +24,10 @@
 #define  APIC_BASE_ENABLE                   (_AC(1, ULL) << 11)
 #define  APIC_BASE_ADDR_MASK                _AC(0x000ffffffffff000, ULL)
 
+#define MSR_USER_MSR_CTL                    0x0000001c
+#define  USER_MSR_ENABLE                    (_AC(1, ULL) <<  0)
+#define  USER_MSR_ADDR_MASK                 0xfffffffffffff000ULL
+
 #define MSR_BARRIER                         0x0000002f
 
 #define MSR_TEST_CTRL                       0x00000033
@@ -206,6 +210,8 @@
 #define  MCU_CONTROL_DIS_MCU_LOAD           (_AC(1, ULL) <<  1)
 #define  MCU_CONTROL_EN_SMM_BYPASS          (_AC(1, ULL) <<  2)
 
+#define MSR_UINTR_TIMER                     0x00001b00
+
 #define MSR_UARCH_MISC_CTRL                 0x00001b01
 #define  UARCH_CTRL_DOITM                   (_AC(1, ULL) <<  0)
 
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -903,7 +903,7 @@ decode_0f38(struct x86_emulate_state *s,
     {
     case 0x00 ... 0xef:
     case 0xf2 ... 0xf5:
-    case 0xf7 ... 0xf8:
+    case 0xf7:
     case 0xfa ... 0xff:
         s->op_bytes = 0;
         /* fall through */
@@ -948,6 +948,18 @@ decode_0f38(struct x86_emulate_state *s,
     case X86EMUL_OPC_VEX_F2(0, 0xf7): /* shrx */
         break;
 
+    case 0xf8:
+        if ( s->modrm_mod == 3 ) /* u{rd,wr}msr */
+        {
+            s->desc = DstMem | SrcReg | Mov;
+            s->op_bytes = 8;
+            s->simd_size = simd_none;
+        }
+        else /* movdir64b / enqcmd{,s} */
+            s->op_bytes = 0;
+        ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     default:
         s->op_bytes = 0;
         break;
@@ -1246,6 +1258,16 @@ int x86emul_decode(struct x86_emulate_st
                          */
                         d = twobyte_table[0x38].desc;
                         break;
+
+                    case vex_map7:
+                        opcode |= MASK_INSR(7, X86EMUL_OPC_EXT_MASK);
+                        /*
+                         * No table lookup here for now, as there's only a single
+                         * opcode point (0xf8) populated in map 7.
+                         */
+                        d = DstMem | SrcImm | ModRM | Mov;
+                        s->op_bytes = 8;
+                        break;
                     }
                 }
                 else if ( s->ext < ext_8f08 + ARRAY_SIZE(xop_table) )
@@ -1602,6 +1624,7 @@ int x86emul_decode(struct x86_emulate_st
             s->simd_size = ext8f09_table[b].simd_size;
             break;
 
+        case ext_map7:
         case ext_8f08:
         case ext_8f0a:
             /*
@@ -1816,6 +1839,7 @@ int x86emul_decode(struct x86_emulate_st
 
     case ext_map5:
     case ext_map6:
+    case ext_map7:
     case ext_8f09:
     case ext_8f0a:
         break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -203,6 +203,7 @@ enum vex_opcx {
     vex_0f3a,
     evex_map5 = 5,
     evex_map6,
+    vex_map7,
 };
 
 enum vex_pfx {
@@ -259,6 +260,7 @@ struct x86_emulate_state {
         ext_0f3a = vex_0f3a,
         ext_map5 = evex_map5,
         ext_map6 = evex_map6,
+        ext_map7 = vex_map7,
         /*
          * For XOP use values such that the respective instruction field
          * can be used without adjustment.
@@ -615,6 +617,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
 #define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16)
+#define vcpu_has_user_msr()    (ctxt->cpuid->feat.user_msr)
 
 #define vcpu_must_have(feat) \
     generate_exception_if(!vcpu_has_##feat(), X86_EXC_UD)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7024,10 +7024,73 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
-    case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */
-    case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */
+    case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 / uwrmsr r64,r32 */
+    case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 / urdmsr r32,r64 */
+        if ( ea.type == OP_MEM )
+            goto enqcmd;
+        imm1 = src.val;
+        fallthrough;
+    case X86EMUL_OPC_VEX_F3(7, 0xf8): /* uwrmsr r64,imm32 */
+    case X86EMUL_OPC_VEX_F2(7, 0xf8): /* urdmsr imm32,r64 */
+        generate_exception_if(!mode_64bit() || ea.type != OP_REG, X86_EXC_UD);
+        generate_exception_if(vex.l || vex.w, X86_EXC_UD);
+        generate_exception_if(vex.opcx && ((modrm_reg & 7) || vex.reg != 0xf),
+                              X86_EXC_UD);
+        vcpu_must_have(user_msr);
+        fail_if(!ops->read_msr);
+        if ( ops->read_msr(MSR_USER_MSR_CTL, &msr_val, ctxt) != X86EMUL_OKAY )
+        {
+            x86_emul_reset_event(ctxt);
+            msr_val = 0;
+        }
+        generate_exception_if(!(msr_val & USER_MSR_ENABLE), X86_EXC_UD);
+        generate_exception_if(imm1 & ~0x3fff, X86_EXC_GP, 0);
+
+        /* Check the corresponding bitmap. */
+        ea.mem.off = msr_val & ~0xfff;
+        if ( vex.pfx != vex_f2 )
+            ea.mem.off += 0x800;
+        ea.mem.off += imm1 >> 3;
+        if ( (rc = ops->read(x86_seg_sys, ea.mem.off, &b, 1,
+                             ctxt)) != X86EMUL_OKAY )
+            goto done;
+        generate_exception_if(!(b & (1 << (imm1 & 7))), X86_EXC_GP, 0);
+
+        /* Carry out the actual MSR access. */
+        if ( vex.pfx == vex_f2 )
+        {
+            /* urdmsr */
+            if ( (rc = ops->read_msr(imm1, &msr_val, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            dst.val = msr_val;
+            ASSERT(dst.type == OP_REG);
+            dst.bytes = 8;
+        }
+        else
+        {
+            /* uwrmsr */
+            switch ( imm1 )
+            {
+            case 0x1b00: /* UINTR_TIMER */
+                generate_exception_if(!cp->feat.utmr, X86_EXC_GP, 0);
+                break;
+
+            case 0x1b01: /* UARCH_MISC_CTL */
+                generate_exception_if(!cp->arch_caps.doitm, X86_EXC_GP, 0);
+                break;
+
+            default:
+                generate_exception(X86_EXC_GP, 0);
+            }
+            fail_if(!ops->write_msr);
+            if ( (rc = ops->write_msr(imm1, dst.val, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            dst.type = OP_NONE;
+        }
+        break;
+
+    enqcmd:
         host_and_vcpu_must_have(enqcmd);
-        generate_exception_if(ea.type != OP_MEM, X86_EXC_UD);
         generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), X86_EXC_GP, 0);
         src.val = truncate_ea(*dst.reg);
         generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -358,7 +358,9 @@ XEN_CPUFEATURE(AVX_VNNI_INT8,      15*32
 XEN_CPUFEATURE(AVX_NE_CONVERT,     15*32+ 5) /*A  AVX-NE-CONVERT Instructions */
 XEN_CPUFEATURE(AMX_COMPLEX,        15*32+ 8) /*   AMX Complex Instructions */
 XEN_CPUFEATURE(AVX_VNNI_INT16,     15*32+10) /*A  AVX-VNNI-INT16 Instructions */
+XEN_CPUFEATURE(UTMR,               15*32+13) /*   User Timer */
 XEN_CPUFEATURE(PREFETCHI,          15*32+14) /*A  PREFETCHIT{0,1} Instructions */
+XEN_CPUFEATURE(USER_MSR,           15*32+15) /*   U{RD,WR}MSR Instructions */
 XEN_CPUFEATURE(UIRET_UIF,          15*32+17) /*   UIRET updates UIF */
 XEN_CPUFEATURE(CET_SSS,            15*32+18) /*   CET Supervisor Shadow Stacks safe to use */
 XEN_CPUFEATURE(SLSM,               15*32+24) /*   Static Lockstep Mode */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -283,7 +283,7 @@ def crunch_numbers(state):
         # NO_LMSL indicates the absense of Long Mode Segment Limits, which
         # have been dropped in hardware.
         LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
-             LKGS, MSRLIST],
+             LKGS, MSRLIST, USER_MSR],
 
         # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
         # standard 3DNow in the earlier K6 processors.




* [PATCH v9 04/10] x86/cpu-policy: re-arrange no-VMX logic
From: Jan Beulich @ 2025-11-24 14:59 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

Move the PKS check into an "else" for the corresponding "if()", such
that further adjustments (like for USER_MSR) can easily be put there as
well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Re-base.
v4: New.

--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -812,19 +812,20 @@ static void __init calculate_hvm_max_pol
         if ( !cpu_has_vmx_xsaves )
             __clear_bit(X86_FEATURE_XSAVES, fs);
     }
+    else
+    {
+        /*
+         * Xen doesn't use PKS, so the guest support for it has opted to not use
+         * the VMCS load/save controls for efficiency reasons.  This depends on
+         * the exact vmentry/exit behaviour, so don't expose PKS in other
+         * situations until someone has cross-checked the behaviour for safety.
+         */
+        __clear_bit(X86_FEATURE_PKS, fs);
+    }
 
     if ( !cpu_has_vmx_msrlist )
         __clear_bit(X86_FEATURE_MSRLIST, fs);
 
-    /*
-     * Xen doesn't use PKS, so the guest support for it has opted to not use
-     * the VMCS load/save controls for efficiency reasons.  This depends on
-     * the exact vmentry/exit behaviour, so don't expose PKS in other
-     * situations until someone has cross-checked the behaviour for safety.
-     */
-    if ( !cpu_has_vmx )
-        __clear_bit(X86_FEATURE_PKS, fs);
-
     /* 
      * Make adjustments to possible (nested) virtualization features exposed
      * to the guest




* [PATCH v9 05/10] VMX: support USER-MSR
From: Jan Beulich @ 2025-11-24 15:00 UTC
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

Hook up the new VM exit codes and handle guest accesses, context switch,
and save/restore. At least for now don't allow the guest direct access
to the control MSR; this may need changing if guests were to frequently
access it (e.g. on their own context switch path).
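
For reference, the context switch interaction added below condenses to
(both vmx.c hunks shown together):

    /* vcpu -> host: disable the feature while Xen's context is active */
    if ( msrs->user_msr_ctl.enable )
        wrmsrns(MSR_USER_MSR_CTL, 0);

    /* host -> vcpu: reinstate the guest's chosen value */
    if ( msrs->user_msr_ctl.enable )
        wrmsrns(MSR_USER_MSR_CTL, msrs->user_msr_ctl.raw);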

While there, also correct an off-by-one in union ldt_or_tr_instr_info's
comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Needing to change two places in hvm.c continues to be unhelpful; I
recall having already once forgotten to also adjust hvm_load_cpu_msrs()
for XFD. Considering that MSRs typically arrive in the order the table
has them, couldn't we incrementally look up the incoming MSR index
there, falling back to a full lookup only when the incremental lookup
fails (and thus not normally re-iterate through the initial part of the
array)?
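
A minimal sketch of that idea (hypothetical, not part of this patch;
msr_expected() and the cursor handling are made-up names, assuming the
save record normally follows msrs_to_send[] ordering):

    /* Hypothetical incremental lookup over msrs_to_send[]. */
    static bool msr_expected(uint32_t idx, unsigned int *cursor)
    {
        unsigned int i;

        /* Fast path: MSRs normally arrive in table order. */
        for ( i = *cursor; i < ARRAY_SIZE(msrs_to_send); ++i )
            if ( msrs_to_send[i] == idx )
            {
                *cursor = i + 1;
                return true;
            }

        /* Fall back to a full lookup for out-of-order input. */
        for ( i = 0; i < *cursor; ++i )
            if ( msrs_to_send[i] == idx )
                return true;

        return false;
    }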

Said comment in union ldt_or_tr_instr_info is further odd (same for
union gdt_or_idt_instr_info's) in that Instruction Information is only a
32-bit field. Hence bits 32-63 aren't undefined, but simply don't exist.

RFC: The wee attempt to "deal" with nested is likely wrong, but I'm
     afraid I simply don't know where such enforcement would be done
     properly. Returning an error there is also commented out, since
     domain_cpu_policy_changed() returns void without "x86/xstate:
     re-size save area when CPUID policy changes" in place.
---
v9: Use wrmsrns(). Do renames where bits are also going to be used for
    MSR-IMM. Re-base.
v8: Re-base.
v5: Introduce user_msr_gpr().
v4: New.

--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -821,6 +821,12 @@ static void __init calculate_hvm_max_pol
          * situations until someone has cross-checked the behaviour for safety.
          */
         __clear_bit(X86_FEATURE_PKS, fs);
+
+        /*
+         * Don't expose USER-MSR until it is known how (if at all) it is
+         * virtualized on SVM.
+         */
+        __clear_bit(X86_FEATURE_USER_MSR, fs);
     }
 
     if ( !cpu_has_vmx_msrlist )
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -452,6 +452,10 @@ void domain_cpu_policy_changed(struct do
         }
     }
 
+    /* Nested doesn't have the necessary processing, yet. */
+    if ( nestedhvm_enabled(d) && p->feat.user_msr )
+        return /* -EINVAL */;
+
     for_each_vcpu ( d, v )
     {
         cpu_policy_updated(v);
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1391,6 +1391,7 @@ static int cf_check hvm_load_cpu_xsave_s
 
 #define HVM_CPU_MSR_SIZE(cnt) offsetof(struct hvm_msr, msr[cnt])
 static const uint32_t msrs_to_send[] = {
+    MSR_USER_MSR_CTL,
     MSR_SPEC_CTRL,
     MSR_INTEL_MISC_FEATURES_ENABLES,
     MSR_PKRS,
@@ -1545,6 +1546,7 @@ static int cf_check hvm_load_cpu_msrs(st
         {
             int rc;
 
+        case MSR_USER_MSR_CTL:
         case MSR_SPEC_CTRL:
         case MSR_INTEL_MISC_FEATURES_ENABLES:
         case MSR_PKRS:
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -696,13 +696,18 @@ static void cf_check vmx_vcpu_destroy(st
 }
 
 /*
- * To avoid MSR save/restore at every VM exit/entry time, we restore
- * the x86_64 specific MSRs at domain switch time. Since these MSRs
- * are not modified once set for para domains, we don't save them,
- * but simply reset them to values set in percpu_traps_init().
+ * To avoid MSR save/restore at every VM exit/entry time, we restore the
+ * x86_64 specific MSRs at vcpu switch time. Since these MSRs are not
+ * modified once set for para domains, we don't save them, but simply clear
+ * them or reset them to values set in percpu_traps_init().
  */
-static void vmx_restore_host_msrs(void)
+static void vmx_restore_host_msrs(const struct vcpu *v)
 {
+    const struct vcpu_msrs *msrs = v->arch.msrs;
+
+    if ( msrs->user_msr_ctl.enable )
+        wrmsrns(MSR_USER_MSR_CTL, 0);
+
     /* No PV guests?  No need to restore host SYSCALL infrastructure. */
     if ( !IS_ENABLED(CONFIG_PV) )
         return;
@@ -756,6 +761,9 @@ static void vmx_restore_guest_msrs(struc
 
     if ( cp->feat.pks )
         wrpkrs(msrs->pkrs);
+
+    if ( msrs->user_msr_ctl.enable )
+        wrmsrns(MSR_USER_MSR_CTL, msrs->user_msr_ctl.raw);
 }
 
 void vmx_update_cpu_exec_control(struct vcpu *v)
@@ -1199,7 +1207,7 @@ static void cf_check vmx_ctxt_switch_fro
     if ( !v->arch.fully_eager_fpu )
         vmx_fpu_leave(v);
     vmx_save_guest_msrs(v);
-    vmx_restore_host_msrs();
+    vmx_restore_host_msrs(v);
     vmx_save_dr(v);
 
     if ( v->domain->arch.hvm.pi_ops.flags & PI_CSW_FROM )
@@ -4245,6 +4253,14 @@ static int vmx_handle_apic_write(void)
     return vlapic_apicv_write(current, exit_qualification & 0xfff);
 }
 
+static unsigned int msr_imm_gpr(void)
+{
+    msr_imm_instr_info_t info;
+
+    __vmread(VMX_INSTRUCTION_INFO, &info.raw);
+    return info.gpr;
+}
+
 static void undo_nmis_unblocked_by_iret(void)
 {
     unsigned long guest_info;
@@ -4745,6 +4761,41 @@ void asmlinkage vmx_vmexit_handler(struc
             hvm_inject_hw_exception(X86_EXC_GP, 0);
         break;
 
+    case EXIT_REASON_URDMSR:
+    {
+        uint64_t msr_content = 0;
+
+        __vmread(EXIT_QUALIFICATION, &exit_qualification);
+        switch ( hvm_msr_read_intercept(exit_qualification, &msr_content) )
+        {
+        case X86EMUL_OKAY:
+            *decode_gpr(regs, msr_imm_gpr()) = msr_content;
+            update_guest_eip(); /* Safe: URDMSR */
+            break;
+
+        case X86EMUL_EXCEPTION:
+            hvm_inject_hw_exception(X86_EXC_GP, 0);
+            break;
+        }
+        break;
+    }
+
+    case EXIT_REASON_UWRMSR:
+        __vmread(EXIT_QUALIFICATION, &exit_qualification);
+        switch ( hvm_msr_write_intercept(exit_qualification,
+                                         *decode_gpr(regs, msr_imm_gpr()),
+                                         true) )
+        {
+        case X86EMUL_OKAY:
+            update_guest_eip(); /* Safe: UWRMSR */
+            break;
+
+        case X86EMUL_EXCEPTION:
+            hvm_inject_hw_exception(X86_EXC_GP, 0);
+            break;
+        }
+        break;
+
     case EXIT_REASON_VMXOFF:
     case EXIT_REASON_VMXON:
     case EXIT_REASON_VMCLEAR:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -203,6 +203,8 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_NOTIFY              75
 #define EXIT_REASON_RDMSRLIST           78
 #define EXIT_REASON_WRMSRLIST           79
+#define EXIT_REASON_URDMSR              80
+#define EXIT_REASON_UWRMSR              81
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
@@ -578,8 +580,18 @@ typedef union ldt_or_tr_instr_info {
         base_reg_invalid        :1,  /* bit 27 - Base register invalid */
         instr_identity          :1,  /* bit 28 - 0:LDT, 1:TR */
         instr_write             :1,  /* bit 29 - 0:store, 1:load */
-                                :34; /* bits 31:63 - Undefined */
+                                :34; /* bits 30:63 - Undefined */
     };
 } ldt_or_tr_instr_info_t;
 
+/* VM-Exit instruction info for URDMSR and UWRMSR */
+typedef union msr_imm_instr_info {
+    unsigned long raw;
+    struct {
+        unsigned int            :3,  /* Bits 0:2 - Undefined */
+        gpr                     :4,  /* Bits 3:6 - Source/Destination register */
+                                :25; /* bits 7:31 - Undefined */
+    };
+} msr_imm_instr_info_t;
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
--- a/xen/arch/x86/include/asm/guest-msr.h
+++ b/xen/arch/x86/include/asm/guest-msr.h
@@ -8,6 +8,20 @@
 struct vcpu_msrs
 {
     /*
+     * 0x0000001c - MSR_USER_MSR_CTL
+     *
+     * Value is guest chosen, and always loaded in vcpu context.
+     */
+    union {
+        uint64_t raw;
+        struct {
+            bool enable:1;
+            unsigned int :11;
+            unsigned long bitmap:52;
+        };
+    } user_msr_ctl;
+
+    /*
      * 0x00000048 - MSR_SPEC_CTRL
      * 0xc001011f - MSR_VIRT_SPEC_CTRL (if X86_FEATURE_AMD_SSBD)
      *
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 80
+#define VMX_PERF_EXIT_REASON_SIZE 82
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -280,6 +280,12 @@ int guest_rdmsr(struct vcpu *v, uint32_t
         *val = msrs->xss.raw;
         break;
 
+    case MSR_USER_MSR_CTL:
+        if ( !cp->feat.user_msr )
+            goto gp_fault;
+        *val = msrs->user_msr_ctl.raw;
+        break;
+
     case 0x40000000 ... 0x400001ff:
         if ( is_viridian_domain(d) )
         {
@@ -618,6 +624,19 @@ int guest_wrmsr(struct vcpu *v, uint32_t
         msrs->xss.raw = val;
         break;
 
+    case MSR_USER_MSR_CTL:
+        if ( !cp->feat.user_msr )
+            goto gp_fault;
+
+        if ( (val & ~(USER_MSR_ENABLE | USER_MSR_ADDR_MASK)) ||
+             !is_canonical_address(val) )
+            goto gp_fault;
+
+        msrs->user_msr_ctl.raw = val;
+        if ( v == curr )
+            wrmsrns(MSR_USER_MSR_CTL, val);
+        break;
+
     case 0x40000000 ... 0x400001ff:
         if ( is_viridian_domain(d) )
         {
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -360,7 +360,7 @@ XEN_CPUFEATURE(AMX_COMPLEX,        15*32
 XEN_CPUFEATURE(AVX_VNNI_INT16,     15*32+10) /*A  AVX-VNNI-INT16 Instructions */
 XEN_CPUFEATURE(UTMR,               15*32+13) /*   User Timer */
 XEN_CPUFEATURE(PREFETCHI,          15*32+14) /*A  PREFETCHIT{0,1} Instructions */
-XEN_CPUFEATURE(USER_MSR,           15*32+15) /*   U{RD,WR}MSR Instructions */
+XEN_CPUFEATURE(USER_MSR,           15*32+15) /*s  U{RD,WR}MSR Instructions */
 XEN_CPUFEATURE(UIRET_UIF,          15*32+17) /*   UIRET updates UIF */
 XEN_CPUFEATURE(CET_SSS,            15*32+18) /*   CET Supervisor Shadow Stacks safe to use */
 XEN_CPUFEATURE(SLSM,               15*32+24) /*   Static Lockstep Mode */



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v9 06/10] x86emul: support MSR_IMM instructions
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (4 preceding siblings ...)
  2025-11-24 15:00 ` [PATCH v9 05/10] VMX: support USER-MSR Jan Beulich
@ 2025-11-24 15:00 ` Jan Beulich
  2025-11-24 15:00 ` [PATCH v9 07/10] VMX: support MSR-IMM Jan Beulich
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:00 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

Encoding-wise these are very similar to URDMSR/UWRMSR, so existing logic
is easy to extend.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC only for now, as the VMX part is missing: The existing intercepts
can't be re-used unmodified, as those require the MSR index to be
fetched from guest ECX.
---
v8: Don't mark the feature 's' just yet. Re-base.
v7: New.

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -1519,6 +1519,8 @@ static const struct vex {
     { { 0xdf }, 3, T, R, pfx_66, WIG, Ln }, /* vaeskeygenassist */
     { { 0xf0 }, 3, T, R, pfx_f2, Wn, L0 }, /* rorx */
 }, vex_map7[] = {
+    { { 0xf6, 0xc0 }, 6, F, N, pfx_f3, W0, L0 }, /* wrmsrns */
+    { { 0xf6, 0xc0 }, 6, F, N, pfx_f2, W0, L0 }, /* rdmsr */
     { { 0xf8, 0xc0 }, 6, F, N, pfx_f3, W0, L0 }, /* uwrmsr */
     { { 0xf8, 0xc0 }, 6, F, N, pfx_f2, W0, L0 }, /* urdmsr */
 };
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1574,6 +1574,30 @@ int main(int argc, char **argv)
     if ( (rc != X86EMUL_EXCEPTION) ||
          (regs.rip != (unsigned long)&instr[0]) )
         goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing rdmsr $MSR_GS_BASE,%rdx...");
+    instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7b; instr[3] = 0xf6; instr[4] = 0xc2;
+    *(uint32_t *)&instr[5] = MSR_GS_BASE;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rdx = ~gs_base;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[9]) ||
+         (regs.rdx != gs_base) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing wrmsrns %rsi,$MSR_SHADOW_GS_BASE...");
+    instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7a; instr[3] = 0xf6; instr[4] = 0xc6;
+    *(uint32_t *)&instr[5] = MSR_SHADOW_GS_BASE;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = 0x665544332211UL;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[9]) ||
+         (gs_base_shadow != 0x665544332211UL) )
+        goto fail;
 
     emulops.write_msr     = NULL;
 #endif
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -88,6 +88,7 @@ bool emul_test_init(void)
     cpu_policy.feat.lkgs = true;
     cpu_policy.feat.wrmsrns = true;
     cpu_policy.feat.msrlist = true;
+    cpu_policy.feat.msr_imm = true;
     cpu_policy.feat.user_msr = true;
     cpu_policy.extd.clzero = true;
 
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -1262,8 +1262,9 @@ int x86emul_decode(struct x86_emulate_st
                     case vex_map7:
                         opcode |= MASK_INSR(7, X86EMUL_OPC_EXT_MASK);
                         /*
-                         * No table lookup here for now, as there's only a single
-                         * opcode point (0xf8) populated in map 7.
+                         * No table lookup here for now, as there are only two
+                         * (very similar) opcode points (0xf6, 0xf8) populated
+                         * in map 7.
                          */
                         d = DstMem | SrcImm | ModRM | Mov;
                         s->op_bytes = 8;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -614,6 +614,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
 #define vcpu_has_msrlist()     (ctxt->cpuid->feat.msrlist)
+#define vcpu_has_msr_imm()     (ctxt->cpuid->feat.msr_imm)
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
 #define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7024,6 +7024,34 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+    case X86EMUL_OPC_VEX_F3(7, 0xf6): /* wrmsrns r64,imm32 */
+    case X86EMUL_OPC_VEX_F2(7, 0xf6): /* rdmsr imm32,r64 */
+        generate_exception_if(!mode_64bit() || ea.type != OP_REG, X86_EXC_UD);
+        generate_exception_if(vex.l || vex.w, X86_EXC_UD);
+        generate_exception_if(vex.opcx && ((modrm_reg & 7) || vex.reg != 0xf),
+                              X86_EXC_UD);
+        vcpu_must_have(msr_imm);
+        generate_exception_if(!mode_ring0(), X86_EXC_GP, 0);
+        if ( vex.pfx == vex_f2 )
+        {
+            /* urdmsr */
+            fail_if(!ops->read_msr);
+            if ( (rc = ops->read_msr(imm1, &msr_val, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            dst.val = msr_val;
+            ASSERT(dst.type == OP_REG);
+            dst.bytes = 8;
+        }
+        else
+        {
+            /* wrmsrns */
+            fail_if(!ops->write_msr);
+            if ( (rc = ops->write_msr(imm1, dst.val, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            dst.type = OP_NONE;
+        }
+        break;
+
     case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 / uwrmsr r64,r32 */
     case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 / urdmsr r32,r64 */
         if ( ea.type == OP_MEM )
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -352,6 +352,7 @@ XEN_CPUFEATURE(MCDT_NO,            13*32
 XEN_CPUFEATURE(UC_LOCK_DIS,        13*32+ 6) /*   UC-lock disable */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
+XEN_CPUFEATURE(MSR_IMM,            14*32+ 5) /*   RDMSR/WRMSRNS with immediate operand */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
 XEN_CPUFEATURE(AVX_VNNI_INT8,      15*32+ 4) /*A  AVX-VNNI-INT8 Instructions */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -283,7 +283,7 @@ def crunch_numbers(state):
         # NO_LMSL indicates the absense of Long Mode Segment Limits, which
         # have been dropped in hardware.
         LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
-             LKGS, MSRLIST, USER_MSR],
+             LKGS, MSRLIST, USER_MSR, MSR_IMM],
 
         # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
         # standard 3DNow in the earlier K6 processors.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v9 07/10] VMX: support MSR-IMM
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (5 preceding siblings ...)
  2025-11-24 15:00 ` [PATCH v9 06/10] x86emul: support MSR_IMM instructions Jan Beulich
@ 2025-11-24 15:00 ` Jan Beulich
  2025-11-26 18:50   ` Andrew Cooper
  2025-11-24 15:01 ` [PATCH v9 08/10] x86emul: support non-SIMD MOVRS Jan Beulich
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:00 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

Hook up the new VM exit codes and handle guest uses of the insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: New.
---
The lack of an enable bit is concerning; at least for the nested case
that's a security issue afaict (when L0 isn't aware of the insns, or more
specifically the exit codes).

--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -352,7 +352,7 @@ XEN_CPUFEATURE(MCDT_NO,            13*32
 XEN_CPUFEATURE(UC_LOCK_DIS,        13*32+ 6) /*   UC-lock disable */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
-XEN_CPUFEATURE(MSR_IMM,            14*32+ 5) /*   RDMSR/WRMSRNS with immediate operand */
+XEN_CPUFEATURE(MSR_IMM,            14*32+ 5) /*s  RDMSR/WRMSRNS with immediate operand */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
 XEN_CPUFEATURE(AVX_VNNI_INT8,      15*32+ 4) /*A  AVX-VNNI-INT8 Instructions */
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -823,10 +823,11 @@ static void __init calculate_hvm_max_pol
         __clear_bit(X86_FEATURE_PKS, fs);
 
         /*
-         * Don't expose USER-MSR until it is known how (if at all) it is
-         * virtualized on SVM.
+         * Don't expose USER-MSR and MSR-IMM until it is known how (if at all)
+         * they are virtualized on SVM.
          */
         __clear_bit(X86_FEATURE_USER_MSR, fs);
+        __clear_bit(X86_FEATURE_MSR_IMM, fs);
     }
 
     if ( !cpu_has_vmx_msrlist )
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -453,7 +453,7 @@ void domain_cpu_policy_changed(struct do
     }
 
     /* Nested doesn't have the necessary processing, yet. */
-    if ( nestedhvm_enabled(d) && p->feat.user_msr )
+    if ( nestedhvm_enabled(d) && (p->feat.user_msr || p->feat.msr_imm) )
         return /* -EINVAL */;
 
     for_each_vcpu ( d, v )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4762,6 +4762,7 @@ void asmlinkage vmx_vmexit_handler(struc
         break;
 
     case EXIT_REASON_URDMSR:
+    case EXIT_REASON_RDMSR_IMM:
     {
         uint64_t msr_content = 0;
 
@@ -4770,7 +4771,7 @@ void asmlinkage vmx_vmexit_handler(struc
         {
         case X86EMUL_OKAY:
             *decode_gpr(regs, msr_imm_gpr()) = msr_content;
-            update_guest_eip(); /* Safe: URDMSR */
+            update_guest_eip(); /* Safe: URDMSR / RDMSR <imm> */
             break;
 
         case X86EMUL_EXCEPTION:
@@ -4781,13 +4782,14 @@ void asmlinkage vmx_vmexit_handler(struc
     }
 
     case EXIT_REASON_UWRMSR:
+    case EXIT_REASON_WRMSRNS_IMM:
         __vmread(EXIT_QUALIFICATION, &exit_qualification);
         switch ( hvm_msr_write_intercept(exit_qualification,
                                          *decode_gpr(regs, msr_imm_gpr()),
                                          true) )
         {
         case X86EMUL_OKAY:
-            update_guest_eip(); /* Safe: UWRMSR */
+            update_guest_eip(); /* Safe: UWRMSR / WRMSRNS <imm> */
             break;
 
         case X86EMUL_EXCEPTION:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -205,6 +205,8 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_WRMSRLIST           79
 #define EXIT_REASON_URDMSR              80
 #define EXIT_REASON_UWRMSR              81
+#define EXIT_REASON_RDMSR_IMM           84
+#define EXIT_REASON_WRMSRNS_IMM         85
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 82
+#define VMX_PERF_EXIT_REASON_SIZE 86
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v9 08/10] x86emul: support non-SIMD MOVRS
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (6 preceding siblings ...)
  2025-11-24 15:00 ` [PATCH v9 07/10] VMX: support MSR-IMM Jan Beulich
@ 2025-11-24 15:01 ` Jan Beulich
  2025-11-24 15:01 ` [PATCH v9 09/10] x86: use / "support" UDB Jan Beulich
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:01 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

As we ignore cachability aspects of insns, they're treated like simple
MOVs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
SDE: -dmr
---
v7: New.

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -843,6 +843,9 @@ static const struct {
     { { 0x80 }, { 2, 2 }, T, R, pfx_66 }, /* invept */
     { { 0x81 }, { 2, 2 }, T, R, pfx_66 }, /* invvpid */
     { { 0x82 }, { 2, 2 }, T, R, pfx_66 }, /* invpcid */
+    { { 0x8a }, { 2, 2 }, T, R, pfx_no }, /* movrsb */
+    { { 0x8b }, { 2, 2 }, T, R, pfx_no }, /* movrs{d,q} */
+    { { 0x8b }, { 2, 2 }, T, R, pfx_66 }, /* movrsw */
     { { 0xc8 }, { 2, 2 }, T, R, pfx_no }, /* sha1nexte */
     { { 0xc9 }, { 2, 2 }, T, R, pfx_no }, /* sha1msg1 */
     { { 0xca }, { 2, 2 }, T, R, pfx_no }, /* sha1msg2 */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1869,6 +1869,29 @@ int main(int argc, char **argv)
     }
     else
         printf("skipped\n");
+
+    {
+        /* For the non-SIMD forms the emulator doesn't itself use MOVRS. */
+        bool movrs = cpu_policy.feat.movrs;
+
+        cpu_policy.feat.movrs = true;
+
+        printf("%-40s", "Testing movrs 6(%rdi),%si...");
+        instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0x38;
+        instr[3] = 0x8b; instr[4] = 0x77; instr[5] = 0x06;
+        regs.rip = (unsigned long)&instr[0];
+        regs.rsi = 0x8888777766665555UL;
+        regs.rdi = (unsigned long)res;
+        res[1]   = 0x88777788U;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.rip != (unsigned long)&instr[6]) ||
+             (regs.rsi != 0x8888777766668877UL) )
+            goto fail;
+        printf("okay\n");
+
+        cpu_policy.feat.movrs = movrs;
+    }
 #endif /* x86-64 */
 
     printf("%-40s", "Testing shld $1,%ecx,(%edx)...");
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -901,7 +901,8 @@ decode_0f38(struct x86_emulate_state *s,
 {
     switch ( ctxt->opcode & X86EMUL_OPC_MASK )
     {
-    case 0x00 ... 0xef:
+    case 0x00 ... 0x89:
+    case 0x8c ... 0xef:
     case 0xf2 ... 0xf5:
     case 0xf7:
     case 0xfa ... 0xff:
@@ -912,6 +913,13 @@ decode_0f38(struct x86_emulate_state *s,
         ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
+    case 0x8a ... 0x8b: /* movrs */
+        s->desc = DstReg | SrcMem | Mov;
+        if ( !(ctxt->opcode & 1) )
+            s->desc |= ByteOp;
+        s->simd_size = simd_none;
+        break;
+
     case X86EMUL_OPC_VEX_66(0, 0x2d): /* vmaskmovpd */
         s->simd_size = simd_packed_fp;
         break;
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -614,6 +614,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
 #define vcpu_has_msrlist()     (ctxt->cpuid->feat.msrlist)
+#define vcpu_has_movrs()       (ctxt->cpuid->feat.movrs)
 #define vcpu_has_msr_imm()     (ctxt->cpuid->feat.msr_imm)
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -6323,6 +6323,16 @@ x86_emulate(
         fault_suppression = false;
         goto avx512f_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+
+    case X86EMUL_OPC(0x0f38, 0x8a)
+     ... X86EMUL_OPC(0x0f38, 0x8b): /* movrs */
+        vcpu_must_have(movrs);
+        dst.val = src.val;
+        break;
+
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, X86_EXC_UD);
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -319,6 +319,7 @@ XEN_CPUFEATURE(AVX_IFMA,     10*32+23) /
 XEN_CPUFEATURE(LAM,          10*32+26) /*   Linear Address Masking */
 XEN_CPUFEATURE(MSRLIST,      10*32+27) /*   {RD,WR}MSRLIST instructions */
 XEN_CPUFEATURE(NO_INVD,      10*32+30) /*   INVD instruction unusable */
+XEN_CPUFEATURE(MOVRS,        10*32+31) /*a  MOV-read-shared instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */
 XEN_CPUFEATURE(NO_NEST_BP,         11*32+ 0) /*A  No Nested Data Breakpoints */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -283,7 +283,7 @@ def crunch_numbers(state):
         # NO_LMSL indicates the absense of Long Mode Segment Limits, which
         # have been dropped in hardware.
         LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
-             LKGS, MSRLIST, USER_MSR, MSR_IMM],
+             LKGS, MSRLIST, USER_MSR, MSR_IMM, MOVRS],
 
         # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
         # standard 3DNow in the earlier K6 processors.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v9 09/10] x86: use / "support" UDB
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (7 preceding siblings ...)
  2025-11-24 15:01 ` [PATCH v9 08/10] x86emul: support non-SIMD MOVRS Jan Beulich
@ 2025-11-24 15:01 ` Jan Beulich
  2025-12-05 12:01   ` Andrew Cooper
  2025-11-24 15:02 ` [PATCH v9 10/10] x86emul: support AVX512-BMM Jan Beulich
  2025-11-24 15:03 ` [PATCH v9 00/10] x86emul: misc additions Jan Beulich
  10 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:01 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

With opcode D6h now firmly reserved as another #UD-raising one in 64-bit
mode, use that instead of the two-byte UD2 for bug frame marking.

While there also make the respective adjustment to the emulator.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Should we also switch {svm,vmx}_init_hypercall_page()?

Furthermore x86_64/kexec_reloc.S also has two uses. Question is whether
"tailcall" is being open-coded there, or whether that's deliberately not
using the macro we have.

One of the table entries in stub_selftest() uses UD1, yet not in quite
an appropriate way: The 0x90 following it (presumably meant to be a NOP)
really is a ModR/M byte, requiring a displacement to follow. Wouldn't we
better adjust that (e.g. using 0xcc instead)?
---
v9: New.

--- a/xen/arch/x86/include/asm/bug.h
+++ b/xen/arch/x86/include/asm/bug.h
@@ -21,7 +21,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define BUG_INSTR       "ud2"
+#define BUG_INSTR       ".byte 0xd6" /* UDB */
 #define BUG_ASM_CONST   "c"
 
 #else  /* !__ASSEMBLY__ */
@@ -37,7 +37,7 @@
         .error "Invalid BUGFRAME index"
     .endif
 
-    .L\@ud: ud2a
+    .L\@ud: .byte 0xd6 /* UDB */
 
     .pushsection .rodata.str1, "aMS", @progbits, 1
          .L\@s1: .asciz "\file_str"
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1256,7 +1256,7 @@ void asmlinkage do_trap(struct cpu_user_
 
 void asmlinkage do_invalid_op(struct cpu_user_regs *regs)
 {
-    u8 bug_insn[2];
+    uint8_t bug_insn;
     const void *eip = (const void *)regs->rip;
     int id;
 
@@ -1268,8 +1268,8 @@ void asmlinkage do_invalid_op(struct cpu
     }
 
     if ( !is_active_kernel_text(regs->rip) ||
-         copy_from_unsafe(bug_insn, eip, sizeof(bug_insn)) ||
-         memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
+         copy_from_unsafe(&bug_insn, eip, sizeof(bug_insn)) ||
+         bug_insn != 0xd6 /* UDB */ )
         goto die;
 
     id = do_bug_frame(regs, regs->rip);
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -651,7 +651,7 @@ decode_onebyte(struct x86_emulate_state
     case 0xce: /* into */
     case 0xd4: /* aam */
     case 0xd5: /* aad */
-    case 0xd6: /* salc */
+        /* 0xd6 (salc) omitted here, for #UD to be raised in 64-bit mode. */
         s->not_64bit = true;
         break;
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2470,9 +2470,17 @@ x86_emulate(
         _regs.eflags |= even_parity(_regs.al) ? X86_EFLAGS_PF : 0;
         break;
 
-    case 0xd6: /* salc */
-        _regs.al = (_regs.eflags & X86_EFLAGS_CF) ? 0xff : 0x00;
-        break;
+    case 0xd6: /* salc / udb */
+        if ( !mode_64bit() )
+        {
+            _regs.al = (_regs.eflags & X86_EFLAGS_CF) ? 0xff : 0x00;
+            break;
+        }
+        fallthrough;
+    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
+    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
+    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
+        generate_exception(X86_EXC_UD);
 
     case 0xd7: /* xlat */ {
         unsigned long al;
@@ -3204,11 +3212,6 @@ x86_emulate(
             goto done;
         break;
 
-    case X86EMUL_OPC(0x0f, 0x0b): /* ud2 */
-    case X86EMUL_OPC(0x0f, 0xb9): /* ud1 */
-    case X86EMUL_OPC(0x0f, 0xff): /* ud0 */
-        generate_exception(X86_EXC_UD);
-
     case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */
     case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v9 10/10] x86emul: support AVX512-BMM
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (8 preceding siblings ...)
  2025-11-24 15:01 ` [PATCH v9 09/10] x86: use / "support" UDB Jan Beulich
@ 2025-11-24 15:02 ` Jan Beulich
  2025-12-05 12:33   ` Andrew Cooper
  2025-11-24 15:03 ` [PATCH v9 00/10] x86emul: misc additions Jan Beulich
  10 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:02 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

EVEX.W meaning is unusual for VBMAC{,X}OR16x16x16, but that needs taking
care of only in the test harness.

Like already proposed in "x86emul: support AVX10.1", use just
vcpu_must_have(), not host_and_vcpu_must_have().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
The Disp8Shift settings are guesswork; the binutils submission bogusly(?)
suggests no scaling at all.

No idea how to test this without having access to capable hardware. AMD,
to my knowledge, offers no equivalent to Intel's SDE.
---
v9: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -689,6 +689,15 @@ static const struct test avx512_fp16_128
     INSN(movw, 66, map5, 7e, el, fp16, el),
 };
 
+static const struct test avx512_bmm_all[] = {
+    INSN(bitrev,   , map6, 81, vl, b, vl),
+};
+
+static const struct test avx512_bmm_no128[] = {
+    INSN(bmacor16x16x16,    , map6, 81, vl, w, vl),
+    INSN(bmacxor16x16x16,   , map6, 81, vl, w, vl),
+};
+
 static const struct test gfni_all[] = {
     INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
     INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
@@ -817,6 +826,12 @@ static void test_one(const struct test *
 
     case ESZ_w:
         evex.w = 1;
+        /*
+         * VBMAC{,X}OR16x16x16 don't follow the general pattern: EVEX.W controls
+         * reduction kind there, not element size.
+         */
+        if ( test->spc == SPC_map6 && !test->pfx && test->opc == 0x80 )
+            evex.w = test->mnemonic[4] == 'x';
         /* fall through */
     case ESZ_fp16:
         esz = 2;
@@ -1087,6 +1102,8 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512_vpopcntdq, all);
     RUN(avx512_fp16, all);
     RUN(avx512_fp16, 128);
+    RUN(avx512_bmm, all);
+    RUN(avx512_bmm, no128);
 
     if ( cpu_has_avx512f )
     {
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -2157,6 +2157,9 @@ static const struct evex {
     { { 0x56 }, 2, T, R, pfx_f2, W0, Ln }, /* vfcmaddcph */
     { { 0x57 }, 2, T, R, pfx_f3, W0, LIG }, /* vfmaddcsh */
     { { 0x57 }, 2, T, R, pfx_f2, W0, LIG }, /* vfcmaddcsh */
+    { { 0x80 }, 2, T, R, pfx_no, W0, L1 | L2 }, /* vbmacor16x16x16 */
+    { { 0x80 }, 2, T, R, pfx_no, W1, L1 | L2 }, /* vbmacxor16x16x16 */
+    { { 0x81 }, 2, T, R, pfx_no, W0, Ln }, /* vbitrev */
     { { 0x96 }, 2, T, R, pfx_66, W0, Ln }, /* vfmaddsub132ph */
     { { 0x97 }, 2, T, R, pfx_66, W0, Ln }, /* vfmsubadd132ph */
     { { 0x98 }, 2, T, R, pfx_66, W0, Ln }, /* vfmadd132ph */
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -223,6 +223,8 @@ void wrpkru(unsigned int val);
 #define cpu_has_xop                 (cpu_policy.extd.xop  && xcr0_mask(6))
 #define cpu_has_fma4                (cpu_policy.extd.fma4 && xcr0_mask(6))
 #define cpu_has_tbm                  cpu_policy.extd.tbm
+#define cpu_has_avx512_bmm          (cpu_policy.extd.avx512_bmm && \
+                                     xcr0_mask(0xe6))
 
 int emul_test_cpuid(
     uint32_t leaf,
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -334,6 +334,7 @@ XEN_CPUFEATURE(CPUID_USER_DIS,     11*32
 XEN_CPUFEATURE(EPSF,               11*32+18) /*A  Enhanced Predictive Store Forwarding */
 XEN_CPUFEATURE(FSRSC,              11*32+19) /*A  Fast Short REP SCASB */
 XEN_CPUFEATURE(AMD_PREFETCHI,      11*32+20) /*A  PREFETCHIT{0,1} Instructions */
+XEN_CPUFEATURE(AVX512_BMM,         11*32+23) /*a  AVX512 Bitmap Manipulation Instructions */
 XEN_CPUFEATURE(SBPB,               11*32+27) /*A  Selective Branch Predictor Barrier */
 XEN_CPUFEATURE(IBPB_BRTYPE,        11*32+28) /*A  IBPB flushes Branch Type predictions too */
 XEN_CPUFEATURE(SRSO_NO,            11*32+29) /*A  Hardware not vulnerable to Speculative Return Stack Overflow */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -308,7 +308,7 @@ def crunch_numbers(state):
         # dependents of AVX512BW (as to requiring wider than 16-bit mask
         # registers), despite the SDM not formally making this connection.
         AVX512BW: [AVX512_VBMI, AVX512_VBMI2, AVX512_BITALG, AVX512_BF16,
-                   AVX512_FP16],
+                   AVX512_FP16, AVX512_BMM],
 
         # Extensions with VEX/EVEX encodings keyed to a separate feature
         # flag are made dependents of their respective legacy feature.
--- a/xen/arch/x86/x86_emulate/decode.c
+++ b/xen/arch/x86/x86_emulate/decode.c
@@ -387,6 +387,7 @@ static const struct ext0f38_table {
     [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
     [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x82] = { .simd_size = simd_other },
     [0x83] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x88] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_dq },
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -568,6 +568,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_clzero()      (ctxt->cpuid->extd.clzero)
 #define vcpu_has_wbnoinvd()    (ctxt->cpuid->extd.wbnoinvd)
 #define vcpu_has_nscb()        (ctxt->cpuid->extd.nscb)
+#define vcpu_has_avx512_bmm()  (ctxt->cpuid->extd.avx512_bmm)
 
 #define vcpu_has_bmi1()        (ctxt->cpuid->feat.bmi1)
 #define vcpu_has_hle()         (ctxt->cpuid->feat.hle)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7990,6 +7990,19 @@ x86_emulate(
         goto simd_zmm;
     }
 
+    case X86EMUL_OPC_EVEX(6, 0x80): /* vbmac{,x}or16x16x16 [xyz]mm/mem,[xyz]mm,[xyz]mm */
+        vcpu_must_have(avx512_bmm);
+        generate_exception_if(!evex.lr || evex.brs || evex.opmsk, X86_EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX(6, 0x81): /* vbitrev [xyz]mm/mem,[xyz]mm */
+        vcpu_must_have(avx512_bmm);
+        generate_exception_if(evex.w || evex.brs || evex.reg != 0xf || !evex.RX,
+                              X86_EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 00/10] x86emul: misc additions
  2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
                   ` (9 preceding siblings ...)
  2025-11-24 15:02 ` [PATCH v9 10/10] x86emul: support AVX512-BMM Jan Beulich
@ 2025-11-24 15:03 ` Jan Beulich
  10 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2025-11-24 15:03 UTC (permalink / raw)
  To: xen-devel@lists.xenproject.org; +Cc: Andrew Cooper, Roger Pau Monné

I screwed up in two ways here: As the cover letter, this is 00/10, and ...

On 24.11.2025 15:56, Jan Beulich wrote:
> 01: x86emul: support LKGS
> 02: x86emul+VMX: support {RD,WR}MSRLIST
> 03: x86emul: support USER_MSR instructions
> 04: x86/cpu-policy: re-arrange no-VMX logic
> 05: VMX: support USER_MSR
> 06: VMX: support MSR-IMM
> 07: x86emul: support MSR_IMM instructions

... the order of 06 and 07 are the wrong way round here.

Jan

> 08: x86emul: support non-SIMD MOVRS
> 09: x86: use / "support" UDB
> 10: x86emul: support AVX512-BMM
> 
> Jan



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 07/10] VMX: support MSR-IMM
  2025-11-24 15:00 ` [PATCH v9 07/10] VMX: support MSR-IMM Jan Beulich
@ 2025-11-26 18:50   ` Andrew Cooper
  2025-11-27  8:18     ` Jan Beulich
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2025-11-26 18:50 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org; +Cc: Roger Pau Monné

On 24/11/2025 3:00 pm, Jan Beulich wrote:
> Hook up the new VM exit codes and handle guest uses of the insns.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v9: New.
> ---
> The lack of an enable bit is concerning; at least for the nested case
> that's a security issue afaict (when L0 isn't aware of the insns, or more
> specifically the exit codes).

This is why we need support statements of new CPUs.

Intel say that unknown VMExits turning into #UD is the expected course
of action.  That covers all of these cases which don't have an explicit
enable.

This is better than our current behaviour, which is non-architectural
for supervisor code and practically the most unhelpful course of action
going.

Obviously, logic turning on a new feature is expected to handle all the
VMExit cases it can produce.

The corollary for nested virt is that L0 must never make a Virtual
VMExit with case that isn't enabled.  Combined with #UD in the unknown
case, that covers things reasonably well.

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -453,7 +453,7 @@ void domain_cpu_policy_changed(struct do
>      }
>  
>      /* Nested doesn't have the necessary processing, yet. */
> -    if ( nestedhvm_enabled(d) && p->feat.user_msr )
> +    if ( nestedhvm_enabled(d) && (p->feat.user_msr || p->feat.msr_imm) )
>          return /* -EINVAL */;

What processing is missing?  (Aside from correcting the unknown case.)

>  
>      for_each_vcpu ( d, v )
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -4762,6 +4762,7 @@ void asmlinkage vmx_vmexit_handler(struc
>          break;
>  
>      case EXIT_REASON_URDMSR:
> +    case EXIT_REASON_RDMSR_IMM:

Instructions which aren't enumerated in CPUID have reserved behaviour.

The exit handler needs to check cp->feat.msr_imm and inject #UD.

It's not perfect; un-intercepted MSRs will happen to execute correctly
on capable hardware, but most MSRs are not intercepted and it's far
closer to adequate behaviour than omitting the #UD check.

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 07/10] VMX: support MSR-IMM
  2025-11-26 18:50   ` Andrew Cooper
@ 2025-11-27  8:18     ` Jan Beulich
  0 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2025-11-27  8:18 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Roger Pau Monné, xen-devel@lists.xenproject.org

On 26.11.2025 19:50, Andrew Cooper wrote:
> On 24/11/2025 3:00 pm, Jan Beulich wrote:
>> Hook up the new VM exit codes and handle guest uses of the insns.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v9: New.
>> ---
>> The lack of an enable bit is concerning; at least for the nested case
>> that's a security issue afaict (when L0 isn't aware of the insns, or more
>> specifically the exit codes).
> 
> This is why we need support statements of new CPUs.

I hope you don't mean the lack thereof to be a blocking factor for this
change?

> Intel say that unknown VMExits turning into #UD is the expected course
> of action.  That covers all of these cases which don't have an explicit
> enable.

Do you have a pointer?

> This is better than our current behaviour, which is non-architectural
> for supervisor code and practically the most unhelpful course of action
> going.
> 
> Obviously, logic turning on a new feature is expected to handle all the
> VMExit cases it can produce.

What you say here ...

> The corollary for nested virt is that L0 must never make a Virtual
> VMExit with case that isn't enabled.  Combined with #UD in the unknown
> case, that covers things reasonably well.
> 
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -453,7 +453,7 @@ void domain_cpu_policy_changed(struct do
>>      }
>>  
>>      /* Nested doesn't have the necessary processing, yet. */
>> -    if ( nestedhvm_enabled(d) && p->feat.user_msr )
>> +    if ( nestedhvm_enabled(d) && (p->feat.user_msr || p->feat.msr_imm) )
>>          return /* -EINVAL */;
> 
> What processing is missing?  (Aside from correcting the unknown case.)

... is what would need doing for this check to disappear. Going the #UD
route looks to make sense, but it still wouldn't feel quite right then
to drop this check. If we expose the feature, we shouldn't convert
respective exits to #UD. They aren't "unknown" (anymore) after all.

And then again the question is: Are you expecting me to deal with
switching to the #UD model as a prereq (if, as per above, that's
relevant at all here)? If so, I have to admit that it's not quite clear
to me what exactly this would be meant to look like: Alter the default
cases of the big switch()es in both the normal and nested exit handlers?
Or merely the latter?

>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -4762,6 +4762,7 @@ void asmlinkage vmx_vmexit_handler(struc
>>          break;
>>  
>>      case EXIT_REASON_URDMSR:
>> +    case EXIT_REASON_RDMSR_IMM:
> 
> Instructions which aren't enumerated in CPUID have reserved behaviour.
> 
> The exit handler needs to check cp->feat.msr_imm and inject #UD.
> 
> It's not perfect; un-intercepted MSRs will happen to execute correctly
> on capable hardware, but most MSRs are not intercepted and it's far
> closer to adequate behaviour than omitting the #UD check.

Hmm, okay, I can certainly do it this way.

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-11-24 15:01 ` [PATCH v9 09/10] x86: use / "support" UDB Jan Beulich
@ 2025-12-05 12:01   ` Andrew Cooper
  2025-12-05 12:40     ` Andrew Cooper
  2025-12-05 13:09     ` Jan Beulich
  0 siblings, 2 replies; 26+ messages in thread
From: Andrew Cooper @ 2025-12-05 12:01 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: andrew.cooper3, Roger Pau Monné

On 24/11/2025 3:01 pm, Jan Beulich wrote:
> With opcode D6h now firmly reserved as another #UD-raising one in 64-bit
> mode, use that instead of the two-byte UD2 for bug frame marking.
>
> While there also make the respective adjustment to the emulator.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Should we also switch {svm,vmx}_init_hypercall_page()?

The hypercall pages are 32/64-agnostic right now, and used by 32bit
guests.  UDB isn't safe to use in those cases.

> Furthermore x86_64/kexec_reloc.S also has two uses. Question is whether
> "tailcall" is being open-coded there, or whether that's deliberately not
> using the macro we have.

The code in kexec_reloc.S long predates the tailcall macro.  Also I now
regret calling it tailcall; terminalcall might be more accurate but it's
overly long.

Very counter-intuitively, Linux has plain KEXEC which is phrase as
"call", and KEXEC_JUMP which the ability for the invoked kernel to return.

For the regular cases, Linux invokes the new kernel by spilling a
register to the stack, and RET-ing to it.  For KEXEC_JUMP, it does call
the target kernel, after some games to set up a stack in the target image.

Either way, we can convert these two to plain JMPs.  One of the ud2's in
particular cannot be converted to UDB because of being in 32bit code.

I can do a patch if you'd like?

>
> One of the table entries in stub_selftest() uses UD1, yet not in quite
> an appropriate way: The 0x90 following it (presumably meant to be a NOP)
> really is a ModR/M byte, requiring a displacement to follow. Wouldn't we
> better adjust that (e.g. using 0xcc instead)?

Oh lovely...  That was my mistake in 2eb1132f7963, where I was
converting what looked to be a double ret.

Can't we use 0x00 so it doesn't look like an int3 either, and update the
comment to make it explicit that there's a ModRM byte?

> ---
> v9: New.
>
> --- a/xen/arch/x86/include/asm/bug.h
> +++ b/xen/arch/x86/include/asm/bug.h
> @@ -21,7 +21,7 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -#define BUG_INSTR       "ud2"
> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>  #define BUG_ASM_CONST   "c"
>  
>  #else  /* !__ASSEMBLY__ */
> @@ -37,7 +37,7 @@
>          .error "Invalid BUGFRAME index"
>      .endif
>  
> -    .L\@ud: ud2a
> +    .L\@ud: .byte 0xd6 /* UDB */

add/remove: 0/0 grow/shrink: 1/520 up/down: 284/-1024 (-740)
Function                                     old     new   delta
x86_emulate                               144961  145245    +284
zap_low_mappings                              82      81      -1
xstate_set_init                              298     297      -1
...
paging_mark_dirty                            280     261     -19
csched_schedule                             3711    3691     -20
vmx_create_vmcs                             3740    3663     -77


I think this says quite a lot about how bad the vmread/write helpers
really are.

> --- a/xen/arch/x86/x86_emulate/decode.c
> +++ b/xen/arch/x86/x86_emulate/decode.c
> @@ -651,7 +651,7 @@ decode_onebyte(struct x86_emulate_state
>      case 0xce: /* into */
>      case 0xd4: /* aam */
>      case 0xd5: /* aad */
> -    case 0xd6: /* salc */
> +        /* 0xd6 (salc) omitted here, for #UD to be raised in 64-bit mode. */
>          s->not_64bit = true;
>          break;

Why does the not_64bit logic not suffice?  #UD is exactly what we do:

    generate_exception_if(state->not_64bit && mode_64bit(), X86_EXC_UD);


~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 10/10] x86emul: support AVX512-BMM
  2025-11-24 15:02 ` [PATCH v9 10/10] x86emul: support AVX512-BMM Jan Beulich
@ 2025-12-05 12:33   ` Andrew Cooper
  2025-12-05 12:47     ` Jan Beulich
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2025-12-05 12:33 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: andrew.cooper3, Roger Pau Monné

On 24/11/2025 3:02 pm, Jan Beulich wrote:
> EVEX.W meaning is unusual for VBMAC{,X}OR16x16x16, but that needs taking
> care of only in the test harness.
>
> Like already proposed in "x86emul: support AVX10.1", use just
> vcpu_must_have(), not host_and_vcpu_must_have().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> The Disp8Shift settings are guesswork; the binutils submission bogusly(?)
> suggests no scaling at all.
>
> No idea how to test this without having access to capable hardware. AMD,
> to my knowledge, offers no equivalent to Intel's SDE.

I'm not aware of anything equivalent for AMD.

IIRC, the binutils thread says Zen6 for these instructions?  I'm still
trying to get access myself.  No ETA yet.

Very tentatively Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Given that .W is wonky for these instructions, I wouldn't quite so
easily rule out other wonkyness.

Would the test harness pick that up?  Not AFAICT.

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 12:01   ` Andrew Cooper
@ 2025-12-05 12:40     ` Andrew Cooper
  2025-12-05 13:13       ` Jan Beulich
  2025-12-05 13:15       ` Jan Beulich
  2025-12-05 13:09     ` Jan Beulich
  1 sibling, 2 replies; 26+ messages in thread
From: Andrew Cooper @ 2025-12-05 12:40 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: andrew.cooper3, Roger Pau Monné

On 05/12/2025 12:01 pm, Andrew Cooper wrote:
> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>> --- a/xen/arch/x86/include/asm/bug.h
>> +++ b/xen/arch/x86/include/asm/bug.h
>> @@ -21,7 +21,7 @@
>>  
>>  #ifndef __ASSEMBLY__
>>  
>> -#define BUG_INSTR       "ud2"
>> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>>  #define BUG_ASM_CONST   "c"
>>  
>>  #else  /* !__ASSEMBLY__ */
>> @@ -37,7 +37,7 @@
>>          .error "Invalid BUGFRAME index"
>>      .endif
>>  
>> -    .L\@ud: ud2a
>> +    .L\@ud: .byte 0xd6 /* UDB */

P.S. Presumably binutils is going to learn a udb mnemonic at some
point?  Can we include a version number in the comment?

I'm trying to organise such comments everywhere so it's less effort to
figure out when we can drop it in the future.

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 10/10] x86emul: support AVX512-BMM
  2025-12-05 12:33   ` Andrew Cooper
@ 2025-12-05 12:47     ` Jan Beulich
  2026-04-07 15:11       ` Andrew Cooper
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-12-05 12:47 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Roger Pau Monné, xen-devel@lists.xenproject.org

On 05.12.2025 13:33, Andrew Cooper wrote:
> On 24/11/2025 3:02 pm, Jan Beulich wrote:
>> EVEX.W meaning is unusual for VBMAC{,X}OR16x16x16, but that needs taking
>> care of only in the test harness.
>>
>> Like already proposed in "x86emul: support AVX10.1", use just
>> vcpu_must_have(), not host_and_vcpu_must_have().
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> The Disp8Shift settings are guesswork; the binutils submission bogusly(?)
>> suggests no scaling at all.

I realize I should have dropped this remark: It was applicable only to
early versions of that change.

>> No idea how to test this without having access to capable hardware. AMD,
>> to my knowledge, offers no equivalent to Intel's SDE.
> 
> I'm not aware of anything equivalent for AMD.
> 
> IIRC, the binutils thread says Zen6 for these instructions?  I'm still
> trying to get access myself.  No ETA yet.

Yes, that's what they add for Zen6. I committed that patch just earlier
today.

> Very tentatively Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks. I'll put this into the patch as-is; it's unclear to me though
whether I could legitimately commit the patch with this. (It really
doesn't depend on earlier patches in the series, after all.)

> Given that .W is wonky for these instructions, I wouldn't quite so
> easily rule out other wonkyness.

It's AMD's way of giving .W dual purpose; really triple now.

> Would the test harness pick that up?  Not AFAICT.

Not without adding something to it, which I think makes sense only when
one can test it. (Which is where Intel's SDE helps quite a bit.)

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 12:01   ` Andrew Cooper
  2025-12-05 12:40     ` Andrew Cooper
@ 2025-12-05 13:09     ` Jan Beulich
  1 sibling, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2025-12-05 13:09 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Roger Pau Monné, xen-devel@lists.xenproject.org

On 05.12.2025 13:01, Andrew Cooper wrote:
> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>> With opcode D6h now firmly reserved as another #UD-raising one in 64-bit
>> mode, use that instead of the two-byte UD2 for bug frame marking.
>>
>> While there also make the respective adjustment to the emulator.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> Should we also switch {svm,vmx}_init_hypercall_page()?
> 
> The hypercall pages are 32/64-agnostic right now, and used by 32bit
> guests.  UDB isn't safe to use in those cases.
> 
>> Furthermore x86_64/kexec_reloc.S also has two uses. Question is whether
>> "tailcall" is being open-coded there, or whether that's deliberately not
>> using the macro we have.
> 
> The code in kexec_reloc.S long predates the tailcall macro.  Also I now
> regret calling it tailcall; terminalcall might be more accurate but it's
> overly long.
> 
> Very counter-intuitively, Linux has plain KEXEC which is phrase as
> "call", and KEXEC_JUMP which the ability for the invoked kernel to return.
> 
> For the regular cases, Linux invokes the new kernel by spilling a
> register to the stack, and RET-ing to it.  For KEXEC_JUMP, it does call
> the target kernel, after some games to set up a stack in the target image.
> 
> Either way, we can convert these two to plain JMPs.  One of the ud2's in
> particular cannot be converted to UDB because of being in 32bit code.

Oh, right.

> I can do a patch if you'd like?

I could as well. However, isn't BUG wrong (or at the very least misleading)
to use in either case? The code there isn't executed in place, but at
wherever the page is copied. Hence the address association in the resulting
bug frame is useless.

>> One of the table entries in stub_selftest() uses UD1, yet not in quite
>> an appropriate way: The 0x90 following it (presumably meant to be a NOP)
>> really is a ModR/M byte, requiring a displacement to follow. Wouldn't we
>> better adjust that (e.g. using 0xcc instead)?
> 
> Oh lovely...  That was my mistake in 2eb1132f7963, where I was
> converting what looked to be a double ret.
> 
> Can't we use 0x00 so it doesn't look like an int3 either, and update the
> comment to make it explicit that there's a ModRM byte?

As the situation isn't entirely tidy for whether UD1 takes a ModR/M byte,
at the very least I'd want to use something that's a single-byte opcode.
There aren't that many candidates in the 00-3F opcode range. The PUSH/POP
sreg insns I would want to avoid. The (no-op) segment overrides would
associate with whatever follows. Which would leave only the 4 opcodes
which are invalid in 64-bit mode.

If CC isn't desirable from your perspective, could we perhaps pick another
single-byte opcode from the C0-FF range?

>> --- a/xen/arch/x86/x86_emulate/decode.c
>> +++ b/xen/arch/x86/x86_emulate/decode.c
>> @@ -651,7 +651,7 @@ decode_onebyte(struct x86_emulate_state
>>      case 0xce: /* into */
>>      case 0xd4: /* aam */
>>      case 0xd5: /* aad */
>> -    case 0xd6: /* salc */
>> +        /* 0xd6 (salc) omitted here, for #UD to be raised in 64-bit mode. */
>>          s->not_64bit = true;
>>          break;
> 
> Why does the not_64bit logic not suffice?  #UD is exactly what we do:
> 
>     generate_exception_if(state->not_64bit && mode_64bit(), X86_EXC_UD);

Hmm, right, we could get away with just a comment adjustment here.

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 12:40     ` Andrew Cooper
@ 2025-12-05 13:13       ` Jan Beulich
  2025-12-05 13:15         ` Andrew Cooper
  2025-12-05 13:15       ` Jan Beulich
  1 sibling, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-12-05 13:13 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Roger Pau Monné, xen-devel@lists.xenproject.org

On 05.12.2025 13:40, Andrew Cooper wrote:
> On 05/12/2025 12:01 pm, Andrew Cooper wrote:
>> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>>> --- a/xen/arch/x86/include/asm/bug.h
>>> +++ b/xen/arch/x86/include/asm/bug.h
>>> @@ -21,7 +21,7 @@
>>>  
>>>  #ifndef __ASSEMBLY__
>>>  
>>> -#define BUG_INSTR       "ud2"
>>> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>>>  #define BUG_ASM_CONST   "c"
>>>  
>>>  #else  /* !__ASSEMBLY__ */
>>> @@ -37,7 +37,7 @@
>>>          .error "Invalid BUGFRAME index"
>>>      .endif
>>>  
>>> -    .L\@ud: ud2a
>>> +    .L\@ud: .byte 0xd6 /* UDB */
> 
> P.S. Presumably binutils is going to learn a udb mnemonic at some
> point?  Can we include a version number in the comment?

I has already learned it, so it'll be available from 2.46 onwards. I've
added a comment, but aiui we'd then need to also cover Clang's integrated
assembler (if and when that gains support). In the meantime I've made both
comments say "UDB, requiring gas 2.46".

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 13:13       ` Jan Beulich
@ 2025-12-05 13:15         ` Andrew Cooper
  0 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2025-12-05 13:15 UTC (permalink / raw)
  To: Jan Beulich
  Cc: andrew.cooper3, Roger Pau Monné,
	xen-devel@lists.xenproject.org

On 05/12/2025 1:13 pm, Jan Beulich wrote:
> On 05.12.2025 13:40, Andrew Cooper wrote:
>> On 05/12/2025 12:01 pm, Andrew Cooper wrote:
>>> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/include/asm/bug.h
>>>> +++ b/xen/arch/x86/include/asm/bug.h
>>>> @@ -21,7 +21,7 @@
>>>>  
>>>>  #ifndef __ASSEMBLY__
>>>>  
>>>> -#define BUG_INSTR       "ud2"
>>>> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>>>>  #define BUG_ASM_CONST   "c"
>>>>  
>>>>  #else  /* !__ASSEMBLY__ */
>>>> @@ -37,7 +37,7 @@
>>>>          .error "Invalid BUGFRAME index"
>>>>      .endif
>>>>  
>>>> -    .L\@ud: ud2a
>>>> +    .L\@ud: .byte 0xd6 /* UDB */
>> P.S. Presumably binutils is going to learn a udb mnemonic at some
>> point?  Can we include a version number in the comment?
> I has already learned it, so it'll be available from 2.46 onwards. I've
> added a comment, but aiui we'd then need to also cover Clang's integrated
> assembler (if and when that gains support). In the meantime I've made both
> comments say "UDB, requiring gas 2.46".

Thanks.  Clang Trunk doesn't currently know it.

https://godbolt.org/z/ccrhzq95h

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 12:40     ` Andrew Cooper
  2025-12-05 13:13       ` Jan Beulich
@ 2025-12-05 13:15       ` Jan Beulich
  2025-12-05 13:35         ` Andrew Cooper
  1 sibling, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2025-12-05 13:15 UTC (permalink / raw)
  To: Andrew Cooper, xen-devel@lists.xenproject.org; +Cc: Roger Pau Monné

On 05.12.2025 13:40, Andrew Cooper wrote:
> On 05/12/2025 12:01 pm, Andrew Cooper wrote:
>> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>>> --- a/xen/arch/x86/include/asm/bug.h
>>> +++ b/xen/arch/x86/include/asm/bug.h
>>> @@ -21,7 +21,7 @@
>>>  
>>>  #ifndef __ASSEMBLY__
>>>  
>>> -#define BUG_INSTR       "ud2"
>>> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>>>  #define BUG_ASM_CONST   "c"
>>>  
>>>  #else  /* !__ASSEMBLY__ */
>>> @@ -37,7 +37,7 @@
>>>          .error "Invalid BUGFRAME index"
>>>      .endif
>>>  
>>> -    .L\@ud: ud2a
>>> +    .L\@ud: .byte 0xd6 /* UDB */
> 
> P.S. Presumably binutils is going to learn a udb mnemonic at some
> point?  Can we include a version number in the comment?
> 
> I'm trying to organise such comments everywhere so it's less effort to
> figure out when we can drop it in the future.

For them to be useful, wouldn't we need to settle on some canonical form
first? Else how would one locate them (other than by coming across them
by chance)?

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 09/10] x86: use / "support" UDB
  2025-12-05 13:15       ` Jan Beulich
@ 2025-12-05 13:35         ` Andrew Cooper
  0 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2025-12-05 13:35 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: andrew.cooper3, Roger Pau Monné

On 05/12/2025 1:15 pm, Jan Beulich wrote:
> On 05.12.2025 13:40, Andrew Cooper wrote:
>> On 05/12/2025 12:01 pm, Andrew Cooper wrote:
>>> On 24/11/2025 3:01 pm, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/include/asm/bug.h
>>>> +++ b/xen/arch/x86/include/asm/bug.h
>>>> @@ -21,7 +21,7 @@
>>>>  
>>>>  #ifndef __ASSEMBLY__
>>>>  
>>>> -#define BUG_INSTR       "ud2"
>>>> +#define BUG_INSTR       ".byte 0xd6" /* UDB */
>>>>  #define BUG_ASM_CONST   "c"
>>>>  
>>>>  #else  /* !__ASSEMBLY__ */
>>>> @@ -37,7 +37,7 @@
>>>>          .error "Invalid BUGFRAME index"
>>>>      .endif
>>>>  
>>>> -    .L\@ud: ud2a
>>>> +    .L\@ud: .byte 0xd6 /* UDB */
>> P.S. Presumably binutils is going to learn a udb mnemonic at some
>> point?  Can we include a version number in the comment?
>>
>> I'm trying to organise such comments everywhere so it's less effort to
>> figure out when we can drop it in the future.
> For them to be useful, wouldn't we need to settle on some canonical form
> first? Else how would one locate them (other than by coming across them
> by chance)?

I don't think a canonical form to use.  Some of the (in progress)
examples I've got are:

diff --git a/xen/arch/x86/arch.mk b/xen/arch/x86/arch.mk
index 16368a498bb7..870823f7991d 100644
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -14,10 +14,14 @@ CFLAGS-$(CONFIG_CC_IS_GCC) += -malign-data=abi
 $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS))
 $(call cc-option-add,CFLAGS,CC,-Wnested-externs)
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$(comma)1",-DHAVE_AS_QUOTED_SYM)
+
+# Binutils >= (<=2.34), Clang >= 7
 $(call as-option-add,CFLAGS,CC,"movdiri %rax$(comma)(%rax)",-DHAVE_AS_MOVDIR)
+
+# Binutils >= (<=2.34), Clang >= 9
 $(call as-option-add,CFLAGS,CC,"enqcmd (%rax)$(comma)%rax",-DHAVE_AS_ENQCMD)
 
-# Check to see whether the assembler supports the .nop directive.
+# Binutils >= (<=2.34)
 $(call as-option-add,CFLAGS,CC,\
     ".L1: .L2: .nops (.L2 - .L1)$(comma)9",-DHAVE_AS_NOPS_DIRECTIVE)
 
diff --git a/xen/arch/x86/include/asm/asm-defns.h b/xen/arch/x86/include/asm/asm-defns.h
index 239dc3af096c..b1d5539bdc7c 100644
--- a/xen/arch/x86/include/asm/asm-defns.h
+++ b/xen/arch/x86/include/asm/asm-defns.h
@@ -27,6 +27,8 @@
  *
  * With no compiler support, this degrades into a plain indirect call/jmp.
  * With compiler support, dispatch to the correct __x86_indirect_thunk_*
+ *
+ * TODO: Use %V constraint modifier (GCC >= 8).
  */
     .if CONFIG_INDIRECT_THUNK == 1
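
As an aside -- if/when an assembler grows the udb mnemonic, a probe could
follow the same pattern.  Sketch only; HAVE_AS_UDB and the exact version
are hypothetical:

# Binutils >= 2.46 (?), Clang: not yet
$(call as-option-add,CFLAGS,CC,"udb",-DHAVE_AS_UDB)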

 
The examples using .byte are easy to find.  It's rather less easy doing
the archaeology on binutils, as godbolt does not have every release
(unlike GCC and Clang), and the release notes are not as great as they
could be for some instruction groups.

Our Kconfig annotations are better (insofar as most are annotated),
but even there, there's quite a mix.

~Andrew


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 10/10] x86emul: support AVX512-BMM
  2025-12-05 12:47     ` Jan Beulich
@ 2026-04-07 15:11       ` Andrew Cooper
  0 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2026-04-07 15:11 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, Roger Pau Monné,
	xen-devel@lists.xenproject.org

On 05/12/2025 12:47 pm, Jan Beulich wrote:
> On 05.12.2025 13:33, Andrew Cooper wrote:
>> Very tentatively Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Thanks. I'll put this into the patch as-is; it's unclear to me though
> whether I could legitimately commit the patch with this. (It really
> doesn't depend on earlier patches in the series, after all.)

For an off-by-default change like this, it's fine.  There needs to be
another patch swapping 'a' for 'A' in cpufeatureset.h when we get access
to real hardware, where any other changes can be accommodated.
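
I.e. just the annotation letter on the (illustrative) entry:

    XEN_CPUFEATURE(AVX512_BMM, ...) /*a  ... */  ->  /*A  ... */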

In the meantime AMD have published a spec
(https://docs.amd.com/v/u/en-US/69192-PUB), so feel free to upgrade to a
regular A-by.

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 04/10] x86/cpu-policy: re-arrange no-VMX logic
  2025-11-24 14:59 ` [PATCH v9 04/10] x86/cpu-policy: re-arrange no-VMX logic Jan Beulich
@ 2026-04-07 21:58   ` Andrew Cooper
  2026-04-08  6:09     ` Jan Beulich
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2026-04-07 21:58 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xenproject.org
  Cc: Andrew Cooper, Roger Pau Monné

On 24/11/2025 2:59 pm, Jan Beulich wrote:
> Move the PKS check into an "else" for the corresponding "if()", such
> that further adjustments (like for USER_MSR) can easily be put there as
> well.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v5: Re-base.
> v4: New.
>
> --- a/xen/arch/x86/cpu-policy.c
> +++ b/xen/arch/x86/cpu-policy.c
> @@ -812,19 +812,20 @@ static void __init calculate_hvm_max_pol
>          if ( !cpu_has_vmx_xsaves )
>              __clear_bit(X86_FEATURE_XSAVES, fs);
>      }
> +    else
> +    {
> +        /*
> +         * Xen doesn't use PKS, so the guest support for it has opted to not use
> +         * the VMCS load/save controls for efficiency reasons.  This depends on
> +         * the exact vmentry/exit behaviour, so don't expose PKS in other
> +         * situations until someone has cross-checked the behaviour for safety.
> +         */
> +        __clear_bit(X86_FEATURE_PKS, fs);
> +    }
>  
>      if ( !cpu_has_vmx_msrlist )
>          __clear_bit(X86_FEATURE_MSRLIST, fs);
>  
> -    /*
> -     * Xen doesn't use PKS, so the guest support for it has opted to not use
> -     * the VMCS load/save controls for efficiency reasons.  This depends on
> -     * the exact vmentry/exit behaviour, so don't expose PKS in other
> -     * situations until someone has cross-checked the behaviour for safety.
> -     */
> -    if ( !cpu_has_vmx )
> -        __clear_bit(X86_FEATURE_PKS, fs);
> -
>      /* 
>       * Make adjustments to possible (nested) virtualization features exposed
>       * to the guest
>

These clauses aren't logically doing the same thing.  So while the
compiler can merge them, I don't think it's a good idea to do so at a
source level.

The "if ( vmx ) " block we can just see the end of is for features which
need to cross-check extra VMX capabilities.  Each of RDTSCP, INVPCID and
XSAVES are #UD unless explicitly enabled.  MPX is the odd-one out,
checking the load/save capabilities.
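
I.e., as a sketch (only the XSAVES check is visible in the hunk above; the
RDTSCP/INVPCID lines are reconstructed, so take them as illustrative):

    if ( cpu_has_vmx )
    {
        if ( !cpu_has_vmx_rdtscp )
            __clear_bit(X86_FEATURE_RDTSCP, fs);
        if ( !cpu_has_vmx_invpcid )
            __clear_bit(X86_FEATURE_INVPCID, fs);
        if ( !cpu_has_vmx_xsaves )
            __clear_bit(X86_FEATURE_XSAVES, fs);
    }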

I suspect this list is incomplete.  These cross-checks shouldn't fail on
real hardware, where the VMX capabilities should match the native
features, but nested virt is rife with enumeration bugs here.

The second "if ( !vmx )" clause is different.  This is really "I wired
up PKS based on how Intel works, and if AMD ever gains it, it will
definitely need context switching changes to work".  This is in lieu of
having dedicated Intel/AMD annotations for features.

The MSRLIST addition from the prior patch is arguably misplaced, except
it's trying to cover both of the aspects.


AMD are making no obvious moves to add PKS, and I expect MSRLIST is even
lower down their priority list.

Overall, we need checks here for every guest-visible feature which:
* has VMCS/VMCB controls which are enumerated separately
* needs new context switching considerations

Maybe the "if ( !vmx )" shouldn't really be written this way.  I'm open
to suggestions, but making it an else block isn't a solution.

~Andrew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v9 04/10] x86/cpu-policy: re-arrange no-VMX logic
  2026-04-07 21:58   ` Andrew Cooper
@ 2026-04-08  6:09     ` Jan Beulich
  0 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2026-04-08  6:09 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Roger Pau Monné, xen-devel@lists.xenproject.org

On 07.04.2026 23:58, Andrew Cooper wrote:
> On 24/11/2025 2:59 pm, Jan Beulich wrote:
>> --- a/xen/arch/x86/cpu-policy.c
>> +++ b/xen/arch/x86/cpu-policy.c
>> @@ -812,19 +812,20 @@ static void __init calculate_hvm_max_pol
>>          if ( !cpu_has_vmx_xsaves )
>>              __clear_bit(X86_FEATURE_XSAVES, fs);
>>      }
>> +    else
>> +    {
>> +        /*
>> +         * Xen doesn't use PKS, so the guest support for it has opted to not use
>> +         * the VMCS load/save controls for efficiency reasons.  This depends on
>> +         * the exact vmentry/exit behaviour, so don't expose PKS in other
>> +         * situations until someone has cross-checked the behaviour for safety.
>> +         */
>> +        __clear_bit(X86_FEATURE_PKS, fs);
>> +    }
>>  
>>      if ( !cpu_has_vmx_msrlist )
>>          __clear_bit(X86_FEATURE_MSRLIST, fs);
>>  
>> -    /*
>> -     * Xen doesn't use PKS, so the guest support for it has opted to not use
>> -     * the VMCS load/save controls for efficiency reasons.  This depends on
>> -     * the exact vmentry/exit behaviour, so don't expose PKS in other
>> -     * situations until someone has cross-checked the behaviour for safety.
>> -     */
>> -    if ( !cpu_has_vmx )
>> -        __clear_bit(X86_FEATURE_PKS, fs);
>> -
>>      /* 
>>       * Make adjustments to possible (nested) virtualization features exposed
>>       * to the guest
>>
> 
> These clauses aren't logically doing the same thing.  So while the
> compiler can merge them, I don't think it's a good idea to do so at a
> source level.
> 
> The "if ( vmx ) " block we can just see the end of is for features which
> need to cross-check extra VMX capabilities.  Each of RDTSCP, INVPCID and
> XSAVES are #UD unless explicitly enabled.  MPX is the odd-one out,
> checking the load/save capabilities.
> 
> I suspect this list is incomplete.  These cross-checks shouldn't fail on
> real hardware, where the VMX capabilities should match the native
> features, but nested virt is rife with enumeration bugs here.
> 
> The second "if ( !vmx )" clause is different.  This is really "I wired
> up PKS based on how Intel works, and if AMD ever gains it it will
> definitely need context switching changes to work".  This is in lieu of
> having dedicated Intel/AMD annotations for features.

Right, and this is what I'm using the new "else" body for later in the
series.
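
I.e. roughly (a sketch, eliding the PKS comment; the exact form is in the
USER_MSR patch):

    else
    {
        ...
        __clear_bit(X86_FEATURE_PKS, fs);
        /* Like PKS, USER_MSR is wired up for VMX only. */
        __clear_bit(X86_FEATURE_USER_MSR, fs);
    }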

> The MSRLIST addition from the prior patch is arguably misplaced, except
> it's trying to cover both of the aspects.

Why is it misplaced? Where else would you want it to go? The VMX aspect,
as you say, is included in cpu_has_vmx_msrlist, so it doesn't need
separate checking for VMX.

> AMD are making no obvious moves to add PKS, and I expect MSRLIST is even
> lower down their priority list.
> 
> Overall, we need checks here for every guest-visible feature which:
> * has VMCS/VMCB controls which are enumerated separately
> * needs new context switching considerations

Which in this series are USER-MSR and MSR-IMM. Hence this prereq change,
to simplify the additions there.

> Maybe the "if ( !vmx )" shouldn't really be written this way.  I'm open
> to suggestions, but making it an else block isn't a solution.

Well, in this situation I guess it's for you to make suggestions. I have
made mine by way of this submission.

Jan


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2026-04-08  6:09 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
2025-11-24 14:56 [PATCH v9 10/10] x86emul: misc additions Jan Beulich
2025-11-24 14:57 ` [PATCH v9 01/10] x86emul: support LKGS Jan Beulich
2025-11-24 14:58 ` [PATCH v9 02/10] x86emul+VMX: support {RD,WR}MSRLIST Jan Beulich
2025-11-24 14:58 ` [PATCH v9 03/10] x86emul: support USER_MSR instructions Jan Beulich
2025-11-24 14:59 ` [PATCH v9 04/10] x86/cpu-policy: re-arrange no-VMX logic Jan Beulich
2026-04-07 21:58   ` Andrew Cooper
2026-04-08  6:09     ` Jan Beulich
2025-11-24 15:00 ` [PATCH v9 05/10] VMX: support USER-MSR Jan Beulich
2025-11-24 15:00 ` [PATCH v9 06/10] x86emul: support MSR_IMM instructions Jan Beulich
2025-11-24 15:00 ` [PATCH v9 07/10] VMX: support MSR-IMM Jan Beulich
2025-11-26 18:50   ` Andrew Cooper
2025-11-27  8:18     ` Jan Beulich
2025-11-24 15:01 ` [PATCH v9 08/10] x86emul: support non-SIMD MOVRS Jan Beulich
2025-11-24 15:01 ` [PATCH v9 09/10] x86: use / "support" UDB Jan Beulich
2025-12-05 12:01   ` Andrew Cooper
2025-12-05 12:40     ` Andrew Cooper
2025-12-05 13:13       ` Jan Beulich
2025-12-05 13:15         ` Andrew Cooper
2025-12-05 13:15       ` Jan Beulich
2025-12-05 13:35         ` Andrew Cooper
2025-12-05 13:09     ` Jan Beulich
2025-11-24 15:02 ` [PATCH v9 10/10] x86emul: support AVX512-BMM Jan Beulich
2025-12-05 12:33   ` Andrew Cooper
2025-12-05 12:47     ` Jan Beulich
2026-04-07 15:11       ` Andrew Cooper
2025-11-24 15:03 ` [PATCH v9 00/10] x86emul: misc additions Jan Beulich
