* [PATCH RFC] kvm: emulate avx vmovdq
From: Keith Busch @ 2024-08-20 23:04 UTC
To: kvm; +Cc: x86, Keith Busch, Alex Williamson, Paolo Bonzini, Xu Liu
From: Keith Busch <kbusch@kernel.org>
Because people would like to use this (see "Link"), interpret the VEX
prefix and emulate mov instructions accordingly. The only AVX
instructions emulated here are the aligned and unaligned mov.
Everything else will fail as before.
This is new territory for me, so any feedback is appreciated.
To test, I executed the following program against a qemu emulated pci
device resource. Prior to this kernel patch, it would fail with
traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
And is successful with this kernel patch.
Test program, vmovdq.c:
#include <x86intrin.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
static inline void read_avx_reg(__m256i *data)
{
asm("vmovdqu %%ymm0, %0" : "=m"(*data));
}
static inline void write_avx_reg(const __m256i *data)
{
asm("vmovdqu %0, %%ymm0" : : "m"(*data));
}
int main(int argc, char **argv)
{
__m256i s, *d;
void *map;
int fd;
if (argc < 2) {
fprintf(stderr, "usage: %s <resource-file>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_RDWR | O_SYNC);
if (fd < 0) {
fprintf(stderr, "failed to open %s\n", argv[1]);
return 1;
}
map = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "failed to mmap %s\n", argv[1]);
return 1;
}
memset(&s, 0xd0, sizeof(s));
d = (__m256i *)map;
write_avx_reg(&s);
read_avx_reg(d);
write_avx_reg(d);
read_avx_reg(&s);
return 0;
}
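To build and run it (the compiler flags and resource path below are
illustrative, not from the original posting):
    gcc -O2 -mavx -o vmovdq vmovdq.c
    ./vmovdq /sys/bus/pci/devices/0000:00:05.0/resource0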
Link: https://lore.kernel.org/kvm/BD108C42-0382-4B17-B601-434A4BD038E7@fb.com/T/
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Xu Liu <liuxu@meta.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
arch/x86/kvm/emulate.c | 136 ++++++++++++++++++++++++++++++++-----
arch/x86/kvm/fpu.h | 62 +++++++++++++++++
arch/x86/kvm/kvm_emulate.h | 6 +-
3 files changed, 187 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e72aed25d7212..aad8da15b6b77 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1144,6 +1144,19 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
else
reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
+ if (ctxt->d & Avx) {
+ op->bytes = ctxt->op_bytes;
+ if (op->bytes == 16) {
+ op->type = OP_XMM;
+ op->addr.xmm = reg;
+ kvm_read_sse_reg(reg, &op->vec_val);
+ } else {
+ op->type = OP_YMM;
+ op->addr.ymm = reg;
+ kvm_read_avx_reg(reg, &op->vec_val2);
+ }
+ return;
+ }
if (ctxt->d & Sse) {
op->type = OP_XMM;
op->bytes = 16;
@@ -1177,13 +1190,24 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
u8 sib;
- int index_reg, base_reg, scale;
+ int index_reg = 0, base_reg = 0, scale = 0;
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
- index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
- base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+ if (ctxt->vex_prefix[0]) {
+ if ((ctxt->vex_prefix[1] & 0x80) == 0) /* VEX._R */
+ ctxt->modrm_reg = 8;
+ if (ctxt->vex_prefix[0] == 0xc4) {
+ if ((ctxt->vex_prefix[1] & 0x40) == 0) /* VEX._X */
+ index_reg = 8;
+ if ((ctxt->vex_prefix[1] & 0x20) == 0) /* VEX._B */
+ base_reg = 8;
+ }
+ } else {
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+ }
ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
@@ -1195,6 +1219,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
ctxt->d & ByteOp);
+ if (ctxt->d & Avx) {
+ op->bytes = ctxt->op_bytes;
+ if (op->bytes == 16) {
+ op->type = OP_XMM;
+ op->addr.xmm = ctxt->modrm_rm;
+ kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
+ } else {
+ op->type = OP_YMM;
+ op->addr.ymm = ctxt->modrm_rm;
+ kvm_read_avx_reg(ctxt->modrm_rm, &op->vec_val2);
+ }
+ return rc;
+ }
if (ctxt->d & Sse) {
op->type = OP_XMM;
op->bytes = 16;
@@ -1808,6 +1845,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
case OP_XMM:
kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
break;
+ case OP_YMM:
+ kvm_write_avx_reg(op->addr.ymm, &op->vec_val2);
+ break;
case OP_MM:
kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
break;
@@ -3232,7 +3272,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
return X86EMUL_CONTINUE;
}
@@ -4460,6 +4500,23 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
};
+static const struct gprefix pfx_avx_0f_6f_0f_7f = {
+ N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
+};
+
+static const struct opcode avx_0f_table[256] = {
+ /* 0x00 - 0x5f */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x60 - 0x6F */
+ X8(N), X4(N), X2(N), N,
+ GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ X8(N), X4(N), X2(N), N,
+ GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
+ /* 0x80 - 0xFF */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+};
+
static const struct instr_dual instr_dual_0f_38_f0 = {
I(DstReg | SrcMem | Mov, em_movbe), N
};
@@ -4724,6 +4781,41 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
return rc;
}
+static struct opcode x86_decode_avx(struct x86_emulate_ctxt *ctxt)
+{
+ u8 map, pp, l, v;
+
+ if (ctxt->vex_prefix[0] == 0xc5) {
+ pp = ctxt->vex_prefix[1] & 0x3; /* VEX.p1p0 */
+ l = ctxt->vex_prefix[1] & 0x4; /* VEX.L */
+ v = ~((ctxt->vex_prefix[1] >> 3) & 0xf) & 0xf; /* VEX.v3v2v1v0 */
+ map = 1; /* for 0f map */
+ ctxt->opcode_len = 2;
+ } else {
+ map = ctxt->vex_prefix[1] & 0x1f;
+ pp = ctxt->vex_prefix[2] & 0x3;
+ l = ctxt->vex_prefix[2] & 0x4;
+ v = ~((ctxt->vex_prefix[2] >> 3) & 0xf) & 0xf;
+ ctxt->opcode_len = 3;
+ }
+
+ if (l)
+ ctxt->op_bytes = 32;
+ else
+ ctxt->op_bytes = 16;
+
+ switch (pp) {
+ case 0: ctxt->rep_prefix = 0x00; break;
+ case 1: ctxt->rep_prefix = 0x66; break;
+ case 2: ctxt->rep_prefix = 0xf3; break;
+ case 3: ctxt->rep_prefix = 0xf2; break;
+ }
+
+ if (map == 1 && !v)
+ return avx_0f_table[ctxt->b];
+ return (struct opcode){.flags = NotImpl};
+}
+
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
@@ -4777,7 +4869,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
ctxt->op_bytes = def_op_bytes;
ctxt->ad_bytes = def_ad_bytes;
- /* Legacy prefixes. */
+ /* prefixes. */
for (;;) {
switch (ctxt->b = insn_fetch(u8, ctxt)) {
case 0x66: /* operand-size override */
@@ -4822,6 +4914,19 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
goto done_prefixes;
ctxt->rex_prefix = ctxt->b;
continue;
+ case 0xc4: /* VEX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->vex_prefix[0] = ctxt->b;
+ ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
+ ctxt->vex_prefix[2] = insn_fetch(u8, ctxt);
+ break;
+ case 0xc5: /* VEX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->vex_prefix[0] = ctxt->b;
+ ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
+ break;
case 0xf0: /* LOCK */
ctxt->lock_prefix = 1;
break;
@@ -4844,10 +4949,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
if (ctxt->rex_prefix & 8)
ctxt->op_bytes = 8; /* REX.W */
- /* Opcode byte(s). */
- opcode = opcode_table[ctxt->b];
- /* Two-byte opcode? */
- if (ctxt->b == 0x0f) {
+ if (ctxt->vex_prefix[0]) {
+ opcode = x86_decode_avx(ctxt);
+ } else if (ctxt->b == 0x0f) {
+ /* Two-byte opcode? */
ctxt->opcode_len = 2;
ctxt->b = insn_fetch(u8, ctxt);
opcode = twobyte_table[ctxt->b];
@@ -4858,18 +4963,16 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
ctxt->b = insn_fetch(u8, ctxt);
opcode = opcode_map_0f_38[ctxt->b];
}
+ } else {
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
}
+
ctxt->d = opcode.flags;
if (ctxt->d & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
- /* vex-prefix instructions are not implemented */
- if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
- (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
- ctxt->d = NotImpl;
- }
-
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -5091,6 +5194,7 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
/* Clear fields that are set conditionally but read without a guard. */
ctxt->rip_relative = false;
ctxt->rex_prefix = 0;
+ memset(ctxt->vex_prefix, 0, sizeof(ctxt->vex_prefix));;
ctxt->lock_prefix = 0;
ctxt->rep_prefix = 0;
ctxt->regs_valid = 0;
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
index 3ba12888bf66a..9bc08c3c53f5d 100644
--- a/arch/x86/kvm/fpu.h
+++ b/arch/x86/kvm/fpu.h
@@ -15,6 +15,54 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+typedef u32 __attribute__((vector_size(32))) avx256_t;
+
+static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
+ case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
+ case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
+ case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
+ case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
+ case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
+ case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
+ case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
+ case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
+ case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
+ case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
+ case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
+ case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
+ case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
+ case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
+ case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
+ case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
+ case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
+ case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
+ case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
+ case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
+ case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
+ case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
+ case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
+ case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
+ case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
+ case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
+ case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
+ case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
+ case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
{
switch (reg) {
@@ -109,6 +157,20 @@ static inline void kvm_fpu_put(void)
fpregs_unlock();
}
+static inline void kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
static inline void kvm_read_sse_reg(int reg, sse128_t *data)
{
kvm_fpu_get();
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 55a18e2f2dcd9..0e12f187e0b57 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -239,7 +239,7 @@ struct x86_emulate_ops {
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
unsigned int bytes;
unsigned int count;
union {
@@ -253,13 +253,16 @@ struct operand {
unsigned seg;
} mem;
unsigned xmm;
+ unsigned ymm;
unsigned mm;
} addr;
union {
unsigned long val;
u64 val64;
char valptr[sizeof(sse128_t)];
+ char valptr2[sizeof(avx256_t)];
sse128_t vec_val;
+ avx256_t vec_val2;
u64 mm_val;
void *data;
};
@@ -347,6 +350,7 @@ struct x86_emulate_ctxt {
bool rip_relative;
u8 rex_prefix;
+ u8 vex_prefix[3];
u8 lock_prefix;
u8 rep_prefix;
/* bitmaps of registers in _regs[] that can be read */
--
2.43.5
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Sean Christopherson @ 2024-08-21 16:12 UTC
To: Keith Busch; +Cc: kvm, x86, Keith Busch, Alex Williamson, Paolo Bonzini, Xu Liu
On Tue, Aug 20, 2024, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Because people would like to use this (see "Link"), interpret the VEX
Please explicitly call out the use case in the changelog. Yeah, I can follow
the link, but I shouldn't have to just to understand that this is the compiler
generating vmovdqu for its built-in memcpy().
> prefix and emulate mov instructions accordingly. The only AVX
> instructions emulated here are the aligned and unaligned mov.
> Everything else will fail as before.
>
> This is new territory for me, so any feedback is appreciated.
Heh, this is probably new territory for everyone except possibly Paolo. I don't
recall the last time KVM was effectively forced to add emulation for something
this gnarly.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Keith Busch @ 2024-08-21 16:34 UTC
To: Sean Christopherson
Cc: Keith Busch, kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Wed, Aug 21, 2024 at 09:12:20AM -0700, Sean Christopherson wrote:
> On Tue, Aug 20, 2024, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> >
> > Because people would like to use this (see "Link"), interpret the VEX
>
> Please explicitly call out the use case in the changelog. Yeah, I can follow
> the link, but I shouldn't have to just to understand that this is the compiler
> generating vmovdqu for its built-in memcpy().
Sorry about that, but yes, it's essentially a compiler using an
intrinsic-aware memcpy. That all works for MMIO addresses from PCI
passthrough functions, but currently fails on emulated device addresses.
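For illustration only, the kind of source that triggers it looks like this
(assuming gcc with -mavx or -march=x86-64-v3; the struct and function names
are made up, and exact codegen varies by compiler version):
#include <stdint.h>
#include <string.h>

struct dev_regs {
	uint64_t a, b, c, d;
};

/*
 * A plain 32-byte copy into a mapped BAR. With AVX enabled, gcc's
 * builtin memcpy commonly lowers this to vmovdqu loads/stores, which
 * KVM then has to emulate when the destination is emulated MMIO.
 */
void post_regs(volatile struct dev_regs *bar, const struct dev_regs *src)
{
	memcpy((void *)bar, src, sizeof(*src));
}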
> > prefix and emulate mov instructions accordingly. The only AVX
> > instructions emulated here are the aligned and unaligned mov.
> > Everything else will fail as before.
> >
> > This is new territory for me, so any feedback is appreciated.
>
> Heh, this is probably new territory for everyone except possibly Paolo. I don't
> recall the last time KVM was effectively forced to add emulation for something
> this gnarly.
Thanks, I feel a little less shame admitting this patch took me longer
to figure out than predicted. :)
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Tao Su @ 2024-08-22 7:22 UTC
To: Keith Busch; +Cc: kvm, x86, Keith Busch, Alex Williamson, Paolo Bonzini, Xu Liu
On Tue, Aug 20, 2024 at 04:04:31PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Because people would like to use this (see "Link"), interpret the VEX
> prefix and emulate mov instructions accordingly. The only AVX
> instructions emulated here are the aligned and unaligned mov.
> Everything else will fail as before.
>
> This is new territory for me, so any feedback is appreciated.
>
> To test, I executed the following program against a qemu emulated pci
> device resource. Prior to this kernel patch, it would fail with
>
> traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
>
> And is successful with this kernel patch.
>
> Test program, vmovdq.c:
>
> #include <x86intrin.h>
> #include <fcntl.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/mman.h>
>
> static inline void read_avx_reg(__m256i *data)
> {
> asm("vmovdqu %%ymm0, %0" : "=m"(*data));
> }
>
> static inline void write_avx_reg(const __m256i *data)
> {
> asm("vmovdqu %0, %%ymm0" : : "m"(*data));
> }
>
> int main(int argc, char **argv)
> {
> __m256i s, *d;
> void *map;
> int fd;
>
> if (argc < 2) {
> fprintf(stderr, "usage: %s <resource-file>\n", argv[0]);
> return 1;
> }
>
> fd = open(argv[1], O_RDWR | O_SYNC);
> if (fd < 0) {
> fprintf(stderr, "failed to open %s\n", argv[1]);
> return 1;
> }
>
> map = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> if (map == MAP_FAILED) {
> fprintf(stderr, "failed to mmap %s\n", argv[1]);
> return 1;
>
> }
>
> memset(&s, 0xd0, sizeof(s));
> d = (__m256i *)map;
>
> write_avx_reg(&s);
> read_avx_reg(d);
>
> write_avx_reg(d);
> read_avx_reg(&s);
>
> return 0;
> }
>
> Link: https://lore.kernel.org/kvm/BD108C42-0382-4B17-B601-434A4BD038E7@fb.com/T/
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Xu Liu <liuxu@meta.com>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> arch/x86/kvm/emulate.c | 136 ++++++++++++++++++++++++++++++++-----
> arch/x86/kvm/fpu.h | 62 +++++++++++++++++
> arch/x86/kvm/kvm_emulate.h | 6 +-
> 3 files changed, 187 insertions(+), 17 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index e72aed25d7212..aad8da15b6b77 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -1144,6 +1144,19 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
> else
> reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
>
> + if (ctxt->d & Avx) {
> + op->bytes = ctxt->op_bytes;
> + if (op->bytes == 16) {
> + op->type = OP_XMM;
> + op->addr.xmm = reg;
> + kvm_read_sse_reg(reg, &op->vec_val);
> + } else {
> + op->type = OP_YMM;
> + op->addr.ymm = reg;
> + kvm_read_avx_reg(reg, &op->vec_val2);
> + }
> + return;
> + }
> if (ctxt->d & Sse) {
> op->type = OP_XMM;
> op->bytes = 16;
> @@ -1177,13 +1190,24 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
> struct operand *op)
> {
> u8 sib;
> - int index_reg, base_reg, scale;
> + int index_reg = 0, base_reg = 0, scale = 0;
> int rc = X86EMUL_CONTINUE;
> ulong modrm_ea = 0;
>
> - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> + if (ctxt->vex_prefix[0]) {
> + if ((ctxt->vex_prefix[1] & 0x80) == 0) /* VEX._R */
> + ctxt->modrm_reg = 8;
> + if (ctxt->vex_prefix[0] == 0xc4) {
> + if ((ctxt->vex_prefix[1] & 0x40) == 0) /* VEX._X */
> + index_reg = 8;
> + if ((ctxt->vex_prefix[1] & 0x20) == 0) /* VEX._B */
> + base_reg = 8;
> + }
> + } else {
> + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> + }
>
> ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
> ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
> @@ -1195,6 +1219,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
> op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
> op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
> ctxt->d & ByteOp);
> + if (ctxt->d & Avx) {
> + op->bytes = ctxt->op_bytes;
> + if (op->bytes == 16) {
> + op->type = OP_XMM;
> + op->addr.xmm = ctxt->modrm_rm;
> + kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
> + } else {
> + op->type = OP_YMM;
> + op->addr.ymm = ctxt->modrm_rm;
> + kvm_read_avx_reg(ctxt->modrm_rm, &op->vec_val2);
> + }
> + return rc;
> + }
> if (ctxt->d & Sse) {
> op->type = OP_XMM;
> op->bytes = 16;
> @@ -1808,6 +1845,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
> case OP_XMM:
> kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
> break;
> + case OP_YMM:
> + kvm_write_avx_reg(op->addr.ymm, &op->vec_val2);
> + break;
> case OP_MM:
> kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
> break;
> @@ -3232,7 +3272,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
>
> static int em_mov(struct x86_emulate_ctxt *ctxt)
> {
> - memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
> + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
> return X86EMUL_CONTINUE;
> }
>
> @@ -4460,6 +4500,23 @@ static const struct opcode twobyte_table[256] = {
> N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
> };
>
> +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> + N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> +};
> +
> +static const struct opcode avx_0f_table[256] = {
> + /* 0x00 - 0x5f */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> + /* 0x60 - 0x6F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x70 - 0x7F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x80 - 0xFF */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> +};
> +
> static const struct instr_dual instr_dual_0f_38_f0 = {
> I(DstReg | SrcMem | Mov, em_movbe), N
> };
> @@ -4724,6 +4781,41 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
> return rc;
> }
>
> +static struct opcode x86_decode_avx(struct x86_emulate_ctxt *ctxt)
> +{
> + u8 map, pp, l, v;
> +
> + if (ctxt->vex_prefix[0] == 0xc5) {
> + pp = ctxt->vex_prefix[1] & 0x3; /* VEX.p1p0 */
> + l = ctxt->vex_prefix[1] & 0x4; /* VEX.L */
> + v = ~((ctxt->vex_prefix[1] >> 3) & 0xf) & 0xf; /* VEX.v3v2v1v0 */
> + map = 1; /* for 0f map */
> + ctxt->opcode_len = 2;
> + } else {
> + map = ctxt->vex_prefix[1] & 0x1f;
> + pp = ctxt->vex_prefix[2] & 0x3;
> + l = ctxt->vex_prefix[2] & 0x4;
> + v = ~((ctxt->vex_prefix[2] >> 3) & 0xf) & 0xf;
> + ctxt->opcode_len = 3;
> + }
> +
> + if (l)
> + ctxt->op_bytes = 32;
> + else
> + ctxt->op_bytes = 16;
> +
> + switch (pp) {
> + case 0: ctxt->rep_prefix = 0x00; break;
> + case 1: ctxt->rep_prefix = 0x66; break;
> + case 2: ctxt->rep_prefix = 0xf3; break;
> + case 3: ctxt->rep_prefix = 0xf2; break;
> + }
> +
> + if (map == 1 && !v)
> + return avx_0f_table[ctxt->b];
> + return (struct opcode){.flags = NotImpl};
Can we check whether the host supports AVX? I.e., if the host does not support
AVX, set NotImpl. I am thinking that if the host does not support AVX, the
guest executing AVX instructions could cause the host to panic, because the
host will execute AVX instructions during emulation.
Yeah, if the host does not support AVX, it presumably will not report AVX to
the guest, but the guest can always ignore the AVX check, as the test code in
this commit does.
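A minimal sketch of the check I have in mind (just my suggestion, not in the
patch; boot_cpu_has() is the kernel's host CPU feature test):

	/* Refuse AVX emulation outright if the host cannot execute it. */
	if (!boot_cpu_has(X86_FEATURE_AVX))
		return (struct opcode){.flags = NotImpl};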
> +}
> +
> int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
> {
> int rc = X86EMUL_CONTINUE;
> @@ -4777,7 +4869,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> ctxt->op_bytes = def_op_bytes;
> ctxt->ad_bytes = def_ad_bytes;
>
> - /* Legacy prefixes. */
> + /* prefixes. */
> for (;;) {
> switch (ctxt->b = insn_fetch(u8, ctxt)) {
> case 0x66: /* operand-size override */
> @@ -4822,6 +4914,19 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> goto done_prefixes;
> ctxt->rex_prefix = ctxt->b;
> continue;
> + case 0xc4: /* VEX */
> + if (mode != X86EMUL_MODE_PROT64)
> + goto done_prefixes;
> + ctxt->vex_prefix[0] = ctxt->b;
> + ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> + ctxt->vex_prefix[2] = insn_fetch(u8, ctxt);
> + break;
> + case 0xc5: /* VEX */
> + if (mode != X86EMUL_MODE_PROT64)
> + goto done_prefixes;
> + ctxt->vex_prefix[0] = ctxt->b;
> + ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> + break;
> case 0xf0: /* LOCK */
> ctxt->lock_prefix = 1;
> break;
> @@ -4844,10 +4949,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> if (ctxt->rex_prefix & 8)
> ctxt->op_bytes = 8; /* REX.W */
>
> - /* Opcode byte(s). */
> - opcode = opcode_table[ctxt->b];
> - /* Two-byte opcode? */
> - if (ctxt->b == 0x0f) {
> + if (ctxt->vex_prefix[0]) {
> + opcode = x86_decode_avx(ctxt);
> + } else if (ctxt->b == 0x0f) {
> + /* Two-byte opcode? */
> ctxt->opcode_len = 2;
> ctxt->b = insn_fetch(u8, ctxt);
> opcode = twobyte_table[ctxt->b];
> @@ -4858,18 +4963,16 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> ctxt->b = insn_fetch(u8, ctxt);
> opcode = opcode_map_0f_38[ctxt->b];
> }
> + } else {
> + /* Opcode byte(s). */
> + opcode = opcode_table[ctxt->b];
> }
> +
> ctxt->d = opcode.flags;
>
> if (ctxt->d & ModRM)
> ctxt->modrm = insn_fetch(u8, ctxt);
>
> - /* vex-prefix instructions are not implemented */
> - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
> - (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
> - ctxt->d = NotImpl;
> - }
> -
> while (ctxt->d & GroupMask) {
> switch (ctxt->d & GroupMask) {
> case Group:
> @@ -5091,6 +5194,7 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
> /* Clear fields that are set conditionally but read without a guard. */
> ctxt->rip_relative = false;
> ctxt->rex_prefix = 0;
> + memset(ctxt->vex_prefix, 0, sizeof(ctxt->vex_prefix));;
^^
Two ; here.
> ctxt->lock_prefix = 0;
> ctxt->rep_prefix = 0;
> ctxt->regs_valid = 0;
> diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
> index 3ba12888bf66a..9bc08c3c53f5d 100644
> --- a/arch/x86/kvm/fpu.h
> +++ b/arch/x86/kvm/fpu.h
> @@ -15,6 +15,54 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
> #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
> #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
>
> +typedef u32 __attribute__((vector_size(32))) avx256_t;
> +
> +static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
> +{
> + switch (reg) {
> + case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
> + case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
> + case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
> + case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
> + case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
> + case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
> + case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
> + case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
> + case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
> + case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
> + case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
> + case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
> + case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
> + case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
> + case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
> + case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
> + default: BUG();
> + }
> +}
> +
> +static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
> +{
> + switch (reg) {
> + case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
> + case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
> + case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
> + case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
> + case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
> + case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
> + case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
> + case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
> + case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
> + case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
> + case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
> + case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
> + case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
> + case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
> + case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
> + case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
> + default: BUG();
> + }
> +}
> +
> static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
> {
> switch (reg) {
> @@ -109,6 +157,20 @@ static inline void kvm_fpu_put(void)
> fpregs_unlock();
> }
>
> +static inline void kvm_read_avx_reg(int reg, avx256_t *data)
> +{
> + kvm_fpu_get();
> + _kvm_read_avx_reg(reg, data);
> + kvm_fpu_put();
> +}
> +
> +static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
> +{
> + kvm_fpu_get();
> + _kvm_write_avx_reg(reg, data);
> + kvm_fpu_put();
> +}
> +
> static inline void kvm_read_sse_reg(int reg, sse128_t *data)
> {
> kvm_fpu_get();
> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
> index 55a18e2f2dcd9..0e12f187e0b57 100644
> --- a/arch/x86/kvm/kvm_emulate.h
> +++ b/arch/x86/kvm/kvm_emulate.h
> @@ -239,7 +239,7 @@ struct x86_emulate_ops {
>
> /* Type, address-of, and value of an instruction's operand. */
> struct operand {
> - enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
> + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
> unsigned int bytes;
> unsigned int count;
> union {
> @@ -253,13 +253,16 @@ struct operand {
> unsigned seg;
> } mem;
> unsigned xmm;
> + unsigned ymm;
> unsigned mm;
> } addr;
> union {
> unsigned long val;
> u64 val64;
> char valptr[sizeof(sse128_t)];
> + char valptr2[sizeof(avx256_t)];
> sse128_t vec_val;
> + avx256_t vec_val2;
> u64 mm_val;
> void *data;
> };
> @@ -347,6 +350,7 @@ struct x86_emulate_ctxt {
>
> bool rip_relative;
> u8 rex_prefix;
> + u8 vex_prefix[3];
> u8 lock_prefix;
> u8 rep_prefix;
> /* bitmaps of registers in _regs[] that can be read */
> --
> 2.43.5
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Keith Busch @ 2024-08-22 14:39 UTC
To: Tao Su; +Cc: Keith Busch, kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Thu, Aug 22, 2024 at 03:22:35PM +0800, Tao Su wrote:
> On Tue, Aug 20, 2024 at 04:04:31PM -0700, Keith Busch wrote:
> > + if (map == 1 && !v)
> > + return avx_0f_table[ctxt->b];
> > + return (struct opcode){.flags = NotImpl};
>
> Can we check whether the host supports AVX? I.e. if the host does not support
> AVX, set NotImpl. I am thinking that if the host does not support AVX, perhaps
> the guest executing AVX instructions will cause the host to panic, because the
> host will execute AVX instructions during the simulation.
>
> Yeah if the host does not support AVX, it may not report AVX to the guest, but
> the guest can always ignore the AVX check, such as the code in the commit.
That's a good thought. Here is how I rationalized not adding additional
checks for it:
If the guest cpu doesn't support AVX, I think it should fail then and
there rather than trap to the hypervisor running on the host, so this
new code wouldn't get a chance to attempt emulating it.
In the case where the host doesn't support AVX but the guest does, I assume
the VM is running on an emulated CPU and not using KVM acceleration anymore.
Anyway, I haven't tried it, so not entirely confident that's how this
all works. I was mainly following the existing SSE emulations, which
don't have CPU support checks either. I don't think it's a problem to
add such checks though, so happy to do it if needed.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Tao Su @ 2024-08-23 3:47 UTC
To: Keith Busch; +Cc: Keith Busch, kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Thu, Aug 22, 2024 at 08:39:19AM -0600, Keith Busch wrote:
> On Thu, Aug 22, 2024 at 03:22:35PM +0800, Tao Su wrote:
> > On Tue, Aug 20, 2024 at 04:04:31PM -0700, Keith Busch wrote:
> > > + if (map == 1 && !v)
> > > + return avx_0f_table[ctxt->b];
> > > + return (struct opcode){.flags = NotImpl};
> >
> > Can we check whether the host supports AVX? I.e. if the host does not support
> > AVX, set NotImpl. I am thinking that if the host does not support AVX, perhaps
> > the guest executing AVX instructions will cause the host to panic, because the
> > host will execute AVX instructions during the simulation.
> >
> > Yeah if the host does not support AVX, it may not report AVX to the guest, but
> > the guest can always ignore the AVX check, such as the code in the commit.
>
> That's a good thought. Here is how I rationalized not adding additional
> checks for it:
>
> If the guest cpu doesn't support AVX, I think it should fail then and
> there rather than trap to the hypervisor running on the host, so this
> new code wouldn't get a chance to attempt emulating it.
>
Per SDM:
If YMM state management is not enabled by an operating system, Intel AVX
instructions will #UD regardless of CPUID.1:ECX.AVX[bit 28].
Host and guest can set different xstates, so it is possible to trigger this,
i.e. the host clears XCR0[2] but the guest sets it. But I don't see how this
case can occur now, so just ignore my concern if no one else wants to do
that :-)
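If someone did want to enforce it, a hedged sketch of a check run before
emulating the instruction (the CR4 read uses the existing emulator hook;
inspecting guest XCR0[2] would need a new hook, so that part is only a
comment here):

	/* #UD if the guest OS has not enabled XSAVE/YMM state management. */
	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
		return emulate_ud(ctxt);
	/* A new get_xcr0()-style hook would also need to test XFEATURE_MASK_YMM. */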
> In the case where the host doesn't support AVX but the guest does, I assume
> the VM is running on an emulated CPU and not using KVM acceleration anymore.
>
> Anyway, I haven't tried it, so not entirely confident that's how this
> all works. I was mainly following the existing SSE emulations, which
> don't have CPU support checks either. I don't think it's a problem to
> add such checks though, so happy to do it if needed.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Keith Busch @ 2024-09-03 21:25 UTC
To: Keith Busch; +Cc: kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Tue, Aug 20, 2024 at 04:04:31PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Because people would like to use this (see "Link"), interpret the VEX
> prefix and emulate mov instructions accordingly. The only AVX
> instructions emulated here are the aligned and unaligned mov.
> Everything else will fail as before.
Hey, checking back in on this since it's been a couple weeks. The only
feedback so far has been either commit-log changes or cosmetic code fixes.
I'm happy to provide more details or test cases if needed. I'll also be
at LPC if in-person discussions might be useful.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Sean Christopherson @ 2024-09-22 12:57 UTC
To: Keith Busch; +Cc: kvm, x86, Keith Busch, Alex Williamson, Paolo Bonzini, Xu Liu
On Tue, Aug 20, 2024, Keith Busch wrote:
> To test, I executed the following program against a qemu emulated pci
> device resource. Prior to this kernel patch, it would fail with
>
> traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
...
> +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> + N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> +};
> +
> +static const struct opcode avx_0f_table[256] = {
> + /* 0x00 - 0x5f */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> + /* 0x60 - 0x6F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x70 - 0x7F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x80 - 0xFF */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> +};
Mostly as an FYI, we're likely going to run into more than just VMOVDQU sooner
rather than later. E.g. gcc-13 with -march=x86-64-v3 (which per Vitaly is now
the default gcc behavior for some distros[*]) compiles this chunk from KVM
selftests' kvm_fixup_exception():
regs->rip = regs->r11;
regs->r9 = regs->vector;
regs->r10 = regs->error_code;
into this monstrosity (which is clever, but oof).
405313: c4 e1 f9 6e c8 vmovq %rax,%xmm1
405318: 48 89 68 08 mov %rbp,0x8(%rax)
40531c: 48 89 e8 mov %rbp,%rax
40531f: c4 c3 f1 22 c4 01 vpinsrq $0x1,%r12,%xmm1,%xmm0
405325: 49 89 6d 38 mov %rbp,0x38(%r13)
405329: c5 fa 7f 45 00 vmovdqu %xmm0,0x0(%rbp)
I wouldn't be surprised if the same packing shenanigans get employed when generating
code for a struct overlay of emulated MMIO.
[*] https://lore.kernel.org/all/20240920154422.2890096-1-vkuznets@redhat.com
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Keith Busch @ 2024-09-25 8:09 UTC
To: Sean Christopherson
Cc: Keith Busch, kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Sun, Sep 22, 2024 at 05:57:05AM -0700, Sean Christopherson wrote:
> On Tue, Aug 20, 2024, Keith Busch wrote:
> > To test, I executed the following program against a qemu emulated pci
> > device resource. Prior to this kernel patch, it would fail with
> >
> > traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
>
> ...
>
> > +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> > + N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> > +};
> > +
> > +static const struct opcode avx_0f_table[256] = {
> > + /* 0x00 - 0x5f */
> > + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> > + /* 0x60 - 0x6F */
> > + X8(N), X4(N), X2(N), N,
> > + GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> > + /* 0x70 - 0x7F */
> > + X8(N), X4(N), X2(N), N,
> > + GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> > + /* 0x80 - 0xFF */
> > + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> > +};
>
> Mostly as an FYI, we're likely going to run into more than just VMOVDQU sooner
> rather than later. E.g. gcc-13 with -march=x86-64-v3 (which per Vitaly is now
> the default gcc behavior for some distros[*]) compiles this chunk from KVM
> selftests' kvm_fixup_exception():
>
> regs->rip = regs->r11;
> regs->r9 = regs->vector;
> regs->r10 = regs->error_code;
>
> into this monstrosity (which is clever, but oof).
>
> 405313: c4 e1 f9 6e c8 vmovq %rax,%xmm1
> 405318: 48 89 68 08 mov %rbp,0x8(%rax)
> 40531c: 48 89 e8 mov %rbp,%rax
> 40531f: c4 c3 f1 22 c4 01 vpinsrq $0x1,%r12,%xmm1,%xmm0
> 405325: 49 89 6d 38 mov %rbp,0x38(%r13)
> 405329: c5 fa 7f 45 00 vmovdqu %xmm0,0x0(%rbp)
>
> I wouldn't be surprised if the same packing shenanigans get employed when generating
> code for a struct overlay of emulated MMIO.
Thanks for the notice. I'm hoping we can proceed with just the mov
instructions for now, unless someone already has a real use for these on
emulated MMIO. Otherwise, we can cross that bridge when we get there.
As it is, if just the vmovdq[u,a] are okay, I have a follow-on for
vmovdqu64, though I'm currently having trouble adding AVX-512 registers.
Simply increasing the size of the struct x86_emulate_ctxt appears to
break something even without trying to emulate those instructions. But I
want to wait to see if this first part is okay before spending too much
time on it.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Sean Christopherson @ 2024-09-25 13:46 UTC
To: Keith Busch; +Cc: Keith Busch, kvm, x86, Alex Williamson, Paolo Bonzini, Xu Liu
On Wed, Sep 25, 2024, Keith Busch wrote:
> On Sun, Sep 22, 2024 at 05:57:05AM -0700, Sean Christopherson wrote:
> > On Tue, Aug 20, 2024, Keith Busch wrote:
> > > To test, I executed the following program against a qemu emulated pci
> > > device resource. Prior to this kernel patch, it would fail with
> > >
> > > traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
> >
> > ...
> >
> > > +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> > > + N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> > > +};
> > > +
> > > +static const struct opcode avx_0f_table[256] = {
> > > + /* 0x00 - 0x5f */
> > > + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> > > + /* 0x60 - 0x6F */
> > > + X8(N), X4(N), X2(N), N,
> > > + GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> > > + /* 0x70 - 0x7F */
> > > + X8(N), X4(N), X2(N), N,
> > > + GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> > > + /* 0x80 - 0xFF */
> > > + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> > > +};
> >
> > Mostly as an FYI, we're likely going to run into more than just VMOVDQU sooner
> > rather than later. E.g. gcc-13 with -march=x86-64-v3 (which per Vitaly is now
> > the default gcc behavior for some distros[*]) compiles this chunk from KVM
> > selftests' kvm_fixup_exception():
> >
> > regs->rip = regs->r11;
> > regs->r9 = regs->vector;
> > regs->r10 = regs->error_code;
> >
> > into this monstrosity (which is clever, but oof).
> >
> > 405313: c4 e1 f9 6e c8 vmovq %rax,%xmm1
> > 405318: 48 89 68 08 mov %rbp,0x8(%rax)
> > 40531c: 48 89 e8 mov %rbp,%rax
> > 40531f: c4 c3 f1 22 c4 01 vpinsrq $0x1,%r12,%xmm1,%xmm0
> > 405325: 49 89 6d 38 mov %rbp,0x38(%r13)
> > 405329: c5 fa 7f 45 00 vmovdqu %xmm0,0x0(%rbp)
> >
> > I wouldn't be surprised if the same packing shenanigans get employed when generating
> > code for a struct overlay of emulated MMIO.
>
> Thanks for the notice. I'm hoping we can proceed with just the mov
> instructions for now, unless someone already has a real use for these on
> emulated MMIO. Otherwise, we can cross that bridge when we get there.
Oh, yeah, for sure. The FYI was really for Paolo, e.g. to make sure we don't make
assumptions in the emulator or something and make our future lives harder (I haven't
looked at your patch in any detail, so my fears could be completely unfounded).
> As it is, if just the vmovdq[u,a] are okay, I have a follow-on for
> vmovdqu64, though I'm currently having trouble adding AVX-512 registers.
> Simply increasing the size of the struct x86_emulate_ctxt appears to
> break something even without trying to emulate those instructions. But I
> want to wait to see if this first part is okay before spending too much
> time on it.
* Re: [PATCH RFC] kvm: emulate avx vmovdq
From: Paolo Bonzini @ 2025-11-04 17:40 UTC
To: Keith Busch, kvm; +Cc: x86, Keith Busch, Alex Williamson, Xu Liu
On 8/21/24 01:04, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Because people would like to use this (see "Link"), interpret the VEX
> prefix and emulate mov instrutions accordingly. The only avx
> instructions emulated here are the aligned and unaligned mov.
> Everything else will fail as before.
>
> This is new territory for me, so any feedback is appreciated.
>
> To test, I executed the following program against a qemu emulated pci
> device resource. Prior to this kernel patch, it would fail with
>
> traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
>
> And is successful with this kernel patch.
It's been a while but I am going to revive this patch (fix it and resend
it)... Here are a couple notes on what can be done.
> - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> + if (ctxt->vex_prefix[0]) {
> + if ((ctxt->vex_prefix[1] & 0x80) == 0) /* VEX._R */
> + ctxt->modrm_reg = 8;
> + if (ctxt->vex_prefix[0] == 0xc4) {
> + if ((ctxt->vex_prefix[1] & 0x40) == 0) /* VEX._X */
> + index_reg = 8;
> + if ((ctxt->vex_prefix[1] & 0x20) == 0) /* VEX._B */
> + base_reg = 8;
> + }
> + } else {
> + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> + }
It's easier to do all the VEX decoding straight into rex_prefix in
x86_decode_avx.
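Roughly like this (untested sketch; VEX stores R/X/B inverted, and the REX
byte layout is 0100WRXB, so the inverted bits shift straight into place):

	u8 b1 = ctxt->vex_prefix[1];

	if (ctxt->vex_prefix[0] == 0xc5)
		ctxt->rex_prefix = 0x40 | ((~b1 >> 5) & 4);	/* ~R only */
	else
		ctxt->rex_prefix = 0x40 | ((~b1 >> 5) & 7);	/* ~R/~X/~B */

Then decode_modrm() keeps its existing REX-based extraction unchanged.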
> ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
> ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
> @@ -1195,6 +1219,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
> op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
> op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
> ctxt->d & ByteOp);
> + if (ctxt->d & Avx) {
> + op->bytes = ctxt->op_bytes;
> + if (op->bytes == 16) {
> + op->type = OP_XMM;
> + op->addr.xmm = ctxt->modrm_rm;
> + kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
> + } else {
> + op->type = OP_YMM;
> + op->addr.ymm = ctxt->modrm_rm;
> + kvm_read_avx_reg(ctxt->modrm_rm, &op->vec_val2);
> + }
> + return rc;
> + }
The duplication with decode_register_operand sucks, I'll check what can
be done about it.
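One option is a small shared helper, sketched here (name is mine), called
with 'reg' from decode_register_operand() and with ctxt->modrm_rm from
decode_modrm():

static void decode_avx_reg_operand(struct x86_emulate_ctxt *ctxt,
				   struct operand *op, int reg)
{
	op->bytes = ctxt->op_bytes;
	if (op->bytes == 16) {
		op->type = OP_XMM;
		op->addr.xmm = reg;
		kvm_read_sse_reg(reg, &op->vec_val);
	} else {
		op->type = OP_YMM;
		op->addr.ymm = reg;
		kvm_read_avx_reg(reg, &op->vec_val2);
	}
}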
> if (ctxt->d & Sse) {
> op->type = OP_XMM;
> op->bytes = 16;
> @@ -1808,6 +1845,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
> case OP_XMM:
> kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
> break;
> + case OP_YMM:
> + kvm_write_avx_reg(op->addr.ymm, &op->vec_val2);
> + break;
> case OP_MM:
> kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
> break;
> @@ -3232,7 +3272,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
>
> static int em_mov(struct x86_emulate_ctxt *ctxt)
> {
> - memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
> + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
The idea here was that copying everything is faster because the size is
constant. 256 bits starts to be relatively hefty, but still only 4
words. Maybe worth adding an "if (ctxt->op_bytes <= 8)".
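Something along these lines (sketch; the constant-size 8-byte copy may
over-copy for smaller operands, which is safe because valptr is always at
least 16 bytes, matching the old copy-everything behavior):

static int em_mov(struct x86_emulate_ctxt *ctxt)
{
	if (ctxt->op_bytes <= 8)
		memcpy(ctxt->dst.valptr, ctxt->src.valptr, 8);
	else
		memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
	return X86EMUL_CONTINUE;
}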
> +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> + N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> +};
> +
> +static const struct opcode avx_0f_table[256] = {
> + /* 0x00 - 0x5f */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> + /* 0x60 - 0x6F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x70 - 0x7F */
> + X8(N), X4(N), X2(N), N,
> + GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> + /* 0x80 - 0xFF */
> + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> +};
Can't blame you for duplicating the table, as that's the easiest way to
do it. I'll check if I can reuse some ideas from QEMU on how to avoid that.
> +static struct opcode x86_decode_avx(struct x86_emulate_ctxt *ctxt)
> +{
> + u8 map, pp, l, v;
Should check that there are no 0x66/0xf2/0xf3 prefixes.
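i.e. something like this sketch, since a VEX prefix combined with a legacy
66/F2/F3, LOCK, or REX prefix is #UD on real hardware (the 0x66 operand-size
prefix isn't tracked in a dedicated field today, so that part would need new
plumbing):

	if (ctxt->rep_prefix || ctxt->rex_prefix || ctxt->lock_prefix)
		return (struct opcode){.flags = NotImpl};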
> + if (ctxt->vex_prefix[0] == 0xc5) {
> + pp = ctxt->vex_prefix[1] & 0x3; /* VEX.p1p0 */
> + l = ctxt->vex_prefix[1] & 0x4; /* VEX.L */
> + v = ~((ctxt->vex_prefix[1] >> 3) & 0xf) & 0xf; /* VEX.v3v2v1v0 */
> + map = 1; /* for 0f map */
> + ctxt->opcode_len = 2;
> + } else {
> + map = ctxt->vex_prefix[1] & 0x1f;
> + pp = ctxt->vex_prefix[2] & 0x3;
> + l = ctxt->vex_prefix[2] & 0x4;
> + v = ~((ctxt->vex_prefix[2] >> 3) & 0xf) & 0xf;
> + ctxt->opcode_len = 3;
> + }
> +
> + if (l)
> + ctxt->op_bytes = 32;
> + else
> + ctxt->op_bytes = 16;
> +
> + switch (pp) {
> + case 0: ctxt->rep_prefix = 0x00; break;
> + case 1: ctxt->rep_prefix = 0x66; break;
> + case 2: ctxt->rep_prefix = 0xf3; break;
> + case 3: ctxt->rep_prefix = 0xf2; break;
> + }
> +
> + if (map == 1 && !v)
> + return avx_0f_table[ctxt->b];
> + return (struct opcode){.flags = NotImpl};
> +}
> +
> int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
> {
> int rc = X86EMUL_CONTINUE;
> @@ -4777,7 +4869,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> ctxt->op_bytes = def_op_bytes;
> ctxt->ad_bytes = def_ad_bytes;
>
> - /* Legacy prefixes. */
> + /* prefixes. */
> for (;;) {
> switch (ctxt->b = insn_fetch(u8, ctxt)) {
> case 0x66: /* operand-size override */
> @@ -4822,6 +4914,19 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> goto done_prefixes;
> ctxt->rex_prefix = ctxt->b;
> continue;
> + case 0xc4: /* VEX */
> + if (mode != X86EMUL_MODE_PROT64)
> + goto done_prefixes;
VEX prefixes can actually be used in 32-bit modes as long as bits 7:6
are 11 in binary. Might actually do that, since we don't support
lds/les instructions at all in the emulator.
Also I'll move all the fetches to x86_decode_avx as well. Just do a
"break" here...
> + ctxt->vex_prefix[0] = ctxt->b;
> + ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> + ctxt->vex_prefix[2] = insn_fetch(u8, ctxt);
> + break;
> + case 0xc5: /* VEX */
> + if (mode != X86EMUL_MODE_PROT64)
> + goto done_prefixes;
> + ctxt->vex_prefix[0] = ctxt->b;
> + ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> + break;
> case 0xf0: /* LOCK */
> ctxt->lock_prefix = 1;
> break;
> @@ -4844,10 +4949,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> if (ctxt->rex_prefix & 8)
> ctxt->op_bytes = 8; /* REX.W */
>
> - /* Opcode byte(s). */
> - opcode = opcode_table[ctxt->b];
> - /* Two-byte opcode? */
> - if (ctxt->b == 0x0f) {
> + if (ctxt->vex_prefix[0]) {
> + opcode = x86_decode_avx(ctxt);
> + } else if (ctxt->b == 0x0f) {
> + /* Two-byte opcode? */
> ctxt->opcode_len = 2;
> ctxt->b = insn_fetch(u8, ctxt);
> opcode = twobyte_table[ctxt->b];
> @@ -4858,18 +4963,16 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
> ctxt->b = insn_fetch(u8, ctxt);
> opcode = opcode_map_0f_38[ctxt->b];
> }
> + } else {
> + /* Opcode byte(s). */
> + opcode = opcode_table[ctxt->b];
> }
> +
> ctxt->d = opcode.flags;
... and call out to x86_decode_avx here for the actual processing of the
prefix:
if (ctxt->opcode_len == 1 && (ctxt->b == 0xc4 || ctxt->b == 0xc5)) {
int modrm = insn_fetch(u8, ctxt);
if (mode == X86EMUL_MODE_PROT64 || (modrm & 0xc0) == 0xc0) {
opcode = x86_decode_avx(ctxt, ctxt->b, modrm);
modrm = insn_fetch(u8, ctxt);
}
ctxt->modrm = modrm;
} else if (ctxt->d & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
Nevertheless, thanks so much for writing this and sorry for dropping it
on the floor for so long. It's way overdue.
Paolo