* [PATCH] x86: AVX instruction emulation fixes
@ 2013-08-28 8:32 Jan Beulich
2013-08-28 8:49 ` Fabio Fantoni
2013-08-28 12:49 ` Keir Fraser
0 siblings, 2 replies; 4+ messages in thread
From: Jan Beulich @ 2013-08-28 8:32 UTC (permalink / raw)
To: xen-devel; +Cc: Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 8813 bytes --]
- we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
one as the second prefix byte
- early decoding normalized vex.reg, thus corrupting it for the main
consumer (copy_REX_VEX()), resulting in #UD on the two-operand
instructions we emulate
Also add respective test cases to the testing utility plus
- fix get_fpu() (the fall-through order was inverted)
- add cpu_has_avx2, even if it's currently unused (as in the new test
cases I decided to refrain from using AVX2 instructions in order to
be able to actually run all the tests on the hardware I have)
- slightly tweak cpu_has_avx to more consistently express the outputs
we don't care about (sinking them all into the same variable)
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x
}
#define cpu_has_avx ({ \
- unsigned int eax = 1, ecx = 0, edx; \
- cpuid(&eax, &edx, &ecx, &edx, NULL); \
+ unsigned int eax = 1, ecx = 0; \
+ cpuid(&eax, &eax, &ecx, &eax, NULL); \
if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
ecx = 0; \
(ecx & (1U << 28)) != 0; \
})
+#define cpu_has_avx2 ({ \
+ unsigned int eax = 1, ebx, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+ ebx = 0; \
+ else { \
+ eax = 7, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ } \
+ (ebx & (1U << 5)) != 0; \
+})
+
int get_fpu(
void (*exception_callback)(void *, struct cpu_user_regs *),
void *exception_callback_arg,
@@ -111,14 +123,14 @@ int get_fpu(
{
case X86EMUL_FPU_fpu:
break;
- case X86EMUL_FPU_ymm:
- if ( cpu_has_avx )
+ case X86EMUL_FPU_mmx:
+ if ( cpu_has_mmx )
break;
case X86EMUL_FPU_xmm:
if ( cpu_has_sse )
break;
- case X86EMUL_FPU_mmx:
- if ( cpu_has_mmx )
+ case X86EMUL_FPU_ymm:
+ if ( cpu_has_avx )
break;
default:
return X86EMUL_UNHANDLEABLE;
@@ -629,6 +641,73 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_to_mem[];
+
+ asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n"
+ ".popsection" :: "c" (NULL) );
+
+ memcpy(instr, vmovdqu_to_mem, 15);
+ memset(res, 0x55, 128);
+ memset(res + 16, 0xff, 16);
+ memset(res + 20, 0x00, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovdqu (%edx),%ymm4...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_from_mem[];
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
+#else
+ asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
+ "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
+#endif
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovdqu_from_mem, 15);
+ memset(res + 4, 0xff, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( rc != 0xffffffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
memset(res, 0x77, 64);
memset(res + 10, 0x66, 8);
@@ -683,6 +762,59 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovsd %xmm5,(%ecx)...");
+ memset(res, 0x88, 64);
+ memset(res + 10, 0x77, 8);
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovsd_to_mem[];
+
+ asm volatile ( "vbroadcastsd %0, %%ymm5\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n"
+ ".popsection" :: "m" (res[10]), "c" (NULL) );
+
+ memcpy(instr, vmovsd_to_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)(res + 2);
+ regs.edx = 0;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ {
+ printf("skipped\n");
+ memset(res + 2, 0x77, 8);
+ }
+
+ printf("%-40s", "Testing vmovaps (%edx),%ymm7...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovaps_from_mem[];
+
+ asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovaps_from_mem: vmovaps (%0), %%ymm7\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovaps_from_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+ asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t"
+ "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) );
+ if ( rc != 0xff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
for ( j = 1; j <= 2; j++ )
{
#if defined(__i386__)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1454,10 +1454,10 @@ x86_emulate(
/* VEX */
generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
- vex.raw[0] = b;
+ vex.raw[0] = modrm;
if ( b & 1 )
{
- vex.raw[1] = b;
+ vex.raw[1] = modrm;
vex.opcx = vex_0f;
vex.x = 1;
vex.b = 1;
@@ -1479,10 +1479,7 @@ x86_emulate(
}
}
}
- vex.reg ^= 0xf;
- if ( !mode_64bit() )
- vex.reg &= 0x7;
- else if ( !vex.r )
+ if ( mode_64bit() && !vex.r )
rex_prefix |= REX_R;
fail_if(vex.opcx != vex_0f);
@@ -3899,8 +3896,9 @@ x86_emulate(
else
{
fail_if((vex.opcx != vex_0f) ||
- (vex.reg && ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+ ((vex.reg != 0xf) &&
+ ((ea.type == OP_MEM) ||
+ !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
ea.bytes = 16 << vex.l;
@@ -4168,7 +4166,7 @@ x86_emulate(
}
else
{
- fail_if((vex.opcx != vex_0f) || vex.reg ||
+ fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
[-- Attachment #2: x86-emul-test-AVX.patch --]
[-- Type: text/plain, Size: 8849 bytes --]
x86: AVX instruction emulation fixes
- we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
one as the second prefix byte
- early decoding normalized vex.reg, thus corrupting it for the main
consumer (copy_REX_VEX()), resulting in #UD on the two-operand
instructions we emulate
Also add respective test cases to the testing utility plus
- fix get_fpu() (the fall-through order was inverted)
- add cpu_has_avx2, even if it's currently unused (as in the new test
cases I decided to refrain from using AVX2 instructions in order to
be able to actually run all the tests on the hardware I have)
- slightly tweak cpu_has_avx to more consistently express the outputs
we don't care about (sinking them all into the same variable)
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x
}
#define cpu_has_avx ({ \
- unsigned int eax = 1, ecx = 0, edx; \
- cpuid(&eax, &edx, &ecx, &edx, NULL); \
+ unsigned int eax = 1, ecx = 0; \
+ cpuid(&eax, &eax, &ecx, &eax, NULL); \
if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
ecx = 0; \
(ecx & (1U << 28)) != 0; \
})
+#define cpu_has_avx2 ({ \
+ unsigned int eax = 1, ebx, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+ ebx = 0; \
+ else { \
+ eax = 7, ecx = 0; \
+ cpuid(&eax, &ebx, &ecx, &eax, NULL); \
+ } \
+ (ebx & (1U << 5)) != 0; \
+})
+
int get_fpu(
void (*exception_callback)(void *, struct cpu_user_regs *),
void *exception_callback_arg,
@@ -111,14 +123,14 @@ int get_fpu(
{
case X86EMUL_FPU_fpu:
break;
- case X86EMUL_FPU_ymm:
- if ( cpu_has_avx )
+ case X86EMUL_FPU_mmx:
+ if ( cpu_has_mmx )
break;
case X86EMUL_FPU_xmm:
if ( cpu_has_sse )
break;
- case X86EMUL_FPU_mmx:
- if ( cpu_has_mmx )
+ case X86EMUL_FPU_ymm:
+ if ( cpu_has_avx )
break;
default:
return X86EMUL_UNHANDLEABLE;
@@ -629,6 +641,73 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_to_mem[];
+
+ asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n"
+ ".popsection" :: "c" (NULL) );
+
+ memcpy(instr, vmovdqu_to_mem, 15);
+ memset(res, 0x55, 128);
+ memset(res + 16, 0xff, 16);
+ memset(res + 20, 0x00, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovdqu (%edx),%ymm4...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovdqu_from_mem[];
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
+#else
+ asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
+ "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
+#endif
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovdqu_from_mem, 15);
+ memset(res + 4, 0xff, 16);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( rc != 0xffffffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
memset(res, 0x77, 64);
memset(res + 10, 0x66, 8);
@@ -683,6 +762,59 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovsd %xmm5,(%ecx)...");
+ memset(res, 0x88, 64);
+ memset(res + 10, 0x77, 8);
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovsd_to_mem[];
+
+ asm volatile ( "vbroadcastsd %0, %%ymm5\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n"
+ ".popsection" :: "m" (res[10]), "c" (NULL) );
+
+ memcpy(instr, vmovsd_to_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = (unsigned long)(res + 2);
+ regs.edx = 0;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ {
+ printf("skipped\n");
+ memset(res + 2, 0x77, 8);
+ }
+
+ printf("%-40s", "Testing vmovaps (%edx),%ymm7...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ extern const unsigned char vmovaps_from_mem[];
+
+ asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n"
+ ".pushsection .test, \"a\", @progbits\n"
+ "vmovaps_from_mem: vmovaps (%0), %%ymm7\n"
+ ".popsection" :: "d" (NULL) );
+
+ memcpy(instr, vmovaps_from_mem, 15);
+ regs.eip = (unsigned long)&instr[0];
+ regs.ecx = 0;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY )
+ goto fail;
+ asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t"
+ "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) );
+ if ( rc != 0xff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
for ( j = 1; j <= 2; j++ )
{
#if defined(__i386__)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1454,10 +1454,10 @@ x86_emulate(
/* VEX */
generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
- vex.raw[0] = b;
+ vex.raw[0] = modrm;
if ( b & 1 )
{
- vex.raw[1] = b;
+ vex.raw[1] = modrm;
vex.opcx = vex_0f;
vex.x = 1;
vex.b = 1;
@@ -1479,10 +1479,7 @@ x86_emulate(
}
}
}
- vex.reg ^= 0xf;
- if ( !mode_64bit() )
- vex.reg &= 0x7;
- else if ( !vex.r )
+ if ( mode_64bit() && !vex.r )
rex_prefix |= REX_R;
fail_if(vex.opcx != vex_0f);
@@ -3899,8 +3896,9 @@ x86_emulate(
else
{
fail_if((vex.opcx != vex_0f) ||
- (vex.reg && ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
+ ((vex.reg != 0xf) &&
+ ((ea.type == OP_MEM) ||
+ !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
ea.bytes = 16 << vex.l;
@@ -4168,7 +4166,7 @@ x86_emulate(
}
else
{
- fail_if((vex.opcx != vex_0f) || vex.reg ||
+ fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
vcpu_must_have_avx();
get_fpu(X86EMUL_FPU_ymm, &fic);
[-- Attachment #3: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH] x86: AVX instruction emulation fixes
2013-08-28 8:32 [PATCH] x86: AVX instruction emulation fixes Jan Beulich
@ 2013-08-28 8:49 ` Fabio Fantoni
2013-08-28 8:56 ` Andrew Cooper
2013-08-28 12:49 ` Keir Fraser
1 sibling, 1 reply; 4+ messages in thread
From: Fabio Fantoni @ 2013-08-28 8:49 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser
[-- Attachment #1.1: Type: text/plain, Size: 9453 bytes --]
Il 28/08/2013 10:32, Jan Beulich ha scritto:
> - we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
> one as the second prefix byte
> - early decoding normalized vex.reg, thus corrupting it for the main
> consumer (copy_REX_VEX()), resulting in #UD on the two-operand
> instructions we emulate
>
> Also add respective test cases to the testing utility plus
> - fix get_fpu() (the fall-through order was inverted)
> - add cpu_has_avx2, even if it's currently unused (as in the new test
> cases I decided to refrain from using AVX2 instructions in order to
> be able to actually run all the tests on the hardware I have)
> - slightly tweak cpu_has_avx to more consistently express the outputs
> we don't care about (sinking them all into the same variable)
This patch include the solution for full SSE support needed to solve
this problem?
http://bugs.xenproject.org/xen/bug/11
Thanks for any reply.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x
> }
>
> #define cpu_has_avx ({ \
> - unsigned int eax = 1, ecx = 0, edx; \
> - cpuid(&eax, &edx, &ecx, &edx, NULL); \
> + unsigned int eax = 1, ecx = 0; \
> + cpuid(&eax, &eax, &ecx, &eax, NULL); \
> if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
> ecx = 0; \
> (ecx & (1U << 28)) != 0; \
> })
>
> +#define cpu_has_avx2 ({ \
> + unsigned int eax = 1, ebx, ecx = 0; \
> + cpuid(&eax, &ebx, &ecx, &eax, NULL); \
> + if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
> + ebx = 0; \
> + else { \
> + eax = 7, ecx = 0; \
> + cpuid(&eax, &ebx, &ecx, &eax, NULL); \
> + } \
> + (ebx & (1U << 5)) != 0; \
> +})
> +
> int get_fpu(
> void (*exception_callback)(void *, struct cpu_user_regs *),
> void *exception_callback_arg,
> @@ -111,14 +123,14 @@ int get_fpu(
> {
> case X86EMUL_FPU_fpu:
> break;
> - case X86EMUL_FPU_ymm:
> - if ( cpu_has_avx )
> + case X86EMUL_FPU_mmx:
> + if ( cpu_has_mmx )
> break;
> case X86EMUL_FPU_xmm:
> if ( cpu_has_sse )
> break;
> - case X86EMUL_FPU_mmx:
> - if ( cpu_has_mmx )
> + case X86EMUL_FPU_ymm:
> + if ( cpu_has_avx )
> break;
> default:
> return X86EMUL_UNHANDLEABLE;
> @@ -629,6 +641,73 @@ int main(int argc, char **argv)
> else
> printf("skipped\n");
>
> + printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)...");
> + if ( stack_exec && cpu_has_avx )
> + {
> + extern const unsigned char vmovdqu_to_mem[];
> +
> + asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n"
> + ".pushsection .test, \"a\", @progbits\n"
> + "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n"
> + ".popsection" :: "c" (NULL) );
> +
> + memcpy(instr, vmovdqu_to_mem, 15);
> + memset(res, 0x55, 128);
> + memset(res + 16, 0xff, 16);
> + memset(res + 20, 0x00, 16);
> + regs.eip = (unsigned long)&instr[0];
> + regs.ecx = (unsigned long)res;
> + rc = x86_emulate(&ctxt, &emulops);
> + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) )
> + goto fail;
> + printf("okay\n");
> + }
> + else
> + printf("skipped\n");
> +
> + printf("%-40s", "Testing vmovdqu (%edx),%ymm4...");
> + if ( stack_exec && cpu_has_avx )
> + {
> + extern const unsigned char vmovdqu_from_mem[];
> +
> +#if 0 /* Don't use AVX2 instructions for now */
> + asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
> +#else
> + asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
> + "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
> +#endif
> + ".pushsection .test, \"a\", @progbits\n"
> + "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n"
> + ".popsection" :: "d" (NULL) );
> +
> + memcpy(instr, vmovdqu_from_mem, 15);
> + memset(res + 4, 0xff, 16);
> + regs.eip = (unsigned long)&instr[0];
> + regs.ecx = 0;
> + regs.edx = (unsigned long)res;
> + rc = x86_emulate(&ctxt, &emulops);
> + if ( rc != X86EMUL_OKAY )
> + goto fail;
> +#if 0 /* Don't use AVX2 instructions for now */
> + asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
> + "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
> + "vpmovmskb %%ymm1, %0" : "=r" (rc) );
> +#else
> + asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
> + "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
> + "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
> + "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
> + "vpmovmskb %%xmm0, %0\n\t"
> + "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
> + rc |= i << 16;
> +#endif
> + if ( rc != 0xffffffff )
> + goto fail;
> + printf("okay\n");
> + }
> + else
> + printf("skipped\n");
> +
> printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
> memset(res, 0x77, 64);
> memset(res + 10, 0x66, 8);
> @@ -683,6 +762,59 @@ int main(int argc, char **argv)
> else
> printf("skipped\n");
>
> + printf("%-40s", "Testing vmovsd %xmm5,(%ecx)...");
> + memset(res, 0x88, 64);
> + memset(res + 10, 0x77, 8);
> + if ( stack_exec && cpu_has_avx )
> + {
> + extern const unsigned char vmovsd_to_mem[];
> +
> + asm volatile ( "vbroadcastsd %0, %%ymm5\n"
> + ".pushsection .test, \"a\", @progbits\n"
> + "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n"
> + ".popsection" :: "m" (res[10]), "c" (NULL) );
> +
> + memcpy(instr, vmovsd_to_mem, 15);
> + regs.eip = (unsigned long)&instr[0];
> + regs.ecx = (unsigned long)(res + 2);
> + regs.edx = 0;
> + rc = x86_emulate(&ctxt, &emulops);
> + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) )
> + goto fail;
> + printf("okay\n");
> + }
> + else
> + {
> + printf("skipped\n");
> + memset(res + 2, 0x77, 8);
> + }
> +
> + printf("%-40s", "Testing vmovaps (%edx),%ymm7...");
> + if ( stack_exec && cpu_has_avx )
> + {
> + extern const unsigned char vmovaps_from_mem[];
> +
> + asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n"
> + ".pushsection .test, \"a\", @progbits\n"
> + "vmovaps_from_mem: vmovaps (%0), %%ymm7\n"
> + ".popsection" :: "d" (NULL) );
> +
> + memcpy(instr, vmovaps_from_mem, 15);
> + regs.eip = (unsigned long)&instr[0];
> + regs.ecx = 0;
> + regs.edx = (unsigned long)res;
> + rc = x86_emulate(&ctxt, &emulops);
> + if ( rc != X86EMUL_OKAY )
> + goto fail;
> + asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t"
> + "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) );
> + if ( rc != 0xff )
> + goto fail;
> + printf("okay\n");
> + }
> + else
> + printf("skipped\n");
> +
> for ( j = 1; j <= 2; j++ )
> {
> #if defined(__i386__)
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1454,10 +1454,10 @@ x86_emulate(
> /* VEX */
> generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
>
> - vex.raw[0] = b;
> + vex.raw[0] = modrm;
> if ( b & 1 )
> {
> - vex.raw[1] = b;
> + vex.raw[1] = modrm;
> vex.opcx = vex_0f;
> vex.x = 1;
> vex.b = 1;
> @@ -1479,10 +1479,7 @@ x86_emulate(
> }
> }
> }
> - vex.reg ^= 0xf;
> - if ( !mode_64bit() )
> - vex.reg &= 0x7;
> - else if ( !vex.r )
> + if ( mode_64bit() && !vex.r )
> rex_prefix |= REX_R;
>
> fail_if(vex.opcx != vex_0f);
> @@ -3899,8 +3896,9 @@ x86_emulate(
> else
> {
> fail_if((vex.opcx != vex_0f) ||
> - (vex.reg && ((ea.type == OP_MEM) ||
> - !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
> + ((vex.reg != 0xf) &&
> + ((ea.type == OP_MEM) ||
> + !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
> vcpu_must_have_avx();
> get_fpu(X86EMUL_FPU_ymm, &fic);
> ea.bytes = 16 << vex.l;
> @@ -4168,7 +4166,7 @@ x86_emulate(
> }
> else
> {
> - fail_if((vex.opcx != vex_0f) || vex.reg ||
> + fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
> ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
> vcpu_must_have_avx();
> get_fpu(X86EMUL_FPU_ymm, &fic);
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
[-- Attachment #1.2: Type: text/html, Size: 10115 bytes --]
[-- Attachment #2: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH] x86: AVX instruction emulation fixes
2013-08-28 8:49 ` Fabio Fantoni
@ 2013-08-28 8:56 ` Andrew Cooper
0 siblings, 0 replies; 4+ messages in thread
From: Andrew Cooper @ 2013-08-28 8:56 UTC (permalink / raw)
To: Fabio Fantoni; +Cc: xen-devel, Keir Fraser, Jan Beulich
[-- Attachment #1.1: Type: text/plain, Size: 10006 bytes --]
On 28/08/13 09:49, Fabio Fantoni wrote:
> Il 28/08/2013 10:32, Jan Beulich ha scritto:
>> - we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
>> one as the second prefix byte
>> - early decoding normalized vex.reg, thus corrupting it for the main
>> consumer (copy_REX_VEX()), resulting in #UD on the two-operand
>> instructions we emulate
>>
>> Also add respective test cases to the testing utility plus
>> - fix get_fpu() (the fall-through order was inverted)
>> - add cpu_has_avx2, even if it's currently unused (as in the new test
>> cases I decided to refrain from using AVX2 instructions in order to
>> be able to actually run all the tests on the hardware I have)
>> - slightly tweak cpu_has_avx to more consistently express the outputs
>> we don't care about (sinking them all into the same variable)
>
> This patch include the solution for full SSE support needed to solve
> this problem?
> http://bugs.xenproject.org/xen/bug/11
>
> Thanks for any reply.
No it doesn't. Bug #11 is a problem with the communication between Xen
and Qemu.
This fixes some bugs with Xens emulation of instructions, irrespective
of Qemu.
~Andrew
>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/tools/tests/x86_emulator/test_x86_emulator.c
>> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
>> @@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x
>> }
>>
>> #define cpu_has_avx ({ \
>> - unsigned int eax = 1, ecx = 0, edx; \
>> - cpuid(&eax, &edx, &ecx, &edx, NULL); \
>> + unsigned int eax = 1, ecx = 0; \
>> + cpuid(&eax, &eax, &ecx, &eax, NULL); \
>> if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
>> ecx = 0; \
>> (ecx & (1U << 28)) != 0; \
>> })
>>
>> +#define cpu_has_avx2 ({ \
>> + unsigned int eax = 1, ebx, ecx = 0; \
>> + cpuid(&eax, &ebx, &ecx, &eax, NULL); \
>> + if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
>> + ebx = 0; \
>> + else { \
>> + eax = 7, ecx = 0; \
>> + cpuid(&eax, &ebx, &ecx, &eax, NULL); \
>> + } \
>> + (ebx & (1U << 5)) != 0; \
>> +})
>> +
>> int get_fpu(
>> void (*exception_callback)(void *, struct cpu_user_regs *),
>> void *exception_callback_arg,
>> @@ -111,14 +123,14 @@ int get_fpu(
>> {
>> case X86EMUL_FPU_fpu:
>> break;
>> - case X86EMUL_FPU_ymm:
>> - if ( cpu_has_avx )
>> + case X86EMUL_FPU_mmx:
>> + if ( cpu_has_mmx )
>> break;
>> case X86EMUL_FPU_xmm:
>> if ( cpu_has_sse )
>> break;
>> - case X86EMUL_FPU_mmx:
>> - if ( cpu_has_mmx )
>> + case X86EMUL_FPU_ymm:
>> + if ( cpu_has_avx )
>> break;
>> default:
>> return X86EMUL_UNHANDLEABLE;
>> @@ -629,6 +641,73 @@ int main(int argc, char **argv)
>> else
>> printf("skipped\n");
>>
>> + printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)...");
>> + if ( stack_exec && cpu_has_avx )
>> + {
>> + extern const unsigned char vmovdqu_to_mem[];
>> +
>> + asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n"
>> + ".pushsection .test, \"a\", @progbits\n"
>> + "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n"
>> + ".popsection" :: "c" (NULL) );
>> +
>> + memcpy(instr, vmovdqu_to_mem, 15);
>> + memset(res, 0x55, 128);
>> + memset(res + 16, 0xff, 16);
>> + memset(res + 20, 0x00, 16);
>> + regs.eip = (unsigned long)&instr[0];
>> + regs.ecx = (unsigned long)res;
>> + rc = x86_emulate(&ctxt, &emulops);
>> + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) )
>> + goto fail;
>> + printf("okay\n");
>> + }
>> + else
>> + printf("skipped\n");
>> +
>> + printf("%-40s", "Testing vmovdqu (%edx),%ymm4...");
>> + if ( stack_exec && cpu_has_avx )
>> + {
>> + extern const unsigned char vmovdqu_from_mem[];
>> +
>> +#if 0 /* Don't use AVX2 instructions for now */
>> + asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
>> +#else
>> + asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
>> + "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
>> +#endif
>> + ".pushsection .test, \"a\", @progbits\n"
>> + "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n"
>> + ".popsection" :: "d" (NULL) );
>> +
>> + memcpy(instr, vmovdqu_from_mem, 15);
>> + memset(res + 4, 0xff, 16);
>> + regs.eip = (unsigned long)&instr[0];
>> + regs.ecx = 0;
>> + regs.edx = (unsigned long)res;
>> + rc = x86_emulate(&ctxt, &emulops);
>> + if ( rc != X86EMUL_OKAY )
>> + goto fail;
>> +#if 0 /* Don't use AVX2 instructions for now */
>> + asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
>> + "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
>> + "vpmovmskb %%ymm1, %0" : "=r" (rc) );
>> +#else
>> + asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
>> + "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
>> + "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
>> + "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
>> + "vpmovmskb %%xmm0, %0\n\t"
>> + "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
>> + rc |= i << 16;
>> +#endif
>> + if ( rc != 0xffffffff )
>> + goto fail;
>> + printf("okay\n");
>> + }
>> + else
>> + printf("skipped\n");
>> +
>> printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
>> memset(res, 0x77, 64);
>> memset(res + 10, 0x66, 8);
>> @@ -683,6 +762,59 @@ int main(int argc, char **argv)
>> else
>> printf("skipped\n");
>>
>> + printf("%-40s", "Testing vmovsd %xmm5,(%ecx)...");
>> + memset(res, 0x88, 64);
>> + memset(res + 10, 0x77, 8);
>> + if ( stack_exec && cpu_has_avx )
>> + {
>> + extern const unsigned char vmovsd_to_mem[];
>> +
>> + asm volatile ( "vbroadcastsd %0, %%ymm5\n"
>> + ".pushsection .test, \"a\", @progbits\n"
>> + "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n"
>> + ".popsection" :: "m" (res[10]), "c" (NULL) );
>> +
>> + memcpy(instr, vmovsd_to_mem, 15);
>> + regs.eip = (unsigned long)&instr[0];
>> + regs.ecx = (unsigned long)(res + 2);
>> + regs.edx = 0;
>> + rc = x86_emulate(&ctxt, &emulops);
>> + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) )
>> + goto fail;
>> + printf("okay\n");
>> + }
>> + else
>> + {
>> + printf("skipped\n");
>> + memset(res + 2, 0x77, 8);
>> + }
>> +
>> + printf("%-40s", "Testing vmovaps (%edx),%ymm7...");
>> + if ( stack_exec && cpu_has_avx )
>> + {
>> + extern const unsigned char vmovaps_from_mem[];
>> +
>> + asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n"
>> + ".pushsection .test, \"a\", @progbits\n"
>> + "vmovaps_from_mem: vmovaps (%0), %%ymm7\n"
>> + ".popsection" :: "d" (NULL) );
>> +
>> + memcpy(instr, vmovaps_from_mem, 15);
>> + regs.eip = (unsigned long)&instr[0];
>> + regs.ecx = 0;
>> + regs.edx = (unsigned long)res;
>> + rc = x86_emulate(&ctxt, &emulops);
>> + if ( rc != X86EMUL_OKAY )
>> + goto fail;
>> + asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t"
>> + "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) );
>> + if ( rc != 0xff )
>> + goto fail;
>> + printf("okay\n");
>> + }
>> + else
>> + printf("skipped\n");
>> +
>> for ( j = 1; j <= 2; j++ )
>> {
>> #if defined(__i386__)
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -1454,10 +1454,10 @@ x86_emulate(
>> /* VEX */
>> generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
>>
>> - vex.raw[0] = b;
>> + vex.raw[0] = modrm;
>> if ( b & 1 )
>> {
>> - vex.raw[1] = b;
>> + vex.raw[1] = modrm;
>> vex.opcx = vex_0f;
>> vex.x = 1;
>> vex.b = 1;
>> @@ -1479,10 +1479,7 @@ x86_emulate(
>> }
>> }
>> }
>> - vex.reg ^= 0xf;
>> - if ( !mode_64bit() )
>> - vex.reg &= 0x7;
>> - else if ( !vex.r )
>> + if ( mode_64bit() && !vex.r )
>> rex_prefix |= REX_R;
>>
>> fail_if(vex.opcx != vex_0f);
>> @@ -3899,8 +3896,9 @@ x86_emulate(
>> else
>> {
>> fail_if((vex.opcx != vex_0f) ||
>> - (vex.reg && ((ea.type == OP_MEM) ||
>> - !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
>> + ((vex.reg != 0xf) &&
>> + ((ea.type == OP_MEM) ||
>> + !(vex.pfx & VEX_PREFIX_SCALAR_MASK))));
>> vcpu_must_have_avx();
>> get_fpu(X86EMUL_FPU_ymm, &fic);
>> ea.bytes = 16 << vex.l;
>> @@ -4168,7 +4166,7 @@ x86_emulate(
>> }
>> else
>> {
>> - fail_if((vex.opcx != vex_0f) || vex.reg ||
>> + fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) ||
>> ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>> vcpu_must_have_avx();
>> get_fpu(X86EMUL_FPU_ymm, &fic);
>>
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
[-- Attachment #1.2: Type: text/html, Size: 11247 bytes --]
[-- Attachment #2: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] x86: AVX instruction emulation fixes
2013-08-28 8:32 [PATCH] x86: AVX instruction emulation fixes Jan Beulich
2013-08-28 8:49 ` Fabio Fantoni
@ 2013-08-28 12:49 ` Keir Fraser
1 sibling, 0 replies; 4+ messages in thread
From: Keir Fraser @ 2013-08-28 12:49 UTC (permalink / raw)
To: Jan Beulich, xen-devel
On 28/08/2013 09:32, "Jan Beulich" <JBeulich@suse.com> wrote:
> - we used the C4/C5 (first prefix) byte instead of the apparent ModR/M
> one as the second prefix byte
> - early decoding normalized vex.reg, thus corrupting it for the main
> consumer (copy_REX_VEX()), resulting in #UD on the two-operand
> instructions we emulate
>
> Also add respective test cases to the testing utility plus
> - fix get_fpu() (the fall-through order was inverted)
> - add cpu_has_avx2, even if it's currently unused (as in the new test
> cases I decided to refrain from using AVX2 instructions in order to
> be able to actually run all the tests on the hardware I have)
> - slightly tweak cpu_has_avx to more consistently express the outputs
> we don't care about (sinking them all into the same variable)
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Keir Fraser <keir@xen.org>
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2013-08-28 12:50 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-08-28 8:32 [PATCH] x86: AVX instruction emulation fixes Jan Beulich
2013-08-28 8:49 ` Fabio Fantoni
2013-08-28 8:56 ` Andrew Cooper
2013-08-28 12:49 ` Keir Fraser
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.