* [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers
@ 2022-04-13 16:09 Paolo Bonzini
2022-04-13 16:25 ` Peter Maydell
2022-04-13 18:44 ` Alex Bennée
0 siblings, 2 replies; 4+ messages in thread
From: Paolo Bonzini @ 2022-04-13 16:09 UTC (permalink / raw)
To: qemu-devel; +Cc: peter.maydell, alex.bennee
The i386 target consolidates all vector registers so that instead of
XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
fit all of SSE, AVX and AVX512.
When TCG copies data to and from the SSE registers, it uses the
full 64-byte width. This is not a correctness issue because TCG
never lets guest code see beyond the first 128 bits of the ZMM
registers; however, it causes uninitialized stack memory to
leak into the CPU's migration stream.
Fix it by only copying the low 16 bytes of the ZMMReg union into
the destination register.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/ops_sse.h | 75 +++++++++++++++++++++++++++----------------
1 file changed, 47 insertions(+), 28 deletions(-)
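
(A minimal standalone sketch of the bug, under a deliberately
simplified register layout; the real ZMMReg union lives in
target/i386/cpu.h and "helper_sketch" is a hypothetical stand-in
for the affected helpers. The helpers build their result in a
64-byte stack temporary but only ever write its low 16 bytes, so a
full struct assignment copies 48 uninitialized bytes into the
guest-visible register file.)

    #include <stdint.h>
    #include <string.h>

    typedef union {
        uint8_t  _b[64];    /* full 64-byte AVX512 width */
        uint64_t _q[8];
    } ZMMReg;

    #define SIZE 16         /* an SSE helper only produces 16 bytes */

    void helper_sketch(ZMMReg *d)
    {
        ZMMReg r;           /* stack temporary: bytes SIZE..63 are
                               never written and stay uninitialized */
        memset(&r._b[0], 0xaa, SIZE);   /* stands in for the real result */

        /* before the fix: copies all 64 bytes, dragging uninitialized
           stack data into the register file and, from there, into the
           migration stream */
        *d = r;

        /* after the fix: copy only the low bytes, leaving d's top
           bytes untouched (little-endian layout assumed here) */
        memcpy(&d->_b[0], &r._b[0], SIZE);
    }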
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 6f1fc174b3..b415809396 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -22,6 +22,7 @@
#if SHIFT == 0
#define Reg MMXReg
+#define SIZE 8
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
@@ -30,6 +31,7 @@
#define SUFFIX _mmx
#else
#define Reg ZMMReg
+#define SIZE 16
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
@@ -38,6 +40,22 @@
#define SUFFIX _xmm
#endif
+/*
+ * Copy the relevant parts of a Reg value around. In the case where
+ * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
+ * a 64-byte ZMMReg, so we must copy only those and keep the top bytes
+ * untouched in the guest-visible destination register.
+ * Note that the "lower bytes" are placed last in memory on big-endian
+ * hosts, which store the vector backwards in memory. In that case the
+ * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
+ * the little-endian case.
+ */
+#ifdef HOST_WORDS_BIGENDIAN
+#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(d).B(SIZE - 1), SIZE)
+#else
+#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
+#endif
+
void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
int shift;
@@ -516,7 +534,7 @@ void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
r.W(1) = s->W((order >> 2) & 3);
r.W(2) = s->W((order >> 4) & 3);
r.W(3) = s->W((order >> 6) & 3);
- *d = r;
+ MOVE(*d, r);
}
#else
void helper_shufps(Reg *d, Reg *s, int order)
@@ -527,7 +545,7 @@ void helper_shufps(Reg *d, Reg *s, int order)
r.L(1) = d->L((order >> 2) & 3);
r.L(2) = s->L((order >> 4) & 3);
r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ MOVE(*d, r);
}
void helper_shufpd(Reg *d, Reg *s, int order)
@@ -536,7 +554,7 @@ void helper_shufpd(Reg *d, Reg *s, int order)
r.Q(0) = d->Q(order & 1);
r.Q(1) = s->Q((order >> 1) & 1);
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
@@ -547,7 +565,7 @@ void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
r.L(1) = s->L((order >> 2) & 3);
r.L(2) = s->L((order >> 4) & 3);
r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
@@ -559,7 +577,7 @@ void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
r.W(2) = s->W((order >> 4) & 3);
r.W(3) = s->W((order >> 6) & 3);
r.Q(1) = s->Q(1);
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
@@ -571,7 +589,7 @@ void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
r.W(5) = s->W(4 + ((order >> 2) & 3));
r.W(6) = s->W(4 + ((order >> 4) & 3));
r.W(7) = s->W(4 + ((order >> 6) & 3));
- *d = r;
+ MOVE(*d, r);
}
#endif
@@ -937,7 +955,7 @@ void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -946,7 +964,7 @@ void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -957,7 +975,7 @@ void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -966,7 +984,7 @@ void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
@@ -1153,7 +1171,7 @@ void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.B(14) = satsb((int16_t)s->W(6));
r.B(15) = satsb((int16_t)s->W(7));
#endif
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1180,7 +1198,7 @@ void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.B(14) = satub((int16_t)s->W(6));
r.B(15) = satub((int16_t)s->W(7));
#endif
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1199,7 +1217,7 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.W(6) = satsw(s->L(2));
r.W(7) = satsw(s->L(3));
#endif
- *d = r;
+ MOVE(*d, r);
}
#define UNPCK_OP(base_name, base) \
@@ -1227,7 +1245,7 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
) \
- *d = r; \
+ MOVE(*d, r); \
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
@@ -1245,7 +1263,7 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
) \
- *d = r; \
+ MOVE(*d, r); \
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
@@ -1259,7 +1277,7 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.L(2) = d->L((base << SHIFT) + 1); \
r.L(3) = s->L((base << SHIFT) + 1); \
) \
- *d = r; \
+ MOVE(*d, r); \
} \
\
XMM_ONLY( \
@@ -1272,7 +1290,7 @@ void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
\
r.Q(0) = d->Q(base); \
r.Q(1) = s->Q(base); \
- *d = r; \
+ MOVE(*d, r); \
} \
)
@@ -1313,7 +1331,7 @@ void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1378,7 +1396,7 @@ void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1387,7 +1405,7 @@ void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
- *d = r;
+ MOVE(*d, r);
}
void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1424,7 +1442,7 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
r.MMX_L(0) = s->MMX_L(1);
r.MMX_L(1) = s->MMX_L(0);
- *d = r;
+ MOVE(*d, r);
}
#endif
@@ -1438,7 +1456,7 @@ void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
}
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1455,7 +1473,7 @@ void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1467,7 +1485,7 @@ void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1483,7 +1501,7 @@ void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
- *d = r;
+ MOVE(*d, r);
}
void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
@@ -1585,7 +1603,7 @@ void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#undef SHR
}
- *d = r;
+ MOVE(*d, r);
}
#define XMM0 (env->xmm_regs[0])
@@ -1718,7 +1736,7 @@ void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.W(5) = satuw((int32_t) s->L(1));
r.W(6) = satuw((int32_t) s->L(2));
r.W(7) = satuw((int32_t) s->L(3));
- *d = r;
+ MOVE(*d, r);
}
#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
@@ -1984,7 +2002,7 @@ void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
}
- *d = r;
+ MOVE(*d, r);
}
/* SSE4.2 op helpers */
@@ -2324,3 +2342,4 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
#undef L
#undef Q
#undef SUFFIX
+#undef SIZE
--
2.35.1
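
A note on the big-endian branch of MOVE above: it relies on QEMU
storing the vector bytes reversed in memory on big-endian hosts, so
the low-order bytes B(0)..B(SIZE - 1) sit at the end of the array
and the copy must start at &B(SIZE - 1). A hedged sketch of that
index mapping (the real accessor macros are in target/i386/cpu.h):

    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint8_t _b[64]; } ZMMReg;

    #ifdef HOST_WORDS_BIGENDIAN
    #define ZMM_B(n) _b[63 - (n)]   /* B(0) is the last byte in memory */
    #else
    #define ZMM_B(n) _b[n]          /* B(0) is the first byte in memory */
    #endif

    int main(void)
    {
        ZMMReg r;
    #ifdef HOST_WORDS_BIGENDIAN
        /* B(0)..B(15) occupy offsets 48..63, so a 16-byte copy of the
           low bytes must start at &r.ZMM_B(15), not &r.ZMM_B(0) */
        assert(&r.ZMM_B(15) == &r._b[48]);
    #else
        assert(&r.ZMM_B(0) == &r._b[0]);
    #endif
        return 0;
    }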
* Re: [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers
2022-04-13 16:09 [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers Paolo Bonzini
@ 2022-04-13 16:25 ` Peter Maydell
2022-04-13 18:44 ` Alex Bennée
1 sibling, 0 replies; 4+ messages in thread
From: Peter Maydell @ 2022-04-13 16:25 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: alex.bennee, qemu-devel
On Wed, 13 Apr 2022 at 17:09, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> The i386 target consolidates all vector registers so that instead of
> XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
> fit all of SSE, AVX and AVX512.
>
> When TCG copies data to and from the SSE registers, it uses the
> full 64-byte width. This is not a correctness issue because TCG
> never lets guest code see beyond the first 128 bits of the ZMM
> registers; however, it causes uninitialized stack memory to
> leak into the CPU's migration stream.
>
> Fix it by only copying the low 16 bytes of the ZMMReg union into
> the destination register.
>
> +/*
> + * Copy the relevant parts of a Reg value around. In the case where
> + * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
> + * a 64-byte ZMMReg, so we must copy only those and keep the top bytes
> + * untouched in the guest-visible destination register.
> + * Note that the "lower bytes" are placed last in memory on big-endian
> + * hosts, which store the vector backwards in memory. In that case the
> + * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
> + * the little-endian case.
> + */
> +#ifdef HOST_WORDS_BIGENDIAN
> +#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(d).B(SIZE - 1), SIZE)
Still has the typo where it's copying d to d, not r to d.
> +#else
> +#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
> +#endif
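With the source operand corrected, the big-endian branch would
presumably read:

    #define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE)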
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
* Re: [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers
2022-04-13 16:09 [PATCH] target/i386: do not access beyond the low 128 bits of SSE registers Paolo Bonzini
2022-04-13 16:25 ` Peter Maydell
@ 2022-04-13 18:44 ` Alex Bennée
2022-04-13 20:11 ` Paolo Bonzini
1 sibling, 1 reply; 4+ messages in thread
From: Alex Bennée @ 2022-04-13 18:44 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: peter.maydell, qemu-devel
Paolo Bonzini <pbonzini@redhat.com> writes:
> The i386 target consolidates all vector registers so that instead of
> XMMReg, YMMReg and ZMMReg structs there is a single ZMMReg that can
> fit all of SSE, AVX and AVX512.
>
> When TCG copies data to and from the SSE registers, it uses the
> full 64-byte width. This is not a correctness issue because TCG
> never lets guest code see beyond the first 128 bits of the ZMM
> registers; however, it causes uninitialized stack memory to
> leak into the CPU's migration stream.
>
> Fix it by only copying the low 16 bytes of the ZMMReg union into
> the destination register.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
And:
Fixes: b7711471f5 ("target-i386: make xmm_regs 512-bit wide")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/420
?
--
Alex Bennée