* [Qemu-devel] [5411] Implement SSE4.1, SSE4.2 (x86). @ 2008-10-04 3:27 Andrzej Zaborowski [not found] ` <0C221828-78E4-49DC-AB8E-58B5BBD3F294@hotmail.com> 0 siblings, 1 reply; 3+ messages in thread From: Andrzej Zaborowski @ 2008-10-04 3:27 UTC (permalink / raw) To: qemu-devel Revision: 5411 http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5411 Author: balrog Date: 2008-10-04 03:27:44 +0000 (Sat, 04 Oct 2008) Log Message: ----------- Implement SSE4.1, SSE4.2 (x86). This adds support for CPUID_EXT_SSE41, CPUID_EXT_SSE42, CPUID_EXT_POPCNT extensions. Most instructions haven't been tested yet. Modified Paths: -------------- trunk/target-i386/ops_sse.h trunk/target-i386/ops_sse_header.h trunk/target-i386/translate.c trunk/tests/test-i386-ssse3.c Modified: trunk/target-i386/ops_sse.h =================================================================== --- trunk/target-i386/ops_sse.h 2008-10-03 23:09:08 UTC (rev 5410) +++ trunk/target-i386/ops_sse.h 2008-10-04 03:27:44 UTC (rev 5411) @@ -1,7 +1,8 @@ /* - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support * * Copyright (c) 2005 Fabrice Bellard + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -1420,6 +1421,621 @@ *d = r; } +#define XMM0 env->xmm_regs[0] + +#if SHIFT == 1 +#define SSE_HELPER_V(name, elem, num, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s)\ +{\ + d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\ + d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\ + if (num > 2) {\ + d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\ + d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\ + if (num > 4) {\ + d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\ + d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\ + d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\ + d->elem(7) = F(d->elem(7), 
s->elem(7), XMM0.elem(7));\ + if (num > 8) {\ + d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8));\ + d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9));\ + d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10));\ + d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11));\ + d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12));\ + d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13));\ + d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14));\ + d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15));\ + }\ + }\ + }\ +} + +#define SSE_HELPER_I(name, elem, num, F)\ +void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\ +{\ + d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\ + d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\ + if (num > 2) {\ + d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\ + d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\ + if (num > 4) {\ + d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1));\ + d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1));\ + d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1));\ + d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1));\ + if (num > 8) {\ + d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1));\ + d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1));\ + d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> 10) & 1));\ + d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> 11) & 1));\ + d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> 12) & 1));\ + d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> 13) & 1));\ + d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> 14) & 1));\ + d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> 15) & 1));\ + }\ + }\ + }\ +} + +/* SSE4.1 op helpers */ +#define FBLENDVB(d, s, m) (m & 0x80) ? s : d +#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d +#define FBLENDVPD(d, s, m) (m & 0x8000000000000000) ? 
s : d
+SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
+SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
+SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
+
+/* PTEST: CC_Z is set when (s & d) is all zero, CC_C when (s & ~d) is all
+ * zero.  The flags are left in CC_SRC; d is not modified. */
+void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
+{
+    uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1));
+    uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
+
+    CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
+}
+
+/* Generator for the widening moves (pmovsx / pmovzx).
+ *   name - helper name (SUFFIX is appended)
+ *   elem - destination element accessor (W, L or Q)
+ *   num  - number of destination elements: 8, 4 or 2
+ *   F    - source accessor, optionally preceded by a sign-extending cast;
+ *          F(i) yields the i-th narrow source element
+ * Exactly num destination elements are written, so no store goes past the
+ * end of the register.  Elements are stored from the highest index down:
+ * d and s may be the same register, and the wide store to element i only
+ * touches bytes that the remaining (lower-indexed) narrow reads do not use. */
+#define SSE_HELPER_F(name, elem, num, F)\
+void glue(name, SUFFIX) (Reg *d, Reg *s)\
+{\
+    if (num > 2) {\
+        if (num > 4) {\
+            d->elem(7) = F(7);\
+            d->elem(6) = F(6);\
+            d->elem(5) = F(5);\
+            d->elem(4) = F(4);\
+        }\
+        d->elem(3) = F(3);\
+        d->elem(2) = F(2);\
+    }\
+    d->elem(1) = F(1);\
+    d->elem(0) = F(0);\
+}
+
+SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
+SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
+SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
+SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
+SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
+SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
+SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
+SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
+SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
+SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
+SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
+SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
+
+/* PMULDQ: signed 32x32->64 multiply of dwords 0 and 2 of d and s. */
+void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
+{
+    d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
+    d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
+}
+
+#define FCMPEQQ(d, s) d == s ? 
-1 : 0 +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) + +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) +{ + d->W(0) = satuw((int32_t) d->L(0)); + d->W(1) = satuw((int32_t) d->L(1)); + d->W(2) = satuw((int32_t) d->L(2)); + d->W(3) = satuw((int32_t) d->L(3)); + d->W(4) = satuw((int32_t) s->L(0)); + d->W(5) = satuw((int32_t) s->L(1)); + d->W(6) = satuw((int32_t) s->L(2)); + d->W(7) = satuw((int32_t) s->L(3)); +} + +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) +SSE_HELPER_B(helper_pminsb, FMINSB) +SSE_HELPER_L(helper_pminsd, FMINSD) +SSE_HELPER_W(helper_pminuw, MIN) +SSE_HELPER_L(helper_pminud, MIN) +SSE_HELPER_B(helper_pmaxsb, FMAXSB) +SSE_HELPER_L(helper_pmaxsd, FMAXSD) +SSE_HELPER_W(helper_pmaxuw, MAX) +SSE_HELPER_L(helper_pmaxud, MAX) + +#define FMULLD(d, s) (int32_t) d * (int32_t) s +SSE_HELPER_L(helper_pmulld, FMULLD) + +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) +{ + int idx = 0; + + if (s->W(1) < s->W(idx)) + idx = 1; + if (s->W(2) < s->W(idx)) + idx = 2; + if (s->W(3) < s->W(idx)) + idx = 3; + if (s->W(4) < s->W(idx)) + idx = 4; + if (s->W(5) < s->W(idx)) + idx = 5; + if (s->W(6) < s->W(idx)) + idx = 6; + if (s->W(7) < s->W(idx)) + idx = 7; + + d->Q(1) = 0; + d->L(1) = 0; + d->W(1) = idx; + d->W(0) = s->W(idx); +} + +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->L(0) = 
float64_round_to_int(s->L(0), &env->sse_status); + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->L(0) = float64_round_to_int(s->L(0), 
&env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) +{ + signed char prev_rounding_mode; + + prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) + switch (mode & 3) { + case 0: + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); + break; + case 1: + set_float_rounding_mode(float_round_down, &env->sse_status); + break; + case 2: + set_float_rounding_mode(float_round_up, &env->sse_status); + break; + case 3: + set_float_rounding_mode(float_round_to_zero, &env->sse_status); + break; + } + + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); + +#if 0 /* TODO */ + if (mode & (1 << 3)) + set_float_exception_flags( + get_float_exception_flags(&env->sse_status) & + ~float_flag_inexact, + &env->sse_status); +#endif + env->sse_status.float_rounding_mode = prev_rounding_mode; +} + +#define FBLENDP(d, s, m) m ? s : d +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) + +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) +{ + float32 iresult = 0 /*float32_zero*/; + + if (mask & (1 << 4)) + iresult = float32_add(iresult, + float32_mul(d->L(0), s->L(0), &env->sse_status), + &env->sse_status); + if (mask & (1 << 5)) + iresult = float32_add(iresult, + float32_mul(d->L(1), s->L(1), &env->sse_status), + &env->sse_status); + if (mask & (1 << 6)) + iresult = float32_add(iresult, + float32_mul(d->L(2), s->L(2), &env->sse_status), + &env->sse_status); + if (mask & (1 << 7)) + iresult = float32_add(iresult, + float32_mul(d->L(3), s->L(3), &env->sse_status), + &env->sse_status); + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; + d->L(1) = (mask & (1 << 1)) ? 
iresult : 0 /*float32_zero*/; + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; +} + +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) +{ + float64 iresult = 0 /*float64_zero*/; + + if (mask & (1 << 4)) + iresult = float64_add(iresult, + float64_mul(d->Q(0), s->Q(0), &env->sse_status), + &env->sse_status); + if (mask & (1 << 5)) + iresult = float64_add(iresult, + float64_mul(d->Q(1), s->Q(1), &env->sse_status), + &env->sse_status); + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; + d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/; +} + +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) +{ + int s0 = (offset & 3) << 2; + int d0 = (offset & 4) << 0; + int i; + Reg r; + + for (i = 0; i < 8; i++, d0++) { + r.W(i) = 0; + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); + } + + *d = r; +} + +/* SSE4.2 op helpers */ +/* it's unclear whether signed or unsigned */ +#define FCMPGTQ(d, s) d > s ? 
-1 : 0 +SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) + +static inline int pcmp_elen(int reg, uint32_t ctrl) +{ + int val; + + /* Presence of REX.W is indicated by a bit higher than 7 set */ + if (ctrl >> 8) + val = abs1((int64_t) env->regs[reg]); + else + val = abs1((int32_t) env->regs[reg]); + + if (ctrl & 1) { + if (val > 8) + return 8; + } else + if (val > 16) + return 16; + + return val; +} + +static inline int pcmp_ilen(Reg *r, uint8_t ctrl) +{ + int val = 0; + + if (ctrl & 1) { + while (val < 8 && r->W(val)) + val++; + } else + while (val < 16 && r->B(val)) + val++; + + return val; +} + +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) +{ + switch ((ctrl >> 0) & 3) { + case 0: + return r->B(i); + case 1: + return r->W(i); + case 2: + return (int8_t) r->B(i); + case 3: + default: + return (int16_t) r->W(i); + } +} + +static inline unsigned pcmpxstrx(Reg *d, Reg *s, + int8_t ctrl, int valids, int validd) +{ + unsigned int res = 0; + int v; + int j, i; + int upper = (ctrl & 1) ? 7 : 15; + + valids--; + validd--; + + CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? 
CC_S : 0); + + switch ((ctrl >> 2) & 3) { + case 0: + for (j = valids; j >= 0; j--) { + res <<= 1; + v = pcmp_val(s, ctrl, j); + for (i = validd; i >= 0; i--) + res |= (v == pcmp_val(d, ctrl, i)); + } + break; + case 1: + for (j = valids; j >= 0; j--) { + res <<= 1; + v = pcmp_val(s, ctrl, j); + for (i = ((validd - 1) | 1); i >= 0; i -= 2) + res |= (pcmp_val(d, ctrl, i - 0) <= v && + pcmp_val(d, ctrl, i - 1) >= v); + } + break; + case 2: + res = (2 << (upper - MAX(valids, validd))) - 1; + res <<= MAX(valids, validd) - MIN(valids, validd); + for (i = MIN(valids, validd); i >= 0; i--) { + res <<= 1; + v = pcmp_val(s, ctrl, i); + res |= (v == pcmp_val(d, ctrl, i)); + } + break; + case 3: + for (j = valids - validd; j >= 0; j--) { + res <<= 1; + res |= 1; + for (i = MIN(upper - j, validd); i >= 0; i--) + res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); + } + break; + } + + switch ((ctrl >> 4) & 3) { + case 1: + res ^= (2 << upper) - 1; + break; + case 3: + res ^= (2 << valids) - 1; + break; + } + + if (res) + CC_SRC |= CC_C; + if (res & 1) + CC_SRC |= CC_O; + + return res; +} + +static inline int rffs1(unsigned int val) +{ + int ret = 1, hi; + + for (hi = sizeof(val) * 4; hi; hi /= 2) + if (val >> hi) { + val >>= hi; + ret += hi; + } + + return ret; +} + +static inline int ffs1(unsigned int val) +{ + int ret = 1, hi; + + for (hi = sizeof(val) * 4; hi; hi /= 2) + if (val << hi) { + val <<= hi; + ret += hi; + } + + return ret; +} + +void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_elen(R_EDX, ctrl), + pcmp_elen(R_EAX, ctrl)); + + if (res) + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? 
rffs1 : ffs1)(res) - 1; + else + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); +} + +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + int i; + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_elen(R_EDX, ctrl), + pcmp_elen(R_EAX, ctrl)); + + if ((ctrl >> 6) & 1) { + if (ctrl & 1) + for (i = 0; i <= 8; i--, res >>= 1) + d->W(i) = (res & 1) ? ~0 : 0; + else + for (i = 0; i <= 16; i--, res >>= 1) + d->B(i) = (res & 1) ? ~0 : 0; + } else { + d->Q(1) = 0; + d->Q(0) = res; + } +} + +void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_ilen(s, ctrl), + pcmp_ilen(d, ctrl)); + + if (res) + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; + else + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); +} + +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) +{ + int i; + unsigned int res = pcmpxstrx(d, s, ctrl, + pcmp_ilen(s, ctrl), + pcmp_ilen(d, ctrl)); + + if ((ctrl >> 6) & 1) { + if (ctrl & 1) + for (i = 0; i <= 8; i--, res >>= 1) + d->W(i) = (res & 1) ? ~0 : 0; + else + for (i = 0; i <= 16; i--, res >>= 1) + d->B(i) = (res & 1) ? ~0 : 0; + } else { + d->Q(1) = 0; + d->Q(0) = res; + } +} + +#define CRCPOLY 0x1edc6f41 +#define CRCPOLY_BITREV 0x82f63b78 +target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) +{ + target_ulong crc = (msg & ((target_ulong) -1 >> + (TARGET_LONG_BITS - len))) ^ crc1; + + while (len--) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); + + return crc; +} + +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)) +target_ulong helper_popcnt(target_ulong n, uint32_t type) +{ + CC_SRC = n ? 
0 : CC_Z; + + n = POPCOUNT(n, 0); + n = POPCOUNT(n, 1); + n = POPCOUNT(n, 2); + n = POPCOUNT(n, 3); + if (type == 1) + return n & 0xff; + + n = POPCOUNT(n, 4); +#ifndef TARGET_X86_64 + return n; +#else + if (type == 2) + return n & 0xff; + + return POPCOUNT(n, 5); +#endif +} +#endif + #undef SHIFT #undef XMM_ONLY #undef Reg Modified: trunk/target-i386/ops_sse_header.h =================================================================== --- trunk/target-i386/ops_sse_header.h 2008-10-03 23:09:08 UTC (rev 5410) +++ trunk/target-i386/ops_sse_header.h 2008-10-04 03:27:44 UTC (rev 5411) @@ -1,5 +1,5 @@ /* - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support * * Copyright (c) 2005 Fabrice Bellard * @@ -269,6 +269,61 @@ DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s)) DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift)) +/* SSE4.1 op helpers */ +#if SHIFT == 1 +DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmovzxdq, 
SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) +DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) +DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) +DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) +DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) +DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) +DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) +DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, uint32_t mask)) +DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, uint32_t mask)) +DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, uint32_t off)) +#endif + +/* SSE4.2 op helpers */ +#if SHIFT == 1 +DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s)) +DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) +DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) +DEF_HELPER(void, 
glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) +DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) +DEF_HELPER(target_ulong, helper_crc32, + (uint32_t crc1, target_ulong msg, uint32_t len)) +DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t type)) +#endif + #undef SHIFT #undef Reg #undef SUFFIX Modified: trunk/target-i386/translate.c =================================================================== --- trunk/target-i386/translate.c 2008-10-03 23:09:08 UTC (rev 5410) +++ trunk/target-i386/translate.c 2008-10-04 03:27:44 UTC (rev 5411) @@ -2140,7 +2140,7 @@ } } -/* generate modrm memory load or store of 'reg'. TMP0 is used if reg != +/* generate modrm memory load or store of 'reg'. TMP0 is used if reg == OR_TMP0 */ static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store) { @@ -2770,8 +2770,8 @@ [0xc2] = SSE_FOP(cmpeq), [0xc6] = { helper_shufps, helper_shufpd }, - [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */ + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */ /* MMX ops and their SSE extensions */ [0x60] = MMX_OP2(punpcklbw), @@ -2924,26 +2924,85 @@ [0xbf] = helper_pavgb_mmx /* pavgusb */ }; -static void *sse_op_table6[256][2] = { - [0x00] = MMX_OP2(pshufb), - [0x01] = MMX_OP2(phaddw), - [0x02] = MMX_OP2(phaddd), - [0x03] = MMX_OP2(phaddsw), - [0x04] = MMX_OP2(pmaddubsw), - [0x05] = MMX_OP2(phsubw), - [0x06] = MMX_OP2(phsubd), - [0x07] = MMX_OP2(phsubsw), - [0x08] = MMX_OP2(psignb), - [0x09] = MMX_OP2(psignw), - [0x0a] = MMX_OP2(psignd), - [0x0b] = MMX_OP2(pmulhrsw), - [0x1c] = MMX_OP2(pabsb), - [0x1d] = MMX_OP2(pabsw), - [0x1e] = MMX_OP2(pabsd), +struct sse_op_helper_s { + void *op[2]; uint32_t ext_mask; }; +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } +#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } 
+#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } +static struct sse_op_helper_s sse_op_table6[256] = { + [0x00] = SSSE3_OP(pshufb), + [0x01] = SSSE3_OP(phaddw), + [0x02] = SSSE3_OP(phaddd), + [0x03] = SSSE3_OP(phaddsw), + [0x04] = SSSE3_OP(pmaddubsw), + [0x05] = SSSE3_OP(phsubw), + [0x06] = SSSE3_OP(phsubd), + [0x07] = SSSE3_OP(phsubsw), + [0x08] = SSSE3_OP(psignb), + [0x09] = SSSE3_OP(psignw), + [0x0a] = SSSE3_OP(psignd), + [0x0b] = SSSE3_OP(pmulhrsw), + [0x10] = SSE41_OP(pblendvb), + [0x14] = SSE41_OP(blendvps), + [0x15] = SSE41_OP(blendvpd), + [0x17] = SSE41_OP(ptest), + [0x1c] = SSSE3_OP(pabsb), + [0x1d] = SSSE3_OP(pabsw), + [0x1e] = SSSE3_OP(pabsd), + [0x20] = SSE41_OP(pmovsxbw), + [0x21] = SSE41_OP(pmovsxbd), + [0x22] = SSE41_OP(pmovsxbq), + [0x23] = SSE41_OP(pmovsxwd), + [0x24] = SSE41_OP(pmovsxwq), + [0x25] = SSE41_OP(pmovsxdq), + [0x28] = SSE41_OP(pmuldq), + [0x29] = SSE41_OP(pcmpeqq), + [0x2a] = SSE41_SPECIAL, /* movntqda */ + [0x2b] = SSE41_OP(packusdw), + [0x30] = SSE41_OP(pmovzxbw), + [0x31] = SSE41_OP(pmovzxbd), + [0x32] = SSE41_OP(pmovzxbq), + [0x33] = SSE41_OP(pmovzxwd), + [0x34] = SSE41_OP(pmovzxwq), + [0x35] = SSE41_OP(pmovzxdq), + [0x37] = SSE42_OP(pcmpgtq), + [0x38] = SSE41_OP(pminsb), + [0x39] = SSE41_OP(pminsd), + [0x3a] = SSE41_OP(pminuw), + [0x3b] = SSE41_OP(pminud), + [0x3c] = SSE41_OP(pmaxsb), + [0x3d] = SSE41_OP(pmaxsd), + [0x3e] = SSE41_OP(pmaxuw), + [0x3f] = SSE41_OP(pmaxud), + [0x40] = SSE41_OP(pmulld), + [0x41] = SSE41_OP(phminposuw), +}; -static void *sse_op_table7[256][2] = { - [0x0f] = MMX_OP2(palignr), +static struct sse_op_helper_s sse_op_table7[256] = { + [0x08] = SSE41_OP(roundps), + [0x09] = SSE41_OP(roundpd), + [0x0a] = SSE41_OP(roundss), + [0x0b] = SSE41_OP(roundsd), + [0x0c] = SSE41_OP(blendps), + [0x0d] = SSE41_OP(blendpd), + [0x0e] = SSE41_OP(pblendw), + [0x0f] = SSSE3_OP(palignr), + [0x14] = SSE41_SPECIAL, /* pextrb */ + [0x15] = 
SSE41_SPECIAL, /* pextrw */ + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ + [0x17] = SSE41_SPECIAL, /* extractps */ + [0x20] = SSE41_SPECIAL, /* pinsrb */ + [0x21] = SSE41_SPECIAL, /* insertps */ + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ + [0x40] = SSE41_OP(dpps), + [0x41] = SSE41_OP(dppd), + [0x42] = SSE41_OP(mpsadbw), + [0x60] = SSE42_OP(pcmpestrm), + [0x61] = SSE42_OP(pcmpestri), + [0x62] = SSE42_OP(pcmpistrm), + [0x63] = SSE42_OP(pcmpistri), }; static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) @@ -3511,18 +3570,20 @@ break; case 0x038: case 0x138: - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) - goto illegal_op; - b = modrm; modrm = ldub_code(s->pc++); rm = modrm & 7; reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; - sse_op2 = sse_op_table6[b][b1]; + if (s->prefix & PREFIX_REPNZ) + goto crc32; + + sse_op2 = sse_op_table6[b].op[b1]; if (!sse_op2) goto illegal_op; + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) + goto illegal_op; if (b1) { op1_offset = offsetof(CPUX86State,xmm_regs[reg]); @@ -3531,7 +3592,32 @@ } else { op2_offset = offsetof(CPUX86State,xmm_t0); gen_lea_modrm(s, modrm, ®_addr, &offset_addr); - gen_ldo_env_A0(s->mem_index, op2_offset); + switch (b) { + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ + gen_ldq_env_A0(s->mem_index, op2_offset + + offsetof(XMMReg, XMM_Q(0))); + break; + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset + + offsetof(XMMReg, XMM_L(0))); + break; + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset + + offsetof(XMMReg, XMM_W(0))); + break; + case 0x2a: /* movntqda */ + 
gen_ldo_env_A0(s->mem_index, op1_offset); + return; + default: + gen_ldo_env_A0(s->mem_index, op2_offset); + } } } else { op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); @@ -3543,25 +3629,178 @@ gen_ldq_env_A0(s->mem_index, op2_offset); } } + if (sse_op2 == SSE_SPECIAL) + goto illegal_op; + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1); + + if (b == 0x17) + s->cc_op = CC_OP_EFLAGS; break; + case 0x338: /* crc32 */ + crc32: + b = modrm; + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + + if (b != 0xf0 && b != 0xf1) + goto illegal_op; + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) + goto illegal_op; + + if (b == 0xf0) + ot = OT_BYTE; + else if (b == 0xf1 && s->dflag != 2) + if (s->prefix & PREFIX_DATA) + ot = OT_WORD; + else + ot = OT_LONG; + else + ot = OT_QUAD; + + gen_op_mov_TN_reg(OT_LONG, 0, reg); + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32, + cpu_T[0], tcg_const_i32(8 << ot)); + + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; + gen_op_mov_reg_T0(ot, reg); + break; case 0x03a: case 0x13a: - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) - goto illegal_op; - b = modrm; modrm = ldub_code(s->pc++); rm = modrm & 7; reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; - sse_op2 = sse_op_table7[b][b1]; + sse_op2 = sse_op_table7[b].op[b1]; if (!sse_op2) goto illegal_op; + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) + goto illegal_op; + if (sse_op2 == SSE_SPECIAL) { + ot = (s->dflag == 2) ? 
OT_QUAD : OT_LONG; + rm = (modrm & 7) | REX_B(s); + if (mod != 3) + gen_lea_modrm(s, modrm, ®_addr, &offset_addr); + reg = ((modrm >> 3) & 7) | rex_r; + val = ldub_code(s->pc++); + switch (b) { + case 0x14: /* pextrb */ + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_B(val & 15))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st8(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x15: /* pextrw */ + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_W(val & 7))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st16(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x16: + if (ot == OT_LONG) { /* pextrd */ + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + if (mod == 3) + gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32); + else + tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0, + (s->mem_index >> 2) - 1); + } else { /* pextrq */ + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_Q(val & 1))); + if (mod == 3) + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64); + else + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + } + break; + case 0x17: /* extractps */ + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + if (mod == 3) + gen_op_mov_reg_T0(ot, rm); + else + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + break; + case 0x20: /* pinsrb */ + if (mod == 3) + gen_op_mov_TN_reg(OT_LONG, 0, rm); + else + tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_B(val & 15))); + break; + case 0x21: /* insertps */ + if (mod == 3) + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State,xmm_regs[rm] + .XMM_L((val >> 6) & 3))); + else + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st_i32(cpu_tmp2_i32, 
cpu_env, + offsetof(CPUX86State,xmm_regs[reg] + .XMM_L((val >> 4) & 3))); + if ((val >> 0) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(0))); + if ((val >> 1) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(1))); + if ((val >> 2) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(2))); + if ((val >> 3) & 1) + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), + cpu_env, offsetof(CPUX86State, + xmm_regs[reg].XMM_L(3))); + break; + case 0x22: + if (ot == OT_LONG) { /* pinsrd */ + if (mod == 3) + gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm); + else + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_L(val & 3))); + } else { /* pinsrq */ + if (mod == 3) + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm); + else + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + (s->mem_index >> 2) - 1); + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, + offsetof(CPUX86State, + xmm_regs[reg].XMM_Q(val & 1))); + } + break; + } + return; + } + if (b1) { op1_offset = offsetof(CPUX86State,xmm_regs[reg]); if (mod == 3) { @@ -3583,6 +3822,14 @@ } val = ldub_code(s->pc++); + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ + s->cc_op = CC_OP_EFLAGS; + + if (s->dflag == 2) + /* The helper must use entire 64-bit gp registers */ + val |= 1 << 8; + } + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); @@ -7094,7 +7341,7 @@ gen_eob(s); } break; - /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */ + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */ case 0x1c3: /* MOVNTI reg, mem */ if (!(s->cpuid_features & CPUID_SSE2)) goto illegal_op; @@ -7202,6 +7449,28 @@ tcg_gen_helper_0_0(helper_rsm); gen_eob(s); break; + case 0x1b8: /* SSE4.2 popcnt 
*/ + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) != + PREFIX_REPZ) + goto illegal_op; + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) + goto illegal_op; + + modrm = ldub_code(s->pc++); + reg = ((modrm >> 3) & 7); + + if (s->prefix & PREFIX_DATA) + ot = OT_WORD; + else if (s->dflag != 2) + ot = OT_LONG; + else + ot = OT_QUAD; + + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); + tcg_gen_helper_1_2(helper_popcnt, + cpu_T[0], cpu_T[0], tcg_const_i32(ot)); + gen_op_mov_reg_T0(ot, reg); + break; case 0x10e ... 0x10f: /* 3DNow! instructions, ignore prefixes */ s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); Modified: trunk/tests/test-i386-ssse3.c =================================================================== --- trunk/tests/test-i386-ssse3.c 2008-10-03 23:09:08 UTC (rev 5410) +++ trunk/tests/test-i386-ssse3.c 2008-10-04 03:27:44 UTC (rev 5411) @@ -1,6 +1,7 @@ /* See if various MMX/SSE SSSE3 instructions give expected results */ #include <stdio.h> #include <string.h> +#include <stdint.h> int main(int argc, char *argv[]) { char hello[16]; @@ -9,9 +10,11 @@ uint64_t a = 0x0000000000090007; uint64_t b = 0x0000000000000000; + uint32_t c; + uint16_t d; - const char c[16] = "LLOaaaaaaaaaaaaa"; - const char d[16] = "aaaaaaaaaaaaaaHE"; + const char e[16] = "LLOaaaaaaaaaaaaa"; + const char f[16] = "aaaaaaaaaaaaaaHE"; /* pshufb mm1/xmm1, mm2/xmm2 */ asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); @@ -33,10 +36,22 @@ printf("%i - %i = %i\n", 9, 7, -(int16_t) a); /* palignr mm1/xmm1, m64/m128, imm8 */ - asm volatile ("movdqa (%0), %%xmm0" : : "r" (c) : "xmm0"); - asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d)); + asm volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0"); + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f)); asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); printf("%5.5s\n", hello); +#if 1 /* SSE4 */ + /* popcnt r64, r/m64 */ + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax"); + asm volatile 
("popcnt %%ax, %%dx" : : : "dx"); + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx"); + asm volatile ("popcnt %rax, %rax"); + asm volatile ("movq %%rax, %0" : "=m" (a)); + asm volatile ("movl %%ecx, %0" : "=m" (c)); + asm volatile ("movw %%dx, %0" : "=m" (d)); + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1); +#endif + return 0; } ^ permalink raw reply [flat|nested] 3+ messages in thread
[parent not found: <0C221828-78E4-49DC-AB8E-58B5BBD3F294@hotmail.com>]
* Re: [Qemu-devel] [5411] Implement SSE4.1, SSE4.2 (x86). [not found] ` <0C221828-78E4-49DC-AB8E-58B5BBD3F294@hotmail.com> @ 2008-10-04 3:56 ` C.W. Betts 2008-10-04 11:38 ` andrzej zaborowski 0 siblings, 1 reply; 3+ messages in thread From: C.W. Betts @ 2008-10-04 3:56 UTC (permalink / raw) To: qemu-devel [-- Attachment #1: Type: text/plain, Size: 50812 bytes --] I get a lot of warnings complaining that a number might not fit in long when I compile it on 32-bit processors: gcc -I. -I.. -I/Users/cwbetts/makestuff/qemu-allmac/src/target-i386 -I/Users/cwbetts/makestuff/qemu-allmac/src -MMD -MT op_helper.o -MP -DNEED_CPU_H -D__powerpc__ -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -I/Users/cwbetts/makestuff/qemu-allmac/src/tcg -I/Users/cwbetts/makestuff/qemu-allmac/src/tcg/ppc -I/Users/cwbetts/makestuff/qemu-allmac/src/fpu -DHAS_AUDIO -DHAS_AUDIO_CHOICE -I/Users/cwbetts/makestuff/qemu-allmac/src/slirp -Wall -O2 -g -fno-strict-aliasing -mdynamic-no-pic -m32 -arch ppc -mmacosx-version-min=10.3 -isysroot /Developer/SDKs/MacOSX10.3.9.sdk -mcpu=G3 -DMAC_OS_X_VERSION_MIN_REQUIRED=1030 -mtune=G4 -c -o op_helper.o /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/op_helper.c In file included from /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/op_helper.c:5443: /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h: In function ‘helper_blendvpd_xmm’: /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too
large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: warning: integer constant is too large for ‘long’ type It doesn't show up when I compile for 64-bit mode On Oct 3, 2008, at 9:27 PM, Andrzej Zaborowski wrote: > Revision: 5411 > http://svn.sv.gnu.org/viewvc/? > view=rev&root=qemu&revision=5411 > Author: balrog > Date: 2008-10-04 03:27:44 +0000 (Sat, 04 Oct 2008) > > Log Message: > ----------- > Implement SSE4.1, SSE4.2 (x86). > > This adds support for CPUID_EXT_SSE41, CPUID_EXT_SSE42, > CPUID_EXT_POPCNT > extensions. Most instructions haven't been tested yet. 
> > Modified Paths: > -------------- > trunk/target-i386/ops_sse.h > trunk/target-i386/ops_sse_header.h > trunk/target-i386/translate.c > trunk/tests/test-i386-ssse3.c > > Modified: trunk/target-i386/ops_sse.h > =================================================================== > --- trunk/target-i386/ops_sse.h 2008-10-03 23:09:08 UTC (rev 5410) > +++ trunk/target-i386/ops_sse.h 2008-10-04 03:27:44 UTC (rev 5411) > @@ -1,7 +1,8 @@ > /* > - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support > + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support > * > * Copyright (c) 2005 Fabrice Bellard > + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com > > > * > * This library is free software; you can redistribute it and/or > * modify it under the terms of the GNU Lesser General Public > @@ -1420,6 +1421,621 @@ > *d = r; > } > > +#define XMM0 env->xmm_regs[0] > + > +#if SHIFT == 1 > +#define SSE_HELPER_V(name, elem, num, F)\ > +void glue(name, SUFFIX) (Reg *d, Reg *s)\ > +{\ > + d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\ > + d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\ > + if (num > 2) {\ > + d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\ > + d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\ > + if (num > 4) {\ > + d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\ > + d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\ > + d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\ > + d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));\ > + if (num > 8) {\ > + d->elem(8) = F(d->elem(8), s->elem(8), > XMM0.elem(8));\ > + d->elem(9) = F(d->elem(9), s->elem(9), > XMM0.elem(9));\ > + d->elem(10) = F(d->elem(10), s->elem(10), > XMM0.elem(10));\ > + d->elem(11) = F(d->elem(11), s->elem(11), > XMM0.elem(11));\ > + d->elem(12) = F(d->elem(12), s->elem(12), > XMM0.elem(12));\ > + d->elem(13) = F(d->elem(13), s->elem(13), > XMM0.elem(13));\ > + d->elem(14) = F(d->elem(14), s->elem(14), > XMM0.elem(14));\ > + d->elem(15) = 
F(d->elem(15), s->elem(15), > XMM0.elem(15));\ > + }\ > + }\ > + }\ > +} > + > +#define SSE_HELPER_I(name, elem, num, F)\ > +void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\ > +{\ > + d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\ > + d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\ > + if (num > 2) {\ > + d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\ > + d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\ > + if (num > 4) {\ > + d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & > 1));\ > + d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & > 1));\ > + d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & > 1));\ > + d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & > 1));\ > + if (num > 8) {\ > + d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) > & 1));\ > + d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) > & 1));\ > + d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> > 10) & 1));\ > + d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> > 11) & 1));\ > + d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> > 12) & 1));\ > + d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> > 13) & 1));\ > + d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> > 14) & 1));\ > + d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> > 15) & 1));\ > + }\ > + }\ > + }\ > +} > + > +/* SSE4.1 op helpers */ > +#define FBLENDVB(d, s, m) (m & 0x80) ? s : d > +#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d > +#define FBLENDVPD(d, s, m) (m & 0x8000000000000000) ? s : d > +SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) > +SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) > +SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) > + > +void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s) > +{ > + uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); > + uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); > + > + CC_SRC = (zf ? 0 : CC_Z) | (cf ? 
0 : CC_C); > +} > + > +#define SSE_HELPER_F(name, elem, num, F)\ > +void glue(name, SUFFIX) (Reg *d, Reg *s)\ > +{\ > + d->elem(0) = F(0);\ > + d->elem(1) = F(1);\ > + d->elem(2) = F(2);\ > + d->elem(3) = F(3);\ > + if (num > 3) {\ > + d->elem(4) = F(4);\ > + d->elem(5) = F(5);\ > + if (num > 5) {\ > + d->elem(6) = F(6);\ > + d->elem(7) = F(7);\ > + }\ > + }\ > +} > + > +SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) > +SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) > +SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) > +SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) > +SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) > +SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) > +SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) > +SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) > +SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) > +SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) > +SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) > +SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) > + > +void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s) > +{ > + d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0); > + d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2); > +} > + > +#define FCMPEQQ(d, s) d == s ? 
-1 : 0 > +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) > + > +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) > +{ > + d->W(0) = satuw((int32_t) d->L(0)); > + d->W(1) = satuw((int32_t) d->L(1)); > + d->W(2) = satuw((int32_t) d->L(2)); > + d->W(3) = satuw((int32_t) d->L(3)); > + d->W(4) = satuw((int32_t) s->L(0)); > + d->W(5) = satuw((int32_t) s->L(1)); > + d->W(6) = satuw((int32_t) s->L(2)); > + d->W(7) = satuw((int32_t) s->L(3)); > +} > + > +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) > +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) > +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) > +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) > +SSE_HELPER_B(helper_pminsb, FMINSB) > +SSE_HELPER_L(helper_pminsd, FMINSD) > +SSE_HELPER_W(helper_pminuw, MIN) > +SSE_HELPER_L(helper_pminud, MIN) > +SSE_HELPER_B(helper_pmaxsb, FMAXSB) > +SSE_HELPER_L(helper_pmaxsd, FMAXSD) > +SSE_HELPER_W(helper_pmaxuw, MAX) > +SSE_HELPER_L(helper_pmaxud, MAX) > + > +#define FMULLD(d, s) (int32_t) d * (int32_t) s > +SSE_HELPER_L(helper_pmulld, FMULLD) > + > +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) > +{ > + int idx = 0; > + > + if (s->W(1) < s->W(idx)) > + idx = 1; > + if (s->W(2) < s->W(idx)) > + idx = 2; > + if (s->W(3) < s->W(idx)) > + idx = 3; > + if (s->W(4) < s->W(idx)) > + idx = 4; > + if (s->W(5) < s->W(idx)) > + idx = 5; > + if (s->W(6) < s->W(idx)) > + idx = 6; > + if (s->W(7) < s->W(idx)) > + idx = 7; > + > + d->Q(1) = 0; > + d->L(1) = 0; > + d->W(1) = idx; > + d->W(0) = s->W(idx); > +} > + > +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) > +{ > + signed char prev_rounding_mode; > + > + prev_rounding_mode = env->sse_status.float_rounding_mode; > + if (!(mode & (1 << 2))) > + switch (mode & 3) { > + case 0: > + set_float_rounding_mode(float_round_nearest_even, &env- > >sse_status); > + break; > + case 1: > + set_float_rounding_mode(float_round_down, &env- > >sse_status); > + break; > + case 2: > + set_float_rounding_mode(float_round_up, &env- > 
>sse_status); > + break; > + case 3: > + set_float_rounding_mode(float_round_to_zero, &env- > >sse_status); > + break; > + } > + > + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); > + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); > + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); > + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); > + > +#if 0 /* TODO */ > + if (mode & (1 << 3)) > + set_float_exception_flags( > + get_float_exception_flags(&env->sse_status) & > + ~float_flag_inexact, > + &env->sse_status); > +#endif > + env->sse_status.float_rounding_mode = prev_rounding_mode; > +} > + > +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) > +{ > + signed char prev_rounding_mode; > + > + prev_rounding_mode = env->sse_status.float_rounding_mode; > + if (!(mode & (1 << 2))) > + switch (mode & 3) { > + case 0: > + set_float_rounding_mode(float_round_nearest_even, &env- > >sse_status); > + break; > + case 1: > + set_float_rounding_mode(float_round_down, &env- > >sse_status); > + break; > + case 2: > + set_float_rounding_mode(float_round_up, &env- > >sse_status); > + break; > + case 3: > + set_float_rounding_mode(float_round_to_zero, &env- > >sse_status); > + break; > + } > + > + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); > + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); > + > +#if 0 /* TODO */ > + if (mode & (1 << 3)) > + set_float_exception_flags( > + get_float_exception_flags(&env->sse_status) & > + ~float_flag_inexact, > + &env->sse_status); > +#endif > + env->sse_status.float_rounding_mode = prev_rounding_mode; > +} > + > +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) > +{ > + signed char prev_rounding_mode; > + > + prev_rounding_mode = env->sse_status.float_rounding_mode; > + if (!(mode & (1 << 2))) > + switch (mode & 3) { > + case 0: > + set_float_rounding_mode(float_round_nearest_even, &env- > >sse_status); > + break; > + case 1: > + 
set_float_rounding_mode(float_round_down, &env- > >sse_status); > + break; > + case 2: > + set_float_rounding_mode(float_round_up, &env- > >sse_status); > + break; > + case 3: > + set_float_rounding_mode(float_round_to_zero, &env- > >sse_status); > + break; > + } > + > + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); > + > +#if 0 /* TODO */ > + if (mode & (1 << 3)) > + set_float_exception_flags( > + get_float_exception_flags(&env->sse_status) & > + ~float_flag_inexact, > + &env->sse_status); > +#endif > + env->sse_status.float_rounding_mode = prev_rounding_mode; > +} > + > +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) > +{ > + signed char prev_rounding_mode; > + > + prev_rounding_mode = env->sse_status.float_rounding_mode; > + if (!(mode & (1 << 2))) > + switch (mode & 3) { > + case 0: > + set_float_rounding_mode(float_round_nearest_even, &env- > >sse_status); > + break; > + case 1: > + set_float_rounding_mode(float_round_down, &env- > >sse_status); > + break; > + case 2: > + set_float_rounding_mode(float_round_up, &env- > >sse_status); > + break; > + case 3: > + set_float_rounding_mode(float_round_to_zero, &env- > >sse_status); > + break; > + } > + > + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); > + > +#if 0 /* TODO */ > + if (mode & (1 << 3)) > + set_float_exception_flags( > + get_float_exception_flags(&env->sse_status) & > + ~float_flag_inexact, > + &env->sse_status); > +#endif > + env->sse_status.float_rounding_mode = prev_rounding_mode; > +} > + > +#define FBLENDP(d, s, m) m ? 
s : d > +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) > +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) > +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) > + > +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) > +{ > + float32 iresult = 0 /*float32_zero*/; > + > + if (mask & (1 << 4)) > + iresult = float32_add(iresult, > + float32_mul(d->L(0), s->L(0), &env- > >sse_status), > + &env->sse_status); > + if (mask & (1 << 5)) > + iresult = float32_add(iresult, > + float32_mul(d->L(1), s->L(1), &env- > >sse_status), > + &env->sse_status); > + if (mask & (1 << 6)) > + iresult = float32_add(iresult, > + float32_mul(d->L(2), s->L(2), &env- > >sse_status), > + &env->sse_status); > + if (mask & (1 << 7)) > + iresult = float32_add(iresult, > + float32_mul(d->L(3), s->L(3), &env- > >sse_status), > + &env->sse_status); > + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; > + d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/; > + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; > + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; > +} > + > +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) > +{ > + float64 iresult = 0 /*float64_zero*/; > + > + if (mask & (1 << 4)) > + iresult = float64_add(iresult, > + float64_mul(d->Q(0), s->Q(0), &env- > >sse_status), > + &env->sse_status); > + if (mask & (1 << 5)) > + iresult = float64_add(iresult, > + float64_mul(d->Q(1), s->Q(1), &env- > >sse_status), > + &env->sse_status); > + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; > + d->Q(1) = (mask & (1 << 1)) ? 
iresult : 0 /*float64_zero*/; > +} > + > +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) > +{ > + int s0 = (offset & 3) << 2; > + int d0 = (offset & 4) << 0; > + int i; > + Reg r; > + > + for (i = 0; i < 8; i++, d0++) { > + r.W(i) = 0; > + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); > + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); > + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); > + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); > + } > + > + *d = r; > +} > + > +/* SSE4.2 op helpers */ > +/* it's unclear whether signed or unsigned */ > +#define FCMPGTQ(d, s) d > s ? -1 : 0 > +SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) > + > +static inline int pcmp_elen(int reg, uint32_t ctrl) > +{ > + int val; > + > + /* Presence of REX.W is indicated by a bit higher than 7 set */ > + if (ctrl >> 8) > + val = abs1((int64_t) env->regs[reg]); > + else > + val = abs1((int32_t) env->regs[reg]); > + > + if (ctrl & 1) { > + if (val > 8) > + return 8; > + } else > + if (val > 16) > + return 16; > + > + return val; > +} > + > +static inline int pcmp_ilen(Reg *r, uint8_t ctrl) > +{ > + int val = 0; > + > + if (ctrl & 1) { > + while (val < 8 && r->W(val)) > + val++; > + } else > + while (val < 16 && r->B(val)) > + val++; > + > + return val; > +} > + > +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) > +{ > + switch ((ctrl >> 0) & 3) { > + case 0: > + return r->B(i); > + case 1: > + return r->W(i); > + case 2: > + return (int8_t) r->B(i); > + case 3: > + default: > + return (int16_t) r->W(i); > + } > +} > + > +static inline unsigned pcmpxstrx(Reg *d, Reg *s, > + int8_t ctrl, int valids, int validd) > +{ > + unsigned int res = 0; > + int v; > + int j, i; > + int upper = (ctrl & 1) ? 7 : 15; > + > + valids--; > + validd--; > + > + CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? 
CC_S : > 0); > + > + switch ((ctrl >> 2) & 3) { > + case 0: > + for (j = valids; j >= 0; j--) { > + res <<= 1; > + v = pcmp_val(s, ctrl, j); > + for (i = validd; i >= 0; i--) > + res |= (v == pcmp_val(d, ctrl, i)); > + } > + break; > + case 1: > + for (j = valids; j >= 0; j--) { > + res <<= 1; > + v = pcmp_val(s, ctrl, j); > + for (i = ((validd - 1) | 1); i >= 0; i -= 2) > + res |= (pcmp_val(d, ctrl, i - 0) <= v && > + pcmp_val(d, ctrl, i - 1) >= v); > + } > + break; > + case 2: > + res = (2 << (upper - MAX(valids, validd))) - 1; > + res <<= MAX(valids, validd) - MIN(valids, validd); > + for (i = MIN(valids, validd); i >= 0; i--) { > + res <<= 1; > + v = pcmp_val(s, ctrl, i); > + res |= (v == pcmp_val(d, ctrl, i)); > + } > + break; > + case 3: > + for (j = valids - validd; j >= 0; j--) { > + res <<= 1; > + res |= 1; > + for (i = MIN(upper - j, validd); i >= 0; i--) > + res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, > ctrl, i)); > + } > + break; > + } > + > + switch ((ctrl >> 4) & 3) { > + case 1: > + res ^= (2 << upper) - 1; > + break; > + case 3: > + res ^= (2 << valids) - 1; > + break; > + } > + > + if (res) > + CC_SRC |= CC_C; > + if (res & 1) > + CC_SRC |= CC_O; > + > + return res; > +} > + > +static inline int rffs1(unsigned int val) > +{ > + int ret = 1, hi; > + > + for (hi = sizeof(val) * 4; hi; hi /= 2) > + if (val >> hi) { > + val >>= hi; > + ret += hi; > + } > + > + return ret; > +} > + > +static inline int ffs1(unsigned int val) > +{ > + int ret = 1, hi; > + > + for (hi = sizeof(val) * 4; hi; hi /= 2) > + if (val << hi) { > + val <<= hi; > + ret += hi; > + } > + > + return ret; > +} > + > +void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) > +{ > + unsigned int res = pcmpxstrx(d, s, ctrl, > + pcmp_elen(R_EDX, ctrl), > + pcmp_elen(R_EAX, ctrl)); > + > + if (res) > + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? 
rffs1 : ffs1)(res) > - 1; > + else > + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); > +} > + > +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) > +{ > + int i; > + unsigned int res = pcmpxstrx(d, s, ctrl, > + pcmp_elen(R_EDX, ctrl), > + pcmp_elen(R_EAX, ctrl)); > + > + if ((ctrl >> 6) & 1) { > + if (ctrl & 1) > + for (i = 0; i <= 8; i--, res >>= 1) > + d->W(i) = (res & 1) ? ~0 : 0; > + else > + for (i = 0; i <= 16; i--, res >>= 1) > + d->B(i) = (res & 1) ? ~0 : 0; > + } else { > + d->Q(1) = 0; > + d->Q(0) = res; > + } > +} > + > +void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) > +{ > + unsigned int res = pcmpxstrx(d, s, ctrl, > + pcmp_ilen(s, ctrl), > + pcmp_ilen(d, ctrl)); > + > + if (res) > + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) > - 1; > + else > + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); > +} > + > +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) > +{ > + int i; > + unsigned int res = pcmpxstrx(d, s, ctrl, > + pcmp_ilen(s, ctrl), > + pcmp_ilen(d, ctrl)); > + > + if ((ctrl >> 6) & 1) { > + if (ctrl & 1) > + for (i = 0; i <= 8; i--, res >>= 1) > + d->W(i) = (res & 1) ? ~0 : 0; > + else > + for (i = 0; i <= 16; i--, res >>= 1) > + d->B(i) = (res & 1) ? ~0 : 0; > + } else { > + d->Q(1) = 0; > + d->Q(0) = res; > + } > +} > + > +#define CRCPOLY 0x1edc6f41 > +#define CRCPOLY_BITREV 0x82f63b78 > +target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t > len) > +{ > + target_ulong crc = (msg & ((target_ulong) -1 >> > + (TARGET_LONG_BITS - len))) ^ crc1; > + > + while (len--) > + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); > + > + return crc; > +} > + > +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) > +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & > POPMASK(i)) > +target_ulong helper_popcnt(target_ulong n, uint32_t type) > +{ > + CC_SRC = n ? 
0 : CC_Z; > + > + n = POPCOUNT(n, 0); > + n = POPCOUNT(n, 1); > + n = POPCOUNT(n, 2); > + n = POPCOUNT(n, 3); > + if (type == 1) > + return n & 0xff; > + > + n = POPCOUNT(n, 4); > +#ifndef TARGET_X86_64 > + return n; > +#else > + if (type == 2) > + return n & 0xff; > + > + return POPCOUNT(n, 5); > +#endif > +} > +#endif > + > #undef SHIFT > #undef XMM_ONLY > #undef Reg > > Modified: trunk/target-i386/ops_sse_header.h > =================================================================== > --- trunk/target-i386/ops_sse_header.h 2008-10-03 23:09:08 UTC (rev > 5410) > +++ trunk/target-i386/ops_sse_header.h 2008-10-04 03:27:44 UTC (rev > 5411) > @@ -1,5 +1,5 @@ > /* > - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support > + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support > * > * Copyright (c) 2005 Fabrice Bellard > * > @@ -269,6 +269,61 @@ > DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s)) > DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, > int32_t shift)) > > +/* SSE4.1 op helpers */ > +#if SHIFT == 1 > +DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg 
*s)) > +DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmovzxdq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s)) > +DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, > uint32_t mode)) > +DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, > uint32_t mode)) > +DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, > uint32_t mode)) > +DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, > uint32_t mode)) > +DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, > uint32_t imm)) > +DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, > uint32_t imm)) > +DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, > uint32_t imm)) > +DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, > uint32_t mask)) > +DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, > uint32_t mask)) > +DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, > uint32_t off)) > +#endif > + > +/* SSE4.2 op helpers */ > +#if SHIFT == 1 > +DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s)) > 
+DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, > uint32_t ctl)) > +DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, > uint32_t ctl)) > +DEF_HELPER(void, glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, > uint32_t ctl)) > +DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, > uint32_t ctl)) > +DEF_HELPER(target_ulong, helper_crc32, > + (uint32_t crc1, target_ulong msg, uint32_t len)) > +DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t > type)) > +#endif > + > #undef SHIFT > #undef Reg > #undef SUFFIX > > Modified: trunk/target-i386/translate.c > =================================================================== > --- trunk/target-i386/translate.c 2008-10-03 23:09:08 UTC (rev 5410) > +++ trunk/target-i386/translate.c 2008-10-04 03:27:44 UTC (rev 5411) > @@ -2140,7 +2140,7 @@ > } > } > > -/* generate modrm memory load or store of 'reg'. TMP0 is used if > reg != > +/* generate modrm memory load or store of 'reg'. TMP0 is used if > reg == > OR_TMP0 */ > static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int > reg, int is_store) > { > @@ -2770,8 +2770,8 @@ > [0xc2] = SSE_FOP(cmpeq), > [0xc6] = { helper_shufps, helper_shufpd }, > > - [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ > - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ > + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* > SSSE3/SSE4 */ > + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */ > > /* MMX ops and their SSE extensions */ > [0x60] = MMX_OP2(punpcklbw), > @@ -2924,26 +2924,85 @@ > [0xbf] = helper_pavgb_mmx /* pavgusb */ > }; > > -static void *sse_op_table6[256][2] = { > - [0x00] = MMX_OP2(pshufb), > - [0x01] = MMX_OP2(phaddw), > - [0x02] = MMX_OP2(phaddd), > - [0x03] = MMX_OP2(phaddsw), > - [0x04] = MMX_OP2(pmaddubsw), > - [0x05] = MMX_OP2(phsubw), > - [0x06] = MMX_OP2(phsubd), > - [0x07] = MMX_OP2(phsubsw), > - [0x08] = MMX_OP2(psignb), > - [0x09] = MMX_OP2(psignw), > - [0x0a] = MMX_OP2(psignd), 
> - [0x0b] = MMX_OP2(pmulhrsw), > - [0x1c] = MMX_OP2(pabsb), > - [0x1d] = MMX_OP2(pabsw), > - [0x1e] = MMX_OP2(pabsd), > +struct sse_op_helper_s { > + void *op[2]; uint32_t ext_mask; > }; > +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } > +#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, > CPUID_EXT_SSE41 } > +#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, > CPUID_EXT_SSE42 } > +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } > +static struct sse_op_helper_s sse_op_table6[256] = { > + [0x00] = SSSE3_OP(pshufb), > + [0x01] = SSSE3_OP(phaddw), > + [0x02] = SSSE3_OP(phaddd), > + [0x03] = SSSE3_OP(phaddsw), > + [0x04] = SSSE3_OP(pmaddubsw), > + [0x05] = SSSE3_OP(phsubw), > + [0x06] = SSSE3_OP(phsubd), > + [0x07] = SSSE3_OP(phsubsw), > + [0x08] = SSSE3_OP(psignb), > + [0x09] = SSSE3_OP(psignw), > + [0x0a] = SSSE3_OP(psignd), > + [0x0b] = SSSE3_OP(pmulhrsw), > + [0x10] = SSE41_OP(pblendvb), > + [0x14] = SSE41_OP(blendvps), > + [0x15] = SSE41_OP(blendvpd), > + [0x17] = SSE41_OP(ptest), > + [0x1c] = SSSE3_OP(pabsb), > + [0x1d] = SSSE3_OP(pabsw), > + [0x1e] = SSSE3_OP(pabsd), > + [0x20] = SSE41_OP(pmovsxbw), > + [0x21] = SSE41_OP(pmovsxbd), > + [0x22] = SSE41_OP(pmovsxbq), > + [0x23] = SSE41_OP(pmovsxwd), > + [0x24] = SSE41_OP(pmovsxwq), > + [0x25] = SSE41_OP(pmovsxdq), > + [0x28] = SSE41_OP(pmuldq), > + [0x29] = SSE41_OP(pcmpeqq), > + [0x2a] = SSE41_SPECIAL, /* movntqda */ > + [0x2b] = SSE41_OP(packusdw), > + [0x30] = SSE41_OP(pmovzxbw), > + [0x31] = SSE41_OP(pmovzxbd), > + [0x32] = SSE41_OP(pmovzxbq), > + [0x33] = SSE41_OP(pmovzxwd), > + [0x34] = SSE41_OP(pmovzxwq), > + [0x35] = SSE41_OP(pmovzxdq), > + [0x37] = SSE42_OP(pcmpgtq), > + [0x38] = SSE41_OP(pminsb), > + [0x39] = SSE41_OP(pminsd), > + [0x3a] = SSE41_OP(pminuw), > + [0x3b] = SSE41_OP(pminud), > + [0x3c] = SSE41_OP(pmaxsb), > + [0x3d] = SSE41_OP(pmaxsd), > + [0x3e] = SSE41_OP(pmaxuw), > + [0x3f] = SSE41_OP(pmaxud), > + [0x40] = SSE41_OP(pmulld), > + [0x41] = SSE41_OP(phminposuw), 
> +}; > > -static void *sse_op_table7[256][2] = { > - [0x0f] = MMX_OP2(palignr), > +static struct sse_op_helper_s sse_op_table7[256] = { > + [0x08] = SSE41_OP(roundps), > + [0x09] = SSE41_OP(roundpd), > + [0x0a] = SSE41_OP(roundss), > + [0x0b] = SSE41_OP(roundsd), > + [0x0c] = SSE41_OP(blendps), > + [0x0d] = SSE41_OP(blendpd), > + [0x0e] = SSE41_OP(pblendw), > + [0x0f] = SSSE3_OP(palignr), > + [0x14] = SSE41_SPECIAL, /* pextrb */ > + [0x15] = SSE41_SPECIAL, /* pextrw */ > + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ > + [0x17] = SSE41_SPECIAL, /* extractps */ > + [0x20] = SSE41_SPECIAL, /* pinsrb */ > + [0x21] = SSE41_SPECIAL, /* insertps */ > + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ > + [0x40] = SSE41_OP(dpps), > + [0x41] = SSE41_OP(dppd), > + [0x42] = SSE41_OP(mpsadbw), > + [0x60] = SSE42_OP(pcmpestrm), > + [0x61] = SSE42_OP(pcmpestri), > + [0x62] = SSE42_OP(pcmpistrm), > + [0x63] = SSE42_OP(pcmpistri), > }; > > static void gen_sse(DisasContext *s, int b, target_ulong pc_start, > int rex_r) > @@ -3511,18 +3570,20 @@ > break; > case 0x038: > case 0x138: > - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) > - goto illegal_op; > - > b = modrm; > modrm = ldub_code(s->pc++); > rm = modrm & 7; > reg = ((modrm >> 3) & 7) | rex_r; > mod = (modrm >> 6) & 3; > > - sse_op2 = sse_op_table6[b][b1]; > + if (s->prefix & PREFIX_REPNZ) > + goto crc32; > + > + sse_op2 = sse_op_table6[b].op[b1]; > if (!sse_op2) > goto illegal_op; > + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) > + goto illegal_op; > > if (b1) { > op1_offset = offsetof(CPUX86State,xmm_regs[reg]); > @@ -3531,7 +3592,32 @@ > } else { > op2_offset = offsetof(CPUX86State,xmm_t0); > gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); > - gen_ldo_env_A0(s->mem_index, op2_offset); > + switch (b) { > + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ > + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ > + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ > + gen_ldq_env_A0(s->mem_index, op2_offset + > + 
offsetof(XMMReg, XMM_Q(0))); > + break; > + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ > + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ > + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, > op2_offset + > + offsetof(XMMReg, XMM_L(0))); > + break; > + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ > + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st16_tl(cpu_tmp0, cpu_env, > op2_offset + > + offsetof(XMMReg, XMM_W(0))); > + break; > + case 0x2a: /* movntqda */ > + gen_ldo_env_A0(s->mem_index, op1_offset); > + return; > + default: > + gen_ldo_env_A0(s->mem_index, op2_offset); > + } > } > } else { > op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); > @@ -3543,25 +3629,178 @@ > gen_ldq_env_A0(s->mem_index, op2_offset); > } > } > + if (sse_op2 == SSE_SPECIAL) > + goto illegal_op; > + > tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); > tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); > tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1); > + > + if (b == 0x17) > + s->cc_op = CC_OP_EFLAGS; > break; > + case 0x338: /* crc32 */ > + crc32: > + b = modrm; > + modrm = ldub_code(s->pc++); > + reg = ((modrm >> 3) & 7) | rex_r; > + > + if (b != 0xf0 && b != 0xf1) > + goto illegal_op; > + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) > + goto illegal_op; > + > + if (b == 0xf0) > + ot = OT_BYTE; > + else if (b == 0xf1 && s->dflag != 2) > + if (s->prefix & PREFIX_DATA) > + ot = OT_WORD; > + else > + ot = OT_LONG; > + else > + ot = OT_QUAD; > + > + gen_op_mov_TN_reg(OT_LONG, 0, reg); > + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); > + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); > + tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32, > + cpu_T[0], tcg_const_i32(8 << ot)); > + > + ot = (s->dflag == 2) ? 
OT_QUAD : OT_LONG; > + gen_op_mov_reg_T0(ot, reg); > + break; > case 0x03a: > case 0x13a: > - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) > - goto illegal_op; > - > b = modrm; > modrm = ldub_code(s->pc++); > rm = modrm & 7; > reg = ((modrm >> 3) & 7) | rex_r; > mod = (modrm >> 6) & 3; > > - sse_op2 = sse_op_table7[b][b1]; > + sse_op2 = sse_op_table7[b].op[b1]; > if (!sse_op2) > goto illegal_op; > + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) > + goto illegal_op; > > + if (sse_op2 == SSE_SPECIAL) { > + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; > + rm = (modrm & 7) | REX_B(s); > + if (mod != 3) > + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); > + reg = ((modrm >> 3) & 7) | rex_r; > + val = ldub_code(s->pc++); > + switch (b) { > + case 0x14: /* pextrb */ > + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, > offsetof(CPUX86State, > + xmm_regs[reg].XMM_B(val > & 15))); > + if (mod == 3) > + gen_op_mov_reg_T0(ot, rm); > + else > + tcg_gen_qemu_st8(cpu_T[0], cpu_A0, > + (s->mem_index >> 2) - 1); > + break; > + case 0x15: /* pextrw */ > + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, > offsetof(CPUX86State, > + xmm_regs[reg].XMM_W(val > & 7))); > + if (mod == 3) > + gen_op_mov_reg_T0(ot, rm); > + else > + tcg_gen_qemu_st16(cpu_T[0], cpu_A0, > + (s->mem_index >> 2) - 1); > + break; > + case 0x16: > + if (ot == OT_LONG) { /* pextrd */ > + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, > + offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(val & 3))); > + if (mod == 3) > + gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32); > + else > + tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0, > + (s->mem_index >> 2) - 1); > + } else { /* pextrq */ > + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, > + offsetof(CPUX86State, > + > xmm_regs[reg].XMM_Q(val & 1))); > + if (mod == 3) > + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64); > + else > + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, > + (s->mem_index >> 2) - 1); > + } > + break; > + case 0x17: /* extractps */ > + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, > offsetof(CPUX86State, > + 
xmm_regs[reg].XMM_L(val > & 3))); > + if (mod == 3) > + gen_op_mov_reg_T0(ot, rm); > + else > + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, > + (s->mem_index >> 2) - 1); > + break; > + case 0x20: /* pinsrb */ > + if (mod == 3) > + gen_op_mov_TN_reg(OT_LONG, 0, rm); > + else > + tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st8_tl(cpu_T[0], cpu_env, > offsetof(CPUX86State, > + xmm_regs[reg].XMM_B(val > & 15))); > + break; > + case 0x21: /* insertps */ > + if (mod == 3) > + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, > + > offsetof(CPUX86State,xmm_regs[rm] > + .XMM_L((val >> 6) & > 3))); > + else > + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, > + > offsetof(CPUX86State,xmm_regs[reg] > + .XMM_L((val >> 4) & 3))); > + if ((val >> 0) & 1) > + tcg_gen_st_i32(tcg_const_i32(0 / > *float32_zero*/), > + cpu_env, > offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(0))); > + if ((val >> 1) & 1) > + tcg_gen_st_i32(tcg_const_i32(0 / > *float32_zero*/), > + cpu_env, > offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(1))); > + if ((val >> 2) & 1) > + tcg_gen_st_i32(tcg_const_i32(0 / > *float32_zero*/), > + cpu_env, > offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(2))); > + if ((val >> 3) & 1) > + tcg_gen_st_i32(tcg_const_i32(0 / > *float32_zero*/), > + cpu_env, > offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(3))); > + break; > + case 0x22: > + if (ot == OT_LONG) { /* pinsrd */ > + if (mod == 3) > + gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm); > + else > + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, > + offsetof(CPUX86State, > + > xmm_regs[reg].XMM_L(val & 3))); > + } else { /* pinsrq */ > + if (mod == 3) > + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm); > + else > + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, > + (s->mem_index >> 2) - 1); > + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, > + offsetof(CPUX86State, > + > xmm_regs[reg].XMM_Q(val & 
1))); > + } > + break; > + } > + return; > + } > + > if (b1) { > op1_offset = offsetof(CPUX86State,xmm_regs[reg]); > if (mod == 3) { > @@ -3583,6 +3822,14 @@ > } > val = ldub_code(s->pc++); > > + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ > + s->cc_op = CC_OP_EFLAGS; > + > + if (s->dflag == 2) > + /* The helper must use entire 64-bit gp > registers */ > + val |= 1 << 8; > + } > + > tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); > tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); > tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, > tcg_const_i32(val)); > @@ -7094,7 +7341,7 @@ > gen_eob(s); > } > break; > - /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */ > + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */ > case 0x1c3: /* MOVNTI reg, mem */ > if (!(s->cpuid_features & CPUID_SSE2)) > goto illegal_op; > @@ -7202,6 +7449,28 @@ > tcg_gen_helper_0_0(helper_rsm); > gen_eob(s); > break; > + case 0x1b8: /* SSE4.2 popcnt */ > + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | > PREFIX_REPNZ)) != > + PREFIX_REPZ) > + goto illegal_op; > + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) > + goto illegal_op; > + > + modrm = ldub_code(s->pc++); > + reg = ((modrm >> 3) & 7); > + > + if (s->prefix & PREFIX_DATA) > + ot = OT_WORD; > + else if (s->dflag != 2) > + ot = OT_LONG; > + else > + ot = OT_QUAD; > + > + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); > + tcg_gen_helper_1_2(helper_popcnt, > + cpu_T[0], cpu_T[0], tcg_const_i32(ot)); > + gen_op_mov_reg_T0(ot, reg); > + break; > case 0x10e ... 0x10f: > /* 3DNow! 
instructions, ignore prefixes */ > s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); > > Modified: trunk/tests/test-i386-ssse3.c > =================================================================== > --- trunk/tests/test-i386-ssse3.c 2008-10-03 23:09:08 UTC (rev 5410) > +++ trunk/tests/test-i386-ssse3.c 2008-10-04 03:27:44 UTC (rev 5411) > @@ -1,6 +1,7 @@ > /* See if various MMX/SSE SSSE3 instructions give expected results */ > #include <stdio.h> > #include <string.h> > +#include <stdint.h> > > int main(int argc, char *argv[]) { > char hello[16]; > @@ -9,9 +10,11 @@ > > uint64_t a = 0x0000000000090007; > uint64_t b = 0x0000000000000000; > + uint32_t c; > + uint16_t d; > > - const char c[16] = "LLOaaaaaaaaaaaaa"; > - const char d[16] = "aaaaaaaaaaaaaaHE"; > + const char e[16] = "LLOaaaaaaaaaaaaa"; > + const char f[16] = "aaaaaaaaaaaaaaHE"; > > /* pshufb mm1/xmm1, mm2/xmm2 */ > asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); > @@ -33,10 +36,22 @@ > printf("%i - %i = %i\n", 9, 7, -(int16_t) a); > > /* palignr mm1/xmm1, m64/m128, imm8 */ > - asm volatile ("movdqa (%0), %%xmm0" : : "r" (c) : "xmm0"); > - asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d)); > + asm volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0"); > + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f)); > asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); > printf("%5.5s\n", hello); > > +#if 1 /* SSE4 */ > + /* popcnt r64, r/m64 */ > + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax"); > + asm volatile ("popcnt %%ax, %%dx" : : : "dx"); > + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx"); > + asm volatile ("popcnt %rax, %rax"); > + asm volatile ("movq %%rax, %0" : "=m" (a)); > + asm volatile ("movl %%ecx, %0" : "=m" (c)); > + asm volatile ("movw %%dx, %0" : "=m" (d)); > + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1); > +#endif > + > return 0; > } > > > > [-- Attachment #2: Type: text/html, Size: 101469 bytes --] ^ permalink raw reply 
[flat|nested] 3+ messages in thread
* Re: [Qemu-devel] [5411] Implement SSE4.1, SSE4.2 (x86). 2008-10-04 3:56 ` C.W. Betts @ 2008-10-04 11:38 ` andrzej zaborowski 0 siblings, 0 replies; 3+ messages in thread From: andrzej zaborowski @ 2008-10-04 11:38 UTC (permalink / raw) To: qemu-devel 2008/10/4 C. W. Betts <computers57@hotmail.com>: > I get a lot of warnings complaining that a number might not fit in long when > I compile it on 32-bit processors: > gcc -I. -I.. -I/Users/cwbetts/makestuff/qemu-allmac/src/target-i386 > -I/Users/cwbetts/makestuff/qemu-allmac/src -MMD -MT op_helper.o -MP > -DNEED_CPU_H -D__powerpc__ -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 > -D_LARGEFILE_SOURCE -I/Users/cwbetts/makestuff/qemu-allmac/src/tcg > -I/Users/cwbetts/makestuff/qemu-allmac/src/tcg/ppc > -I/Users/cwbetts/makestuff/qemu-allmac/src/fpu -DHAS_AUDIO > -DHAS_AUDIO_CHOICE -I/Users/cwbetts/makestuff/qemu-allmac/src/slirp -Wall > -O2 -g -fno-strict-aliasing -mdynamic-no-pic -m32 -arch ppc > -mmacosx-version-min=10.3 -isysroot /Developer/SDKs/MacOSX10.3.9.sdk > -mcpu=G3 -DMAC_OS_X_VERSION_MIN_REQUIRED=1030 -mtune=G4 -c -o op_helper.o > /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/op_helper.c > In file included from > /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/op_helper.c:5443: > /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h: In function > 'helper_blendvpd_xmm': > /Users/cwbetts/makestuff/qemu-allmac/src/target-i386/ops_sse.h:1487: > warning: integer constant is too large for 'long' type Thanks for noticing. Indeed, the number was too large and would get truncated to a wrong value; this is now corrected. Cheers ^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2008-10-04 11:38 UTC | newest] Thread overview: 3+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-10-04 3:27 [Qemu-devel] [5411] Implement SSE4.1, SSE4.2 (x86) Andrzej Zaborowski [not found] ` <0C221828-78E4-49DC-AB8E-58B5BBD3F294@hotmail.com> 2008-10-04 3:56 ` C.W. Betts 2008-10-04 11:38 ` andrzej zaborowski
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).