* [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
@ 2015-09-19 0:03 gang.chen.5i5j
2015-09-19 2:34 ` Richard Henderson
0 siblings, 1 reply; 6+ messages in thread
From: gang.chen.5i5j @ 2015-09-19 0:03 UTC (permalink / raw)
To: peter.maydell, rth; +Cc: qemu-devel, xili_gchen_5257, Chen Gang
From: Chen Gang <gang.chen.5i5j@gmail.com>
Implement them as per-element loops, following the pattern of the existing helper_v1shrs.
Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
---
target-tilegx/helper.h | 8 +++++
target-tilegx/simd_helper.c | 77 +++++++++++++++++++++++++++++++++++++++++++++
target-tilegx/translate.c | 26 +++++++++++++--
3 files changed, 109 insertions(+), 2 deletions(-)
diff --git a/target-tilegx/helper.h b/target-tilegx/helper.h
index 15093973..c366984 100644
--- a/target-tilegx/helper.h
+++ b/target-tilegx/helper.h
@@ -5,12 +5,20 @@ DEF_HELPER_FLAGS_1(pcnt, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_1(revbits, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_3(shufflebytes, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64)
+DEF_HELPER_FLAGS_2(v1add, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v1shl, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v1shru, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v1shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(v1sub, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(v2add, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v2shl, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v2shru, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v2shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(v2sub, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(v4add, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v4shl, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v4shru, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(v4shrs, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(v4sub, TCG_CALL_NO_RWG_SE, i64, i64, i64)
diff --git a/target-tilegx/simd_helper.c b/target-tilegx/simd_helper.c
index 6546337..ec589fe 100644
--- a/target-tilegx/simd_helper.c
+++ b/target-tilegx/simd_helper.c
@@ -22,6 +22,83 @@
#include "qemu-common.h"
#include "exec/helper-proto.h"
+uint64_t helper_v1add(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 8) {
+ int64_t ae = (int8_t)(a >> i);
+ int64_t be = (int8_t)(b >> i);
+ r |= ((ae + be) & 0xff) << i;
+ }
+ return r;
+}
+
+uint64_t helper_v2add(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 16) {
+ int64_t ae = (int16_t)(a >> i);
+ int64_t be = (int16_t)(b >> i);
+ r |= ((ae + be) & 0xffff) << i;
+ }
+ return r;
+}
+
+uint64_t helper_v4add(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 32) {
+ int64_t ae = (int32_t)(a >> i);
+ int64_t be = (int32_t)(b >> i);
+ r |= ((ae + be) & 0xffffffff) << i;
+ }
+ return r;
+}
+
+uint64_t helper_v1sub(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 8) {
+ int64_t ae = (int8_t)(a >> i);
+ int64_t be = (int8_t)(b >> i);
+ r |= ((ae - be) & 0xff) << i;
+ }
+ return r;
+}
+
+uint64_t helper_v2sub(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 16) {
+ int64_t ae = (int16_t)(a >> i);
+ int64_t be = (int16_t)(b >> i);
+ r |= ((ae - be) & 0xffff) << i;
+ }
+ return r;
+}
+
+uint64_t helper_v4sub(uint64_t a, uint64_t b)
+{
+ uint64_t r = 0;
+ int i;
+
+ for (i = 0; i < 64; i += 32) {
+ int64_t ae = (int32_t)(a >> i);
+ int64_t be = (int32_t)(b >> i);
+ r |= ((ae - be) & 0xffffffff) << i;
+ }
+ return r;
+}
uint64_t helper_v1shl(uint64_t a, uint64_t b)
{
diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
index c8247ac..2246243 100644
--- a/target-tilegx/translate.c
+++ b/target-tilegx/translate.c
@@ -1024,8 +1024,12 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext,
break;
case OE_RRR(V1ADDUC, 0, X0):
case OE_RRR(V1ADDUC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V1ADD, 0, X0):
case OE_RRR(V1ADD, 0, X1):
+ gen_helper_v1add(tdest, tsrca, tsrcb);
+ mnemonic = "v1add";
+ break;
case OE_RRR(V1ADIFFU, 0, X0):
case OE_RRR(V1AVGU, 0, X0):
return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
@@ -1095,12 +1099,20 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext,
break;
case OE_RRR(V1SUBUC, 0, X0):
case OE_RRR(V1SUBUC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V1SUB, 0, X0):
case OE_RRR(V1SUB, 0, X1):
+ gen_helper_v1sub(tdest, tsrca, tsrcb);
+ mnemonic = "v1sub";
+ break;
case OE_RRR(V2ADDSC, 0, X0):
case OE_RRR(V2ADDSC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V2ADD, 0, X0):
case OE_RRR(V2ADD, 0, X1):
+ gen_helper_v2add(tdest, tsrca, tsrcb);
+ mnemonic = "v2add";
+ break;
case OE_RRR(V2ADIFFS, 0, X0):
case OE_RRR(V2AVGS, 0, X0):
case OE_RRR(V2CMPEQ, 0, X0):
@@ -1162,13 +1174,20 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext,
break;
case OE_RRR(V2SUBSC, 0, X0):
case OE_RRR(V2SUBSC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V2SUB, 0, X0):
case OE_RRR(V2SUB, 0, X1):
+ gen_helper_v2sub(tdest, tsrca, tsrcb);
+ mnemonic = "v2sub";
+ break;
case OE_RRR(V4ADDSC, 0, X0):
case OE_RRR(V4ADDSC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V4ADD, 0, X0):
case OE_RRR(V4ADD, 0, X1):
- return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+ gen_helper_v4add(tdest, tsrca, tsrcb);
+ mnemonic = "v4add";
+ break;
case OE_RRR(V4INT_H, 0, X0):
case OE_RRR(V4INT_H, 0, X1):
tcg_gen_shri_tl(tdest, tsrcb, 32);
@@ -1202,9 +1221,12 @@ static TileExcp gen_rrr_opcode(DisasContext *dc, unsigned opext,
break;
case OE_RRR(V4SUBSC, 0, X0):
case OE_RRR(V4SUBSC, 0, X1):
+ return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
case OE_RRR(V4SUB, 0, X0):
case OE_RRR(V4SUB, 0, X1):
- return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
+ gen_helper_v4sub(tdest, tsrca, tsrcb);
+ mnemonic = "v4sub";
+ break;
case OE_RRR(XOR, 0, X0):
case OE_RRR(XOR, 0, X1):
case OE_RRR(XOR, 5, Y0):
--
1.9.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
2015-09-19 0:03 [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions gang.chen.5i5j
@ 2015-09-19 2:34 ` Richard Henderson
2015-09-20 22:37 ` Chen Gang
2015-09-22 5:54 ` Chen Gang
0 siblings, 2 replies; 6+ messages in thread
From: Richard Henderson @ 2015-09-19 2:34 UTC (permalink / raw)
To: gang.chen.5i5j, peter.maydell; +Cc: qemu-devel, xili_gchen_5257
On 09/18/2015 05:03 PM, gang.chen.5i5j@gmail.com wrote:
> +uint64_t helper_v1add(uint64_t a, uint64_t b)
> +{
> + uint64_t r = 0;
> + int i;
> +
> + for (i = 0; i < 64; i += 8) {
> + int64_t ae = (int8_t)(a >> i);
> + int64_t be = (int8_t)(b >> i);
> + r |= ((ae + be) & 0xff) << i;
> + }
> + return r;
> +}
> +
> +uint64_t helper_v2add(uint64_t a, uint64_t b)
> +{
> + uint64_t r = 0;
> + int i;
> +
> + for (i = 0; i < 64; i += 16) {
> + int64_t ae = (int16_t)(a >> i);
> + int64_t be = (int16_t)(b >> i);
> + r |= ((ae + be) & 0xffff) << i;
> + }
> + return r;
> +}
There's a trick for this that's more efficient for 4 or more elements per
vector (i.e. good for v2 and v1, but not v4):
a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
> +uint64_t helper_v4add(uint64_t a, uint64_t b)
> +{
> + uint64_t r = 0;
> + int i;
> +
> + for (i = 0; i < 64; i += 32) {
> + int64_t ae = (int32_t)(a >> i);
> + int64_t be = (int32_t)(b >> i);
> + r |= ((ae + be) & 0xffffffff) << i;
> + }
> + return r;
> +}
I should have mentioned this in the previous patch...
I think probably it would be best to open-code all, or most of, the v4
operations. Something like
static void gen_v4op(TCGv d64, TCGv a64, TCGv b64,
void (*generate)(TCGv_i32, TCGv_i32, TCGv_i32))
{
TCGv_i32 al = tcg_temp_new_i32();
TCGv_i32 ah = tcg_temp_new_i32();
TCGv_i32 bl = tcg_temp_new_i32();
TCGv_i32 bh = tcg_temp_new_i32();
tcg_gen_extr_i64_i32(al, ah, a64);
tcg_gen_extr_i64_i32(bl, bh, b64);
generate(al, al, bl);
generate(ah, ah, bh);
tcg_gen_concat_i32_i64(d64, al, ah);
tcg_temp_free_i32(al);
tcg_temp_free_i32(ah);
tcg_temp_free_i32(bl);
tcg_temp_free_i32(bh);
}
> case OE_RRR(V4ADD, 0, X0):
> case OE_RRR(V4ADD, 0, X1):
> - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
> + gen_helper_v4add(tdest, tsrca, tsrcb);
And then
gen_v4op(tdest, tsrca, tsrcb, tcg_gen_add_i32);
r~
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
2015-09-19 2:34 ` Richard Henderson
@ 2015-09-20 22:37 ` Chen Gang
2015-09-22 5:54 ` Chen Gang
1 sibling, 0 replies; 6+ messages in thread
From: Chen Gang @ 2015-09-20 22:37 UTC (permalink / raw)
To: Richard Henderson, peter.maydell; +Cc: qemu-devel
On 9/19/15 10:34, Richard Henderson wrote:
> On 09/18/2015 05:03 PM, gang.chen.5i5j@gmail.com wrote:
>> +uint64_t helper_v1add(uint64_t a, uint64_t b)
>> +{
>> + uint64_t r = 0;
>> + int i;
>> +
>> + for (i = 0; i < 64; i += 8) {
>> + int64_t ae = (int8_t)(a >> i);
>> + int64_t be = (int8_t)(b >> i);
>> + r |= ((ae + be) & 0xff) << i;
>> + }
>> + return r;
>> +}
>> +
>> +uint64_t helper_v2add(uint64_t a, uint64_t b)
>> +{
>> + uint64_t r = 0;
>> + int i;
>> +
>> + for (i = 0; i < 64; i += 16) {
>> + int64_t ae = (int16_t)(a >> i);
>> + int64_t be = (int16_t)(b >> i);
>> + r |= ((ae + be) & 0xffff) << i;
>> + }
>> + return r;
>> +}
>
> There's a trick for this that's more efficient for 4 or more elements per vector (i.e. good for v2 and v1, but not v4):
>
> a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
>
> a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
>
OK, thanks, for me, it is a good idea. :-)
>> +uint64_t helper_v4add(uint64_t a, uint64_t b)
>> +{
>> + uint64_t r = 0;
>> + int i;
>> +
>> + for (i = 0; i < 64; i += 32) {
>> + int64_t ae = (int32_t)(a >> i);
>> + int64_t be = (int32_t)(b >> i);
>> + r |= ((ae + be) & 0xffffffff) << i;
>> + }
>> + return r;
>> +}
>
> I should have mentioned this in the previous patch...
>
mm... maybe, but at least, I forgot.
> I think probably it would be best to open-code all, or most of, the v4 operations. Something like
>
> static void gen_v4op(TCGv d64, TCGv a64, TCGv b64,
> void (*generate)(TCGv_i32, TCGv_i32, TCGv_i32))
> {
> TCGv_i32 al = tcg_temp_new_i32();
> TCGv_i32 ah = tcg_temp_new_i32();
> TCGv_i32 bl = tcg_temp_new_i32();
> TCGv_i32 bh = tcg_temp_new_i32();
>
> tcg_gen_extr_i64_i32(al, ah, a64);
> tcg_gen_extr_i64_i32(bl, bh, b64);
> generate(al, al, bl);
> generate(ah, ah, bh);
> tcg_gen_concat_i32_i64(d64, al, ah);
>
> tcg_temp_free_i32(al);
> tcg_temp_free_i32(ah);
> tcg_temp_free_i32(bl);
> tcg_temp_free_i32(bh);
> }
>
>> case OE_RRR(V4ADD, 0, X0):
>> case OE_RRR(V4ADD, 0, X1):
>> - return TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
>> + gen_helper_v4add(tdest, tsrca, tsrcb);
>
> And then
>
> gen_v4op(tdest, tsrca, tsrcb, tcg_gen_add_i32);
>
OK, thanks. At least for me, what you said sounds reasonable.
Thanks.
--
Chen Gang (陈刚)
Open, share, and attitude like air, water, and life which God blessed
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
2015-09-19 2:34 ` Richard Henderson
2015-09-20 22:37 ` Chen Gang
@ 2015-09-22 5:54 ` Chen Gang
2015-09-22 14:45 ` Richard Henderson
1 sibling, 1 reply; 6+ messages in thread
From: Chen Gang @ 2015-09-22 5:54 UTC (permalink / raw)
To: Richard Henderson, gang.chen.5i5j, peter.maydell; +Cc: qemu-devel
On 2015年09月19日 10:34, Richard Henderson wrote:
>
> There's a trick for this that's more efficient for 4 or more elements
> per vector (i.e. good for v2 and v1, but not v4):
>
> a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
>
> a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
>
I think we need to use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...".
Thanks.
--
Chen Gang
Open, share, and attitude like air, water, and life which God blessed
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
2015-09-22 5:54 ` Chen Gang
@ 2015-09-22 14:45 ` Richard Henderson
2015-09-22 21:41 ` Chen Gang
0 siblings, 1 reply; 6+ messages in thread
From: Richard Henderson @ 2015-09-22 14:45 UTC (permalink / raw)
To: Chen Gang, gang.chen.5i5j, peter.maydell; +Cc: qemu-devel
On 09/21/2015 10:54 PM, Chen Gang wrote:
> On 2015年09月19日 10:34, Richard Henderson wrote:
>>
>> There's a trick for this that's more efficient for 4 or more elements
>> per vector (i.e. good for v2 and v1, but not v4):
>>
>> a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
>>
>> a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
>>
>
> For me, we need use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...".
No. What you did wrong was not use (a | 0x80808080).
r~
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions
2015-09-22 14:45 ` Richard Henderson
@ 2015-09-22 21:41 ` Chen Gang
0 siblings, 0 replies; 6+ messages in thread
From: Chen Gang @ 2015-09-22 21:41 UTC (permalink / raw)
To: Richard Henderson, gang.chen.5i5j, peter.maydell; +Cc: qemu-devel
On 9/22/15 22:45, Richard Henderson wrote:
> On 09/21/2015 10:54 PM, Chen Gang wrote:
>> On 2015年09月19日 10:34, Richard Henderson wrote:
>>>
>>> There's a trick for this that's more efficient for 4 or more elements
>>> per vector (i.e. good for v2 and v1, but not v4):
>>>
>>> a + b = ((a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)) ^ ((a ^ b) & 0x80808080)
>>>
>>> a - b = ((a | 0x80808080) - (b & 0x7f7f7f7f)) ^ ((a ^ ~b) & 0x80808080)
>>>
>>
>> For me, we need use "(a ^ b) & 0x80..." instead of "(a ^ ~b) & 0x80...".
>
> No. What you did wrong was not use (a | 0x80808080).
>
Oh, sorry. I shall send patch v3 for it. :-)
Thanks.
--
Chen Gang (陈刚)
Open, share, and attitude like air, water, and life which God blessed
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-09-22 21:39 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-19 0:03 [Qemu-devel] [PATCH] target-tilegx: Implement v*add and v*sub instructions gang.chen.5i5j
2015-09-19 2:34 ` Richard Henderson
2015-09-20 22:37 ` Chen Gang
2015-09-22 5:54 ` Chen Gang
2015-09-22 14:45 ` Richard Henderson
2015-09-22 21:41 ` Chen Gang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).