Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Alex Bennée" <alex.bennee@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-devel@nongnu.org, f4bug@amsat.org
Subject: Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
Date: Tue, 26 Sep 2017 20:28:16 +0100	[thread overview]
Message-ID: <87shf9cl9r.fsf@linaro.org> (raw)
In-Reply-To: <20170916023417.14599-2-richard.henderson@linaro.org>


Richard Henderson <richard.henderson@linaro.org> writes:

> Nothing uses or enables them yet.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/tcg-op.h  |  26 +++++++
>  tcg/tcg-opc.h |  37 ++++++++++
>  tcg/tcg.h     |  34 +++++++++
>  tcg/tcg-op.c  | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg.c     |  77 ++++++++++++++++++-
>  tcg/README    |  46 ++++++++++++
>  6 files changed, 453 insertions(+), 1 deletion(-)
>
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 5d3278f243..b9b0b9f46f 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
>  void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
>  void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
>
> +void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
> +void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +
>  #if TARGET_LONG_BITS == 64
>  #define tcg_gen_movi_tl tcg_gen_movi_i64
>  #define tcg_gen_mov_tl tcg_gen_mov_i64
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index 956fb1e9f3..8200184fa9 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
>  DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
>      TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
>
> +/* Host vector support.  */
> +
> +#define IMPLVEC  \
> +    IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
> +
> +DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
> +
> +/* ??? Simple, but perhaps dupiN would be more descriptive.  */
> +DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
> +
> +DEF(ld_vec, 1, 1, 2, IMPLVEC)
> +DEF(ldz_vec, 1, 1, 3, IMPLVEC)
> +DEF(st_vec, 0, 2, 2, IMPLVEC)
> +
> +DEF(add8_vec, 1, 2, 1, IMPLVEC)
> +DEF(add16_vec, 1, 2, 1, IMPLVEC)
> +DEF(add32_vec, 1, 2, 1, IMPLVEC)
> +DEF(add64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(sub8_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub16_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub32_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +
> +DEF(and_vec, 1, 2, 1, IMPLVEC)
> +DEF(or_vec, 1, 2, 1, IMPLVEC)
> +DEF(xor_vec, 1, 2, 1, IMPLVEC)
> +DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
> +DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
> +DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
> +
>  #undef TLADDR_ARGS
>  #undef DATA64_ARGS
>  #undef IMPL
>  #undef IMPL64
> +#undef IMPLVEC
>  #undef DEF
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 25662c36d4..7cd356e87f 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
>  # error "Missing unsigned widening multiply"
>  #endif
>
> +#ifndef TCG_TARGET_HAS_v64
> +#define TCG_TARGET_HAS_v64              0
> +#define TCG_TARGET_HAS_v128             0
> +#define TCG_TARGET_HAS_v256             0
> +#define TCG_TARGET_HAS_neg_vec          0
> +#define TCG_TARGET_HAS_not_vec          0
> +#define TCG_TARGET_HAS_andc_vec         0
> +#define TCG_TARGET_HAS_orc_vec          0
> +#endif
> +
>  #ifndef TARGET_INSN_START_EXTRA_WORDS
>  # define TARGET_INSN_START_WORDS 1
>  #else
> @@ -249,6 +259,11 @@ typedef struct TCGPool {
>  typedef enum TCGType {
>      TCG_TYPE_I32,
>      TCG_TYPE_I64,
> +
> +    TCG_TYPE_V64,
> +    TCG_TYPE_V128,
> +    TCG_TYPE_V256,
> +
>      TCG_TYPE_COUNT, /* number of different types */
>
>      /* An alias for the size of the host register.  */
> @@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
>      * TCGv_i32 : 32 bit integer type
>      * TCGv_i64 : 64 bit integer type
>      * TCGv_ptr : a host pointer type
> +    * TCGv_vec : a host vector type; the exact size is not exposed
> +                 to the CPU front-end code.

Isn't this a guest vector type (which is pointed to by a host pointer)?

>      * TCGv : an integer type the same size as target_ulong
>               (an alias for either TCGv_i32 or TCGv_i64)
>     The compiler's type checking will complain if you mix them
> @@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
>  typedef struct TCGv_i32_d *TCGv_i32;
>  typedef struct TCGv_i64_d *TCGv_i64;
>  typedef struct TCGv_ptr_d *TCGv_ptr;
> +typedef struct TCGv_vec_d *TCGv_vec;
>  typedef TCGv_ptr TCGv_env;
>  #if TARGET_LONG_BITS == 32
>  #define TCGv TCGv_i32
> @@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
>      return (TCGv_ptr)i;
>  }
>
> +static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
> +{
> +    return (TCGv_vec)i;
> +}
> +
>  static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
>  {
>      return (intptr_t)t;
> @@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>      return (intptr_t)t;
>  }
>
> +static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
> +{
> +    return (intptr_t)t;
> +}
> +
>  #if TCG_TARGET_REG_BITS == 32
>  #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
>  #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
> @@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>  #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
>  #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
>  #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
> +#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
>
>  /* Dummy definition to avoid compiler warnings.  */
>  #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
>  #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
>  #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
> +#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
>
>  #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
>  #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
>  #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
> +#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
>
>  /* call flags */
>  /* Helper does not read globals (either directly or through an exception). It
> @@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
>
>  TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
>  TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
> +TCGv_vec tcg_temp_new_vec(TCGType type);
> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
>
>  void tcg_temp_free_i32(TCGv_i32 arg);
>  void tcg_temp_free_i64(TCGv_i64 arg);
> +void tcg_temp_free_vec(TCGv_vec arg);
>
>  static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
>                                                const char *name)
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 688d91755b..50b3177e5f 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
>  GEN_ATOMIC_HELPER(xchg, mov2, 0)
>
>  #undef GEN_ATOMIC_HELPER
> +
> +static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg ai = GET_TCGV_VEC(a);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGTemp *at = &tcg_ctx.temps[ai];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(at->base_type == type);
> +    tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
> +}
> +
> +static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg ai = GET_TCGV_VEC(a);
> +    TCGArg bi = GET_TCGV_VEC(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGTemp *at = &tcg_ctx.temps[ai];
> +    TCGTemp *bt = &tcg_ctx.temps[bi];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(at->base_type == type);
> +    tcg_debug_assert(bt->base_type == type);
> +    tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (!TCGV_EQUAL_VEC(r, a)) {
> +        tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
> +    }
> +}
> +
> +void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(a == 0 || a == -1);
> +    tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +/* Load data into a vector R from B+O using TYPE.  If R is wider than TYPE,
> +   fill the high bits with zeros.  */
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType btype = rt->base_type;
> +
> +    if (type < btype) {
> +        tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
> +                    type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
> +    } else {
> +        tcg_debug_assert(type == btype);
> +        tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> +    }
> +}
> +
> +/* Store data from vector R into B+O using TYPE.  If R is wider than TYPE,
> +   store only the low bits.  */
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType btype = rt->base_type;
> +
> +    tcg_debug_assert(type <= btype);
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
> +}
> +
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
> +}
> +
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
> +}
> +
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
> +}
> +
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
> +}
> +
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
> +}
> +
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
> +}
> +
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    if (TCG_TARGET_HAS_andc_vec) {
> +        tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_not_vec(t, b);
> +        tcg_gen_and_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    if (TCG_TARGET_HAS_orc_vec) {
> +        tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_not_vec(t, b);
> +        tcg_gen_or_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_not_vec) {
> +        tcg_gen_op2_vec(INDEX_op_orc_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, -1);
> +        tcg_gen_xor_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub8_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub16_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub32_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub64_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index dff9999bc6..a4d55efdf0 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
>  static bool tcg_out_ldst_finalize(TCGContext *s);
>  #endif
>
> -static TCGRegSet tcg_target_available_regs[2];
> +static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
>  static TCGRegSet tcg_target_call_clobber_regs;
>
>  #if TCG_TARGET_INSN_UNIT_SIZE == 1
> @@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
>      return MAKE_TCGV_I64(idx);
>  }
>
> +TCGv_vec tcg_temp_new_vec(TCGType type)
> +{
> +    int idx;
> +
> +#ifdef CONFIG_DEBUG_TCG
> +    switch (type) {
> +    case TCG_TYPE_V64:
> +        assert(TCG_TARGET_HAS_v64);
> +        break;
> +    case TCG_TYPE_V128:
> +        assert(TCG_TARGET_HAS_v128);
> +        break;
> +    case TCG_TYPE_V256:
> +        assert(TCG_TARGET_HAS_v256);
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +#endif
> +
> +    idx = tcg_temp_new_internal(type, 0);
> +    return MAKE_TCGV_VEC(idx);
> +}
> +

A one-line comment wouldn't go amiss here. This looks like we are
allocating a new temp of the same type as an existing temp?

> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
> +{
> +    TCGContext *s = &tcg_ctx;
> +    int idx = GET_TCGV_VEC(match);
> +    TCGTemp *ts;
> +
> +    tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
> +    ts = &s->temps[idx];
> +    tcg_debug_assert(ts->temp_allocated != 0);
> +
> +    idx = tcg_temp_new_internal(ts->base_type, 0);
> +    return MAKE_TCGV_VEC(idx);
> +}
> +
>  static void tcg_temp_free_internal(int idx)
>  {
>      TCGContext *s = &tcg_ctx;
> @@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
>      tcg_temp_free_internal(GET_TCGV_I64(arg));
>  }
>
> +void tcg_temp_free_vec(TCGv_vec arg)
> +{
> +    tcg_temp_free_internal(GET_TCGV_VEC(arg));
> +}
> +
>  TCGv_i32 tcg_const_i32(int32_t val)
>  {
>      TCGv_i32 t0;
> @@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
>     Test the runtime variable that controls each opcode.  */
>  bool tcg_op_supported(TCGOpcode op)
>  {
> +    const bool have_vec
> +        = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
> +
>      switch (op) {
>      case INDEX_op_discard:
>      case INDEX_op_set_label:
> @@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
>      case INDEX_op_mulsh_i64:
>          return TCG_TARGET_HAS_mulsh_i64;
>
> +    case INDEX_op_mov_vec:
> +    case INDEX_op_movi_vec:
> +    case INDEX_op_ld_vec:
> +    case INDEX_op_ldz_vec:
> +    case INDEX_op_st_vec:
> +    case INDEX_op_add8_vec:
> +    case INDEX_op_add16_vec:
> +    case INDEX_op_add32_vec:
> +    case INDEX_op_add64_vec:
> +    case INDEX_op_sub8_vec:
> +    case INDEX_op_sub16_vec:
> +    case INDEX_op_sub32_vec:
> +    case INDEX_op_sub64_vec:
> +    case INDEX_op_and_vec:
> +    case INDEX_op_or_vec:
> +    case INDEX_op_xor_vec:
> +        return have_vec;
> +    case INDEX_op_not_vec:
> +        return have_vec && TCG_TARGET_HAS_not_vec;
> +    case INDEX_op_neg8_vec:
> +    case INDEX_op_neg16_vec:
> +    case INDEX_op_neg32_vec:
> +    case INDEX_op_neg64_vec:
> +        return have_vec && TCG_TARGET_HAS_neg_vec;
> +    case INDEX_op_andc_vec:
> +        return have_vec && TCG_TARGET_HAS_andc_vec;
> +    case INDEX_op_orc_vec:
> +        return have_vec && TCG_TARGET_HAS_orc_vec;
> +
>      case NB_OPS:
>          break;
>      }
> diff --git a/tcg/README b/tcg/README
> index 03bfb6acd4..3bf3af67db 100644
> --- a/tcg/README
> +++ b/tcg/README
> @@ -503,6 +503,52 @@ of the memory access.
>  For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
>  64-bit memory access specified in flags.
>
> +********* Host vector operations
> +
> +All of the vector ops have a final constant argument that specifies the
> +length of the vector operation LEN as 64 << LEN bits.

That doesn't scan well. So would a 4-lane operation be encoded as 64 <<
4? Is this because we are using the bottom bits for something?

> +
> +* mov_vec   v0, v1, len
> +* ld_vec    v0, t1, len
> +* st_vec    v0, t1, len
> +
> +  Move, load and store.
> +
> +* movi_vec  v0, c, len
> +
> +  Copy C across the entire vector.
> +  At present the only supported values for C are 0 and -1.

I guess this is why the size is unimportant? This is for clearing or
setting the whole of the vector? What does len mean in this case?

> +
> +* add8_vec    v0, v1, v2, len
> +* add16_vec   v0, v1, v2, len
> +* add32_vec   v0, v1, v2, len
> +* add64_vec   v0, v1, v2, len
> +
> +  v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
> +
> +* sub8_vec    v0, v1, v2, len
> +* sub16_vec   v0, v1, v2, len
> +* sub32_vec   v0, v1, v2, len
> +* sub64_vec   v0, v1, v2, len
> +
> +  Similarly, v0 = v1 - v2.
> +
> +* neg8_vec    v0, v1, len
> +* neg16_vec   v0, v1, len
> +* neg32_vec   v0, v1, len
> +* neg64_vec   v0, v1, len
> +
> +  Similarly, v0 = -v1.
> +
> +* and_vec     v0, v1, v2, len
> +* or_vec      v0, v1, v2, len
> +* xor_vec     v0, v1, v2, len
> +* andc_vec    v0, v1, v2, len
> +* orc_vec     v0, v1, v2, len
> +* not_vec     v0, v1, len
> +
> +  Similarly, logical operations.

Similarly, logical operations with and without complement?

> +
>  *********
>
>  Note 1: Some shortcuts are defined when the last operand is known to be


--
Alex Bennée

next prev parent reply	other threads:[~2017-09-26 19:28 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
2017-09-26 19:28   ` Alex Bennée [this message]
2017-09-27 16:18     ` Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
2017-09-26 22:31   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
2017-09-26 22:33   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
2017-09-26 23:12   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
2017-09-16  2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-26 22:58 ` no-reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87shf9cl9r.fsf@linaro.org \
    --to=alex.bennee@linaro.org \
    --cc=f4bug@amsat.org \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.