From: "Alex Bennée" <alex.bennee@linaro.org>
To: qemu-arm@nongnu.org
Cc: qemu-devel@nongnu.org, richard.henderson@linaro.org,
"Alex Bennée" <alex.bennee@linaro.org>,
"Peter Maydell" <peter.maydell@linaro.org>
Subject: [Qemu-devel] [PATCH v4 06/31] arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)
Date: Tue, 27 Feb 2018 14:38:27 +0000
Message-ID: <20180227143852.11175-7-alex.bennee@linaro.org>
In-Reply-To: <20180227143852.11175-1-alex.bennee@linaro.org>
This implements the half-precision variants of the across-vector
reduction operations. It involves refactoring the reduction code so
that it more closely follows the ordering of the ARM ARM Reduce()
pseudocode (and handles 8-element reductions).
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
--
v1
- dropped the advsimd_2a stuff
v2
- fixed up checkpatch
v3
  - added TCG_CALL_NO_RWG to helper definitions
---
target/arm/helper-a64.c | 18 ++++++
target/arm/helper-a64.h | 4 ++
target/arm/translate-a64.c | 142 ++++++++++++++++++++++++++++-----------------
3 files changed, 110 insertions(+), 54 deletions(-)
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 10e08bdc1f..fddd5d242b 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -572,3 +572,21 @@ uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
{
return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
}
+
+/*
+ * AdvSIMD half-precision
+ */
+
+#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
+
+#define ADVSIMD_HALFOP(name) \
+float16 ADVSIMD_HELPER(name, h)(float16 a, float16 b, void *fpstp) \
+{ \
+ float_status *fpst = fpstp; \
+ return float16_ ## name(a, b, fpst); \
+}
+
+ADVSIMD_HALFOP(min)
+ADVSIMD_HALFOP(max)
+ADVSIMD_HALFOP(minnum)
+ADVSIMD_HALFOP(maxnum)
diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h
index 85d86741db..cb2a73124d 100644
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@@ -48,3 +48,7 @@ DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
i64, env, i64, i64, i64)
+DEF_HELPER_FLAGS_3(advsimd_maxh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
+DEF_HELPER_FLAGS_3(advsimd_minh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
+DEF_HELPER_FLAGS_3(advsimd_maxnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
+DEF_HELPER_FLAGS_3(advsimd_minnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 91c2b8ed11..ebaf4571ac 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -5741,26 +5741,75 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
tcg_temp_free_i64(tcg_resh);
}
-static void do_minmaxop(DisasContext *s, TCGv_i32 tcg_elt1, TCGv_i32 tcg_elt2,
- int opc, bool is_min, TCGv_ptr fpst)
-{
- /* Helper function for disas_simd_across_lanes: do a single precision
- * min/max operation on the specified two inputs,
- * and return the result in tcg_elt1.
- */
- if (opc == 0xc) {
- if (is_min) {
- gen_helper_vfp_minnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
- } else {
- gen_helper_vfp_maxnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
- }
+/*
+ * do_reduction_op helper
+ *
+ * This mirrors the Reduce() pseudocode in the ARM ARM. It is
+ * important for correct NaN propagation that we do these
+ * operations in exactly the order specified by the pseudocode.
+ *
+ * This is a recursive function, TCG temps should be freed by the
+ * calling function once it is done with the values.
+ */
+static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
+ int esize, int size, int vmap, TCGv_ptr fpst)
+{
+ if (esize == size) {
+ int element;
+ TCGMemOp msize = esize == 16 ? MO_16 : MO_32;
+ TCGv_i32 tcg_elem;
+
+ /* We should have one register left here */
+ assert(ctpop8(vmap) == 1);
+ element = ctz32(vmap);
+ assert(element < 8);
+
+ tcg_elem = tcg_temp_new_i32();
+ read_vec_element_i32(s, tcg_elem, rn, element, msize);
+ return tcg_elem;
} else {
- assert(opc == 0xf);
- if (is_min) {
- gen_helper_vfp_mins(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
- } else {
- gen_helper_vfp_maxs(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+ int bits = size / 2;
+ int shift = ctpop8(vmap) / 2;
+ int vmap_lo = (vmap >> shift) & vmap;
+ int vmap_hi = (vmap & ~vmap_lo);
+ TCGv_i32 tcg_hi, tcg_lo, tcg_res;
+
+ tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
+ tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
+ tcg_res = tcg_temp_new_i32();
+
+ switch (fpopcode) {
+ case 0x0c: /* fmaxnmv half-precision */
+ gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x0f: /* fmaxv half-precision */
+ gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x1c: /* fminnmv half-precision */
+ gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x1f: /* fminv half-precision */
+ gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x2c: /* fmaxnmv */
+ gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x2f: /* fmaxv */
+ gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x3c: /* fminnmv */
+ gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ case 0x3f: /* fminv */
+ gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst);
+ break;
+ default:
+ g_assert_not_reached();
}
+
+ tcg_temp_free_i32(tcg_hi);
+ tcg_temp_free_i32(tcg_lo);
+ return tcg_res;
}
}
@@ -5802,16 +5851,21 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
break;
case 0xc: /* FMAXNMV, FMINNMV */
case 0xf: /* FMAXV, FMINV */
- if (!is_u || !is_q || extract32(size, 0, 1)) {
- unallocated_encoding(s);
- return;
- }
- /* Bit 1 of size field encodes min vs max, and actual size is always
- * 32 bits: adjust the size variable so following code can rely on it
+ /* Bit 1 of size field encodes min vs max and the actual size
+ * depends on the encoding of the U bit. If not set (and FP16
+ * enabled) then we do half-precision float instead of single
+ * precision.
*/
is_min = extract32(size, 1, 1);
is_fp = true;
- size = 2;
+ if (!is_u && arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
+ size = 1;
+ } else if (!is_u || !is_q || extract32(size, 0, 1)) {
+ unallocated_encoding(s);
+ return;
+ } else {
+ size = 2;
+ }
break;
default:
unallocated_encoding(s);
@@ -5868,38 +5922,18 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
}
} else {
- /* Floating point ops which work on 32 bit (single) intermediates.
+ /* Floating point vector reduction ops which work across 32
+ * bit (single) or 16 bit (half-precision) intermediates.
* Note that correct NaN propagation requires that we do these
* operations in exactly the order specified by the pseudocode.
*/
- TCGv_i32 tcg_elt1 = tcg_temp_new_i32();
- TCGv_i32 tcg_elt2 = tcg_temp_new_i32();
- TCGv_i32 tcg_elt3 = tcg_temp_new_i32();
- TCGv_ptr fpst = get_fpstatus_ptr(false);
-
- assert(esize == 32);
- assert(elements == 4);
-
- read_vec_element(s, tcg_elt, rn, 0, MO_32);
- tcg_gen_extrl_i64_i32(tcg_elt1, tcg_elt);
- read_vec_element(s, tcg_elt, rn, 1, MO_32);
- tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
-
- do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
-
- read_vec_element(s, tcg_elt, rn, 2, MO_32);
- tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
- read_vec_element(s, tcg_elt, rn, 3, MO_32);
- tcg_gen_extrl_i64_i32(tcg_elt3, tcg_elt);
-
- do_minmaxop(s, tcg_elt2, tcg_elt3, opcode, is_min, fpst);
-
- do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
-
- tcg_gen_extu_i32_i64(tcg_res, tcg_elt1);
- tcg_temp_free_i32(tcg_elt1);
- tcg_temp_free_i32(tcg_elt2);
- tcg_temp_free_i32(tcg_elt3);
+ TCGv_ptr fpst = get_fpstatus_ptr(size == MO_16);
+ int fpopcode = opcode | is_min << 4 | is_u << 5;
+ int vmap = (1 << elements) - 1;
+ TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize,
+ (is_q ? 128 : 64), vmap, fpst);
+ tcg_gen_extu_i32_i64(tcg_res, tcg_res32);
+ tcg_temp_free_i32(tcg_res32);
tcg_temp_free_ptr(fpst);
}
--
2.15.1