On Mon, Mar 23, 2026 at 7:16 AM Matheus Tavares Bernardino < matheus.bernardino@oss.qualcomm.com> wrote: > Add HVX IEEE bfloat16 (bf16) instructions: > > Arithmetic operations: > - V6_vadd_sf_bf, V6_vsub_sf_bf: add/sub bf16 widening to sf output > - V6_vmpy_sf_bf: multiply bf16 widening to sf output > - V6_vmpy_sf_bf_acc: multiply-accumulate bf16 widening to sf output > > Min/Max operations: > - V6_vmin_bf, V6_vmax_bf: bf16 min/max > > Comparison operations: > - V6_vgtbf: greater-than compare > - V6_vgtbf_and, V6_vgtbf_or, V6_vgtbf_xor: predicate variants > > Conversion operations: > - V6_vcvt_bf_sf: convert sf to bf16 > > Signed-off-by: Matheus Tavares Bernardino < > matheus.bernardino@oss.qualcomm.com> > --- > target/hexagon/mmvec/kvx_ieee.h | 36 +++++++++++ > target/hexagon/mmvec/macros.h | 5 ++ > target/hexagon/mmvec/mmvec.h | 1 + > target/hexagon/mmvec/kvx_ieee.c | 3 + > target/hexagon/imported/mmvec/encode_ext.def | 15 +++++ > target/hexagon/imported/mmvec/ext.idef | 64 ++++++++++++++++++++ > 6 files changed, 124 insertions(+) > > diff --git a/target/hexagon/mmvec/kvx_ieee.h > b/target/hexagon/mmvec/kvx_ieee.h > index 8a6816f6b3..eb670d4ec3 100644 > --- a/target/hexagon/mmvec/kvx_ieee.h > +++ b/target/hexagon/mmvec/kvx_ieee.h > @@ -80,4 +80,40 @@ int16_t conv_hf_h(int16_t a, float_status *fp_status); > int32_t conv_w_sf(uint32_t a, float_status *fp_status); > int16_t conv_h_hf(uint16_t a, float_status *fp_status); > > +/* IEEE BFloat instructions */ > + > +#define fp_mult_sf_bf(A, B) \ > + fp_mult_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, > &env->fp_status) > +#define fp_add_sf_bf(A, B) \ > + fp_add_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, > &env->fp_status) > +#define fp_sub_sf_bf(A, B) \ > + fp_sub_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, > &env->fp_status) > Can we use softfloat routine bfloat16_to_float32 instead of shifting by 16? 
> + > +uint32_t fp_mult_sf_bf_acc(uint16_t op1, uint16_t op2, uint32_t acc, > + float_status *fp_status); > + > +#define bf_to_sf(A) (((uint32_t)(A)) << 16) > Ditto > + > +#define fp_min_bf(A, B) ({ \ > + uint32_t _bf_res = fp_min_sf(bf_to_sf(A), bf_to_sf(B), > &env->fp_status); \ > + (uint16_t)((_bf_res >> 16) & 0xffff); \ > float32_to_bfloat16 > +}) > + > +#define fp_max_bf(A, B) ({ \ > + uint32_t _bf_res = fp_max_sf(bf_to_sf(A), bf_to_sf(B), > &env->fp_status); \ > + (uint16_t)((_bf_res >> 16) & 0xffff); \ > Ditto > +}) > + > +static inline uint16_t sf_to_bf(int32_t A) > +{ > + uint32_t rslt = A; > + if ((rslt & 0x1FFFF) == 0x08000) { > + /* do not round up if exactly .5 and even already */ > + } else if ((rslt & 0x8000) == 0x8000) { > + rslt += 0x8000; /* rounding to nearest number */ > + } > + rslt = float32_is_any_nan(A) ? FP32_DEF_NAN : rslt; > + return rslt >> 16; > +} > float32_to_bfloat16 > + > #endif > diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h > index c342507d1a..b70996578e 100644 > --- a/target/hexagon/mmvec/macros.h > +++ b/target/hexagon/mmvec/macros.h > @@ -25,6 +25,9 @@ > #include "accel/tcg/probe.h" > #include "mmvec/kvx_ieee.h" > > +#define fBFLOAT() > +#define fCVI_VX_NO_TMP_LD() > + > #ifndef QEMU_GENERATE > #define VdV (*(MMVector *restrict)(VdV_void)) > #define VsV (*(MMVector *restrict)(VsV_void)) > @@ -366,4 +369,6 @@ > (int16_t)(A) > (int16_t)(B) : \ > float16_compare((A), (B), &env->fp_status) == float_relation_greater) > > +#define fCMPGT_BF(A, B) fCMPGT_SF(((int)A) << 16, ((int)B) << 16) > bfloat16_to_float32 > + > #endif > diff --git a/target/hexagon/mmvec/mmvec.h b/target/hexagon/mmvec/mmvec.h > index eaedfe0d6d..9d8d57c7c6 100644 > --- a/target/hexagon/mmvec/mmvec.h > +++ b/target/hexagon/mmvec/mmvec.h > @@ -40,6 +40,7 @@ typedef union { > int8_t b[MAX_VEC_SIZE_BYTES / 1]; > int32_t sf[MAX_VEC_SIZE_BYTES / 4]; /* single float (32-bit) */ > int16_t hf[MAX_VEC_SIZE_BYTES / 2]; /* half float (16-bit) */ > 
+ uint16_t bf[MAX_VEC_SIZE_BYTES / 2]; /* bfloat16 */ > Consider using the softfloat bfloat16 type here. Also consider float32 for sf and float16 for hf. > } MMVector; Thanks, Taylor