On Mon, Mar 23, 2026 at 7:16 AM Matheus Tavares Bernardino <matheus.bernardino@oss.qualcomm.com> wrote:
Add HVX IEEE bfloat16 (bf16) instructions:

Arithmetic operations:
- V6_vadd_sf_bf, V6_vsub_sf_bf: add/sub bf16 widening to sf output
- V6_vmpy_sf_bf: multiply bf16 widening to sf output
- V6_vmpy_sf_bf_acc: multiply-accumulate bf16 widening to sf output

Min/Max operations:
- V6_vmin_bf, V6_vmax_bf: bf16 min/max

Comparison operations:
- V6_vgtbf: greater-than compare
- V6_vgtbf_and, V6_vgtbf_or, V6_vgtbf_xor: predicate variants

Conversion operations:
- V6_vcvt_bf_sf: convert sf to bf16

Signed-off-by: Matheus Tavares Bernardino <matheus.bernardino@oss.qualcomm.com>
---
 target/hexagon/mmvec/kvx_ieee.h              | 36 +++++++++++
 target/hexagon/mmvec/macros.h                |  5 ++
 target/hexagon/mmvec/mmvec.h                 |  1 +
 target/hexagon/mmvec/kvx_ieee.c              |  3 +
 target/hexagon/imported/mmvec/encode_ext.def | 15 +++++
 target/hexagon/imported/mmvec/ext.idef       | 64 ++++++++++++++++++++
 6 files changed, 124 insertions(+)

diff --git a/target/hexagon/mmvec/kvx_ieee.h b/target/hexagon/mmvec/kvx_ieee.h
index 8a6816f6b3..eb670d4ec3 100644
--- a/target/hexagon/mmvec/kvx_ieee.h
+++ b/target/hexagon/mmvec/kvx_ieee.h
@@ -80,4 +80,40 @@ int16_t conv_hf_h(int16_t a, float_status *fp_status);
 int32_t conv_w_sf(uint32_t a, float_status *fp_status);
 int16_t conv_h_hf(uint16_t a, float_status *fp_status);

+/* IEEE BFloat instructions */
+
+#define fp_mult_sf_bf(A, B) \
+    fp_mult_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, &env->fp_status)
+#define fp_add_sf_bf(A, B) \
+    fp_add_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, &env->fp_status)
+#define fp_sub_sf_bf(A, B) \
+    fp_sub_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16, &env->fp_status)

Can we use the softfloat routine bfloat16_to_float32 instead of shifting by 16?
 
+
+uint32_t fp_mult_sf_bf_acc(uint16_t op1, uint16_t op2, uint32_t acc,
+                           float_status *fp_status);
+
+#define bf_to_sf(A) (((uint32_t)(A)) << 16)

Ditto: bfloat16_to_float32 instead of shifting by 16.
 
+
+#define fp_min_bf(A, B) ({ \
+    uint32_t _bf_res = fp_min_sf(bf_to_sf(A), bf_to_sf(B), &env->fp_status); \
+    (uint16_t)((_bf_res >> 16) & 0xffff); \

Can we use float32_to_bfloat16 here instead of the manual shift and mask?
 
+})
+
+#define fp_max_bf(A, B) ({ \
+    uint32_t _bf_res = fp_max_sf(bf_to_sf(A), bf_to_sf(B), &env->fp_status); \
+    (uint16_t)((_bf_res >> 16) & 0xffff); \

Ditto: float32_to_bfloat16 instead of the manual shift and mask.
 
+})
+
+static inline uint16_t sf_to_bf(int32_t A)
+{
+    uint32_t rslt = A;
+    if ((rslt & 0x1FFFF) == 0x08000) {
+        /* do not round up if exactly .5 and even already */
+    } else if ((rslt & 0x8000) == 0x8000) {
+        rslt += 0x8000; /* rounding to nearest number */
+    }
+    rslt = float32_is_any_nan(A) ? FP32_DEF_NAN : rslt;
+    return rslt >> 16;
+}

Can this hand-rolled rounding and NaN handling be replaced with float32_to_bfloat16?
 
+
 #endif
diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
index c342507d1a..b70996578e 100644
--- a/target/hexagon/mmvec/macros.h
+++ b/target/hexagon/mmvec/macros.h
@@ -25,6 +25,9 @@
 #include "accel/tcg/probe.h"
 #include "mmvec/kvx_ieee.h"

+#define fBFLOAT()
+#define fCVI_VX_NO_TMP_LD()
+
 #ifndef QEMU_GENERATE
 #define VdV      (*(MMVector *restrict)(VdV_void))
 #define VsV      (*(MMVector *restrict)(VsV_void))
@@ -366,4 +369,6 @@
     (int16_t)(A) > (int16_t)(B) : \
     float16_compare((A), (B), &env->fp_status) == float_relation_greater)

+#define fCMPGT_BF(A, B) fCMPGT_SF(((int)A) << 16, ((int)B) << 16)

Use bfloat16_to_float32 here as well, instead of shifting by 16.
 
+
 #endif
diff --git a/target/hexagon/mmvec/mmvec.h b/target/hexagon/mmvec/mmvec.h
index eaedfe0d6d..9d8d57c7c6 100644
--- a/target/hexagon/mmvec/mmvec.h
+++ b/target/hexagon/mmvec/mmvec.h
@@ -40,6 +40,7 @@ typedef union {
     int8_t    b[MAX_VEC_SIZE_BYTES / 1];
     int32_t  sf[MAX_VEC_SIZE_BYTES / 4];   /* single float (32-bit) */
     int16_t  hf[MAX_VEC_SIZE_BYTES / 2];   /* half float (16-bit) */
+    uint16_t bf[MAX_VEC_SIZE_BYTES / 2];   /* bfloat16 */

Consider using the bfloat16 type here instead of uint16_t.

Likewise, consider float32 for sf and float16 for hf.
 
 } MMVector;

 Thanks,
Taylor