* [PATCH v3 0/2] [RISCV/RVV] Generate strided vector loads/stores with tcg nodes. @ 2025-08-16 0:56 Chao Liu 2025-08-16 0:56 ` [PATCH v3 1/2] " Chao Liu 2025-08-16 0:56 ` [PATCH v3 2/2] tests/tcg/riscv64: Add test for vlsseg8e32 instruction Chao Liu 0 siblings, 2 replies; 5+ messages in thread From: Chao Liu @ 2025-08-16 0:56 UTC (permalink / raw) To: paolo.savini, ebiggers, dbarboza, palmer, alistair.francis, liwei1518, zhiwei_liu Cc: qemu-riscv, qemu-devel, Chao Liu Hi Paolo, Eric, Daniel, patch v3 changes: - Fix the get_log2 function: https://lore.kernel.org/qemu-riscv/cover.1755287531.git.chao.liu@yeah.net/T/#t - Add test for vlsseg8e32 instruction. - Rebase on top of the latest master. patch v2 changes: - Split the TCG node emulation of the complex strided load/store operation into two separate functions to simplify the implementation: https://lore.kernel.org/qemu-riscv/20250312155547.289642-1-paolo.savini@embecosm.com/ Best regards, Chao Chao Liu (2): Generate strided vector loads/stores with tcg nodes. tests/tcg/riscv64: Add test for vlsseg8e32 instruction target/riscv/insn_trans/trans_rvv.c.inc | 326 ++++++++++++++++++---- tests/tcg/riscv64/Makefile.softmmu-target | 8 +- tests/tcg/riscv64/test-vlsseg8e32.S | 107 +++++++ 3 files changed, 389 insertions(+), 52 deletions(-) create mode 100644 tests/tcg/riscv64/test-vlsseg8e32.S -- 2.50.1 ^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v3 1/2] Generate strided vector loads/stores with tcg nodes. 2025-08-16 0:56 [PATCH v3 0/2] [RISCV/RVV] Generate strided vector loads/stores with tcg nodes Chao Liu @ 2025-08-16 0:56 ` Chao Liu 2025-08-16 6:52 ` Richard Henderson 2025-08-16 0:56 ` [PATCH v3 2/2] tests/tcg/riscv64: Add test for vlsseg8e32 instruction Chao Liu 1 sibling, 1 reply; 5+ messages in thread From: Chao Liu @ 2025-08-16 0:56 UTC (permalink / raw) To: paolo.savini, ebiggers, dbarboza, palmer, alistair.francis, liwei1518, zhiwei_liu Cc: qemu-riscv, qemu-devel, Chao Liu This commit improves the performance of QEMU when emulating strided vector loads and stores by substituting the call for the helper function with the generation of equivalent TCG operations. Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> Signed-off-by: Chao Liu <chao.liu@yeah.net> Tested-by: Eric Biggers <ebiggers@kernel.org> Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> --- target/riscv/insn_trans/trans_rvv.c.inc | 326 ++++++++++++++++++++---- 1 file changed, 276 insertions(+), 50 deletions(-) diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index 71f98fb350..b5d74b0ec9 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -864,32 +864,289 @@ GEN_VEXT_TRANS(vlm_v, MO_8, vlm_v, ld_us_mask_op, ld_us_mask_check) GEN_VEXT_TRANS(vsm_v, MO_8, vsm_v, st_us_mask_op, st_us_mask_check) /* - *** stride load and store + * MAXSZ returns the maximum vector size can be operated in bytes, + * which is used in GVEC IR when vl_eq_vlmax flag is set to true + * to accelerate vector operation. + */ +static inline uint32_t MAXSZ(DisasContext *s) +{ + int max_sz = s->cfg_ptr->vlenb << 3; + return max_sz >> (3 - s->lmul); +} + +static inline uint32_t get_log2(uint32_t a) +{ + uint32_t i = 0; + if (a == 0) { + return i; + } + for (; a > 1;) { + a >>= 1; + i++; + } + return i; +} + +typedef void gen_tl_ldst(TCGv, TCGv_ptr, tcg_target_long); + +/* + * Simulate the strided load/store main loop: + * + * for (i = env->vstart; i < env->vl; env->vstart = ++i) { + * k = 0; + * while (k < nf) { + * if (!vm && !vext_elem_mask(v0, i)) { + * vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz, + * (i + k * max_elems + 1) * esz); + * k++; + * continue; + * } + * target_ulong addr = base + stride * i + (k << log2_esz); + * ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); + * k++; + * } + * } + */ +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1, + uint32_t rs2, uint32_t vm, uint32_t nf, + gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn, + bool is_load) +{ + TCGv addr = tcg_temp_new(); + TCGv base = get_gpr(s, rs1, EXT_NONE); + TCGv stride = get_gpr(s, rs2, EXT_NONE); + + TCGv i = tcg_temp_new(); + TCGv i_esz = tcg_temp_new(); + TCGv k = tcg_temp_new(); + TCGv k_esz = tcg_temp_new(); + TCGv k_max = tcg_temp_new(); + TCGv mask = tcg_temp_new(); + TCGv mask_offs = tcg_temp_new(); + TCGv mask_offs_64 = tcg_temp_new(); + TCGv mask_elem = tcg_temp_new(); + TCGv mask_offs_rem = tcg_temp_new(); + TCGv vreg = tcg_temp_new(); + TCGv dest_offs = tcg_temp_new(); + TCGv stride_offs = tcg_temp_new(); + + uint32_t max_elems = MAXSZ(s) >> s->sew; + + TCGLabel *start = gen_new_label(); + TCGLabel *end = gen_new_label(); + TCGLabel *start_k = gen_new_label(); + TCGLabel *inc_k = gen_new_label(); + TCGLabel *end_k = gen_new_label(); + + MemOp atomicity = MO_ATOM_NONE; + if (s->sew == 0) { + atomicity = MO_ATOM_NONE; + } else { + atomicity = 
MO_ATOM_IFALIGN_PAIR; + } + + mark_vs_dirty(s); + + tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0)); + + /* Start of outer loop. */ + tcg_gen_mov_tl(i, cpu_vstart); + gen_set_label(start); + tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end); + tcg_gen_shli_tl(i_esz, i, s->sew); + /* Start of inner loop. */ + tcg_gen_movi_tl(k, 0); + gen_set_label(start_k); + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k); + /* + * If we are in mask agnostic regime and the operation is not unmasked we + * set the inactive elements to 1. + */ + if (!vm && s->vma) { + TCGLabel *active_element = gen_new_label(); + /* (i + k * max_elems) * esz */ + tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew)); + tcg_gen_add_tl(mask_offs, mask_offs, i_esz); + + /* + * Check whether the i bit of the mask is 0 or 1. + * + * static inline int vext_elem_mask(void *v0, int index) + * { + * int idx = index / 64; + * int pos = index % 64; + * return (((uint64_t *)v0)[idx] >> pos) & 1; + * } + */ + tcg_gen_shri_tl(mask_offs_64, mask_offs, 3); + tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask); + tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0); + tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8)); + tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem); + tcg_gen_andi_tl(mask_elem, mask_elem, 1); + tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), + active_element); + /* + * Set masked-off elements in the destination vector register to 1s. + * Store instructions simply skip this bit as memory ops access memory + * only for active elements. + */ + if (is_load) { + tcg_gen_shli_tl(mask_offs, mask_offs, s->sew); + tcg_gen_add_tl(mask_offs, mask_offs, dest); + st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0); + } + tcg_gen_br(inc_k); + gen_set_label(active_element); + } + /* + * The element is active, calculate the address with stride: + * target_ulong addr = base + stride * i + (k << log2_esz); + */ + tcg_gen_mul_tl(stride_offs, stride, i); + tcg_gen_shli_tl(k_esz, k, s->sew); + tcg_gen_add_tl(stride_offs, stride_offs, k_esz); + tcg_gen_add_tl(addr, base, stride_offs); + /* Calculate the offset in the dst/src vector register. */ + tcg_gen_shli_tl(k_max, k, get_log2(max_elems)); + tcg_gen_add_tl(dest_offs, i, k_max); + tcg_gen_shli_tl(dest_offs, dest_offs, s->sew); + tcg_gen_add_tl(dest_offs, dest_offs, dest); + if (is_load) { + tcg_gen_qemu_ld_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); + st_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); + } else { + ld_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); + tcg_gen_qemu_st_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); + } + /* + * We don't execute the load/store above if the element was inactive. + * We jump instead directly to incrementing k and continuing the loop. + */ + if (!vm && s->vma) { + gen_set_label(inc_k); + } + tcg_gen_addi_tl(k, k, 1); + tcg_gen_br(start_k); + /* End of the inner loop. */ + gen_set_label(end_k); + + tcg_gen_addi_tl(i, i, 1); + tcg_gen_mov_tl(cpu_vstart, i); + tcg_gen_br(start); + + /* End of the outer loop. 
*/ + gen_set_label(end); + + return; +} + + +/* + * Set the tail bytes of the strided loads/stores to 1: + * + * for (k = 0; k < nf; ++k) { + * cnt = (k * max_elems + vl) * esz; + * tot = (k * max_elems + max_elems) * esz; + * for (i = cnt; i < tot; i += esz) { + * store_1s(-1, vd[vl+i]); + * } + * } */ -typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv, - TCGv, TCGv_env, TCGv_i32); +static void gen_ldst_stride_tail_loop(DisasContext *s, TCGv dest, uint32_t nf, + gen_tl_ldst *st_fn) +{ + TCGv i = tcg_temp_new(); + TCGv k = tcg_temp_new(); + TCGv tail_cnt = tcg_temp_new(); + TCGv tail_tot = tcg_temp_new(); + TCGv tail_addr = tcg_temp_new(); + + TCGLabel *start = gen_new_label(); + TCGLabel *end = gen_new_label(); + TCGLabel *start_i = gen_new_label(); + TCGLabel *end_i = gen_new_label(); + + uint32_t max_elems_b = MAXSZ(s); + uint32_t esz = 1 << s->sew; + + /* Start of the outer loop. */ + tcg_gen_movi_tl(k, 0); + tcg_gen_shli_tl(tail_cnt, cpu_vl, s->sew); + tcg_gen_movi_tl(tail_tot, max_elems_b); + tcg_gen_add_tl(tail_addr, dest, tail_cnt); + gen_set_label(start); + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end); + /* Start of the inner loop. */ + tcg_gen_mov_tl(i, tail_cnt); + gen_set_label(start_i); + tcg_gen_brcond_tl(TCG_COND_GE, i, tail_tot, end_i); + /* store_1s(-1, vd[vl+i]); */ + st_fn(tcg_constant_tl(-1), (TCGv_ptr)tail_addr, 0); + tcg_gen_addi_tl(tail_addr, tail_addr, esz); + tcg_gen_addi_tl(i, i, esz); + tcg_gen_br(start_i); + /* End of the inner loop. */ + gen_set_label(end_i); + /* Update the counts */ + tcg_gen_addi_tl(tail_cnt, tail_cnt, max_elems_b); + tcg_gen_addi_tl(tail_tot, tail_cnt, max_elems_b); + tcg_gen_addi_tl(k, k, 1); + tcg_gen_br(start); + /* End of the outer loop. */ + gen_set_label(end); + + return; +} static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, - uint32_t data, gen_helper_ldst_stride *fn, - DisasContext *s) + uint32_t data, DisasContext *s, bool is_load) { - TCGv_ptr dest, mask; - TCGv base, stride; - TCGv_i32 desc; + if (!s->vstart_eq_zero) { + return false; + } - dest = tcg_temp_new_ptr(); - mask = tcg_temp_new_ptr(); - base = get_gpr(s, rs1, EXT_NONE); - stride = get_gpr(s, rs2, EXT_NONE); - desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, - s->cfg_ptr->vlenb, data)); + TCGv dest = tcg_temp_new(); - tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd)); - tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0)); + uint32_t nf = FIELD_EX32(data, VDATA, NF); + uint32_t vm = FIELD_EX32(data, VDATA, VM); + + /* Destination register and mask register */ + tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd)); + + /* + * Select the appropriate load/tore to retrieve data from the vector + * register given a specific sew. 
+ */ + static gen_tl_ldst * const ld_fns[4] = { + tcg_gen_ld8u_tl, tcg_gen_ld16u_tl, + tcg_gen_ld32u_tl, tcg_gen_ld_tl + }; + + static gen_tl_ldst * const st_fns[4] = { + tcg_gen_st8_tl, tcg_gen_st16_tl, + tcg_gen_st32_tl, tcg_gen_st_tl + }; + + gen_tl_ldst *ld_fn = ld_fns[s->sew]; + gen_tl_ldst *st_fn = st_fns[s->sew]; + + if (ld_fn == NULL || st_fn == NULL) { + return false; + } mark_vs_dirty(s); - fn(dest, mask, base, stride, tcg_env, desc); + gen_ldst_stride_main_loop(s, dest, rs1, rs2, vm, nf, ld_fn, st_fn, is_load); + + tcg_gen_movi_tl(cpu_vstart, 0); + + /* + * Set the tail bytes to 1 if tail agnostic: + */ + if (s->vta != 0 && is_load) { + gen_ldst_stride_tail_loop(s, dest, nf, st_fn); + } finalize_rvv_inst(s); return true; @@ -898,16 +1155,6 @@ static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) { uint32_t data = 0; - gen_helper_ldst_stride *fn; - static gen_helper_ldst_stride * const fns[4] = { - gen_helper_vlse8_v, gen_helper_vlse16_v, - gen_helper_vlse32_v, gen_helper_vlse64_v - }; - - fn = fns[eew]; - if (fn == NULL) { - return false; - } uint8_t emul = vext_get_emul(s, eew); data = FIELD_DP32(data, VDATA, VM, a->vm); @@ -915,7 +1162,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, NF, a->nf); data = FIELD_DP32(data, VDATA, VTA, s->vta); data = FIELD_DP32(data, VDATA, VMA, s->vma); - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s); + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, true); } static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) @@ -933,23 +1180,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check) static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) { uint32_t data = 0; - gen_helper_ldst_stride *fn; - static gen_helper_ldst_stride * const fns[4] = { - /* masked stride store */ - gen_helper_vsse8_v, gen_helper_vsse16_v, - gen_helper_vsse32_v, gen_helper_vsse64_v - }; uint8_t emul = vext_get_emul(s, eew); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); - fn = fns[eew]; - if (fn == NULL) { - return false; - } - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s); + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, false); } static bool st_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) @@ -1300,17 +1537,6 @@ GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false) *** Vector Integer Arithmetic Instructions */ -/* - * MAXSZ returns the maximum vector size can be operated in bytes, - * which is used in GVEC IR when vl_eq_vlmax flag is set to true - * to accelerate vector operation. - */ -static inline uint32_t MAXSZ(DisasContext *s) -{ - int max_sz = s->cfg_ptr->vlenb * 8; - return max_sz >> (3 - s->lmul); -} - static bool opivv_check(DisasContext *s, arg_rmrr *a) { return require_rvv(s) && -- 2.50.1 ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v3 1/2] Generate strided vector loads/stores with tcg nodes. 2025-08-16 0:56 ` [PATCH v3 1/2] " Chao Liu @ 2025-08-16 6:52 ` Richard Henderson 2025-08-16 8:39 ` Chao Liu 0 siblings, 1 reply; 5+ messages in thread From: Richard Henderson @ 2025-08-16 6:52 UTC (permalink / raw) To: Chao Liu, paolo.savini, ebiggers, dbarboza, palmer, alistair.francis, liwei1518, zhiwei_liu Cc: qemu-riscv, qemu-devel On 8/16/25 10:56, Chao Liu wrote: > This commit improves the performance of QEMU when emulating strided vector > loads and stores by substituting the call for the helper function with the > generation of equivalent TCG operations. > > Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> > Signed-off-by: Chao Liu <chao.liu@yeah.net> > Tested-by: Eric Biggers <ebiggers@kernel.org> > Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> > --- > target/riscv/insn_trans/trans_rvv.c.inc | 326 ++++++++++++++++++++---- > 1 file changed, 276 insertions(+), 50 deletions(-) > > diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc > index 71f98fb350..b5d74b0ec9 100644 > --- a/target/riscv/insn_trans/trans_rvv.c.inc > +++ b/target/riscv/insn_trans/trans_rvv.c.inc > @@ -864,32 +864,289 @@ GEN_VEXT_TRANS(vlm_v, MO_8, vlm_v, ld_us_mask_op, ld_us_mask_check) > GEN_VEXT_TRANS(vsm_v, MO_8, vsm_v, st_us_mask_op, st_us_mask_check) > > /* > - *** stride load and store > + * MAXSZ returns the maximum vector size can be operated in bytes, > + * which is used in GVEC IR when vl_eq_vlmax flag is set to true > + * to accelerate vector operation. > + */ > +static inline uint32_t MAXSZ(DisasContext *s) > +{ > + int max_sz = s->cfg_ptr->vlenb << 3; > + return max_sz >> (3 - s->lmul); > +} > + > +static inline uint32_t get_log2(uint32_t a) > +{ > + uint32_t i = 0; > + if (a == 0) { > + return i; > + } > + for (; a > 1;) { > + a >>= 1; > + i++; > + } > + return i; > +} Since I didn't see v3 had been posted before I replied to v2, I'll repeat my suggestion of assert(is_power_of_2(a)); return ctz32(a); as a better implementation of this function. 
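Spelled out, a minimal sketch of get_log2() with that suggestion applied (this assumes ctz32() and is_power_of_2() from "qemu/host-utils.h" are visible in this translation unit; both existing callers pass power-of-two values such as max_elems and max_elems << s->sew, so the assertion only documents that expectation):

    static inline uint32_t get_log2(uint32_t a)
    {
        /* Callers only pass powers of two (max_elems, max_elems << sew). */
        assert(is_power_of_2(a));
        return ctz32(a);
    }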
r~ > + > +typedef void gen_tl_ldst(TCGv, TCGv_ptr, tcg_target_long); > + > +/* > + * Simulate the strided load/store main loop: > + * > + * for (i = env->vstart; i < env->vl; env->vstart = ++i) { > + * k = 0; > + * while (k < nf) { > + * if (!vm && !vext_elem_mask(v0, i)) { > + * vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz, > + * (i + k * max_elems + 1) * esz); > + * k++; > + * continue; > + * } > + * target_ulong addr = base + stride * i + (k << log2_esz); > + * ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); > + * k++; > + * } > + * } > + */ > +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1, > + uint32_t rs2, uint32_t vm, uint32_t nf, > + gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn, > + bool is_load) > +{ > + TCGv addr = tcg_temp_new(); > + TCGv base = get_gpr(s, rs1, EXT_NONE); > + TCGv stride = get_gpr(s, rs2, EXT_NONE); > + > + TCGv i = tcg_temp_new(); > + TCGv i_esz = tcg_temp_new(); > + TCGv k = tcg_temp_new(); > + TCGv k_esz = tcg_temp_new(); > + TCGv k_max = tcg_temp_new(); > + TCGv mask = tcg_temp_new(); > + TCGv mask_offs = tcg_temp_new(); > + TCGv mask_offs_64 = tcg_temp_new(); > + TCGv mask_elem = tcg_temp_new(); > + TCGv mask_offs_rem = tcg_temp_new(); > + TCGv vreg = tcg_temp_new(); > + TCGv dest_offs = tcg_temp_new(); > + TCGv stride_offs = tcg_temp_new(); > + > + uint32_t max_elems = MAXSZ(s) >> s->sew; > + > + TCGLabel *start = gen_new_label(); > + TCGLabel *end = gen_new_label(); > + TCGLabel *start_k = gen_new_label(); > + TCGLabel *inc_k = gen_new_label(); > + TCGLabel *end_k = gen_new_label(); > + > + MemOp atomicity = MO_ATOM_NONE; > + if (s->sew == 0) { > + atomicity = MO_ATOM_NONE; > + } else { > + atomicity = MO_ATOM_IFALIGN_PAIR; > + } > + > + mark_vs_dirty(s); > + > + tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0)); > + > + /* Start of outer loop. */ > + tcg_gen_mov_tl(i, cpu_vstart); > + gen_set_label(start); > + tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end); > + tcg_gen_shli_tl(i_esz, i, s->sew); > + /* Start of inner loop. */ > + tcg_gen_movi_tl(k, 0); > + gen_set_label(start_k); > + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k); > + /* > + * If we are in mask agnostic regime and the operation is not unmasked we > + * set the inactive elements to 1. > + */ > + if (!vm && s->vma) { > + TCGLabel *active_element = gen_new_label(); > + /* (i + k * max_elems) * esz */ > + tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew)); > + tcg_gen_add_tl(mask_offs, mask_offs, i_esz); > + > + /* > + * Check whether the i bit of the mask is 0 or 1. > + * > + * static inline int vext_elem_mask(void *v0, int index) > + * { > + * int idx = index / 64; > + * int pos = index % 64; > + * return (((uint64_t *)v0)[idx] >> pos) & 1; > + * } > + */ > + tcg_gen_shri_tl(mask_offs_64, mask_offs, 3); > + tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask); > + tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0); > + tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8)); > + tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem); > + tcg_gen_andi_tl(mask_elem, mask_elem, 1); > + tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), > + active_element); > + /* > + * Set masked-off elements in the destination vector register to 1s. > + * Store instructions simply skip this bit as memory ops access memory > + * only for active elements. 
> + */ > + if (is_load) { > + tcg_gen_shli_tl(mask_offs, mask_offs, s->sew); > + tcg_gen_add_tl(mask_offs, mask_offs, dest); > + st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0); > + } > + tcg_gen_br(inc_k); > + gen_set_label(active_element); > + } > + /* > + * The element is active, calculate the address with stride: > + * target_ulong addr = base + stride * i + (k << log2_esz); > + */ > + tcg_gen_mul_tl(stride_offs, stride, i); > + tcg_gen_shli_tl(k_esz, k, s->sew); > + tcg_gen_add_tl(stride_offs, stride_offs, k_esz); > + tcg_gen_add_tl(addr, base, stride_offs); > + /* Calculate the offset in the dst/src vector register. */ > + tcg_gen_shli_tl(k_max, k, get_log2(max_elems)); > + tcg_gen_add_tl(dest_offs, i, k_max); > + tcg_gen_shli_tl(dest_offs, dest_offs, s->sew); > + tcg_gen_add_tl(dest_offs, dest_offs, dest); > + if (is_load) { > + tcg_gen_qemu_ld_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); > + st_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); > + } else { > + ld_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); > + tcg_gen_qemu_st_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); > + } > + /* > + * We don't execute the load/store above if the element was inactive. > + * We jump instead directly to incrementing k and continuing the loop. > + */ > + if (!vm && s->vma) { > + gen_set_label(inc_k); > + } > + tcg_gen_addi_tl(k, k, 1); > + tcg_gen_br(start_k); > + /* End of the inner loop. */ > + gen_set_label(end_k); > + > + tcg_gen_addi_tl(i, i, 1); > + tcg_gen_mov_tl(cpu_vstart, i); > + tcg_gen_br(start); > + > + /* End of the outer loop. */ > + gen_set_label(end); > + > + return; > +} > + > + > +/* > + * Set the tail bytes of the strided loads/stores to 1: > + * > + * for (k = 0; k < nf; ++k) { > + * cnt = (k * max_elems + vl) * esz; > + * tot = (k * max_elems + max_elems) * esz; > + * for (i = cnt; i < tot; i += esz) { > + * store_1s(-1, vd[vl+i]); > + * } > + * } > */ > -typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv, > - TCGv, TCGv_env, TCGv_i32); > +static void gen_ldst_stride_tail_loop(DisasContext *s, TCGv dest, uint32_t nf, > + gen_tl_ldst *st_fn) > +{ > + TCGv i = tcg_temp_new(); > + TCGv k = tcg_temp_new(); > + TCGv tail_cnt = tcg_temp_new(); > + TCGv tail_tot = tcg_temp_new(); > + TCGv tail_addr = tcg_temp_new(); > + > + TCGLabel *start = gen_new_label(); > + TCGLabel *end = gen_new_label(); > + TCGLabel *start_i = gen_new_label(); > + TCGLabel *end_i = gen_new_label(); > + > + uint32_t max_elems_b = MAXSZ(s); > + uint32_t esz = 1 << s->sew; > + > + /* Start of the outer loop. */ > + tcg_gen_movi_tl(k, 0); > + tcg_gen_shli_tl(tail_cnt, cpu_vl, s->sew); > + tcg_gen_movi_tl(tail_tot, max_elems_b); > + tcg_gen_add_tl(tail_addr, dest, tail_cnt); > + gen_set_label(start); > + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end); > + /* Start of the inner loop. */ > + tcg_gen_mov_tl(i, tail_cnt); > + gen_set_label(start_i); > + tcg_gen_brcond_tl(TCG_COND_GE, i, tail_tot, end_i); > + /* store_1s(-1, vd[vl+i]); */ > + st_fn(tcg_constant_tl(-1), (TCGv_ptr)tail_addr, 0); > + tcg_gen_addi_tl(tail_addr, tail_addr, esz); > + tcg_gen_addi_tl(i, i, esz); > + tcg_gen_br(start_i); > + /* End of the inner loop. */ > + gen_set_label(end_i); > + /* Update the counts */ > + tcg_gen_addi_tl(tail_cnt, tail_cnt, max_elems_b); > + tcg_gen_addi_tl(tail_tot, tail_cnt, max_elems_b); > + tcg_gen_addi_tl(k, k, 1); > + tcg_gen_br(start); > + /* End of the outer loop. 
*/ > + gen_set_label(end); > + > + return; > +} > > static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, > - uint32_t data, gen_helper_ldst_stride *fn, > - DisasContext *s) > + uint32_t data, DisasContext *s, bool is_load) > { > - TCGv_ptr dest, mask; > - TCGv base, stride; > - TCGv_i32 desc; > + if (!s->vstart_eq_zero) { > + return false; > + } > > - dest = tcg_temp_new_ptr(); > - mask = tcg_temp_new_ptr(); > - base = get_gpr(s, rs1, EXT_NONE); > - stride = get_gpr(s, rs2, EXT_NONE); > - desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, > - s->cfg_ptr->vlenb, data)); > + TCGv dest = tcg_temp_new(); > > - tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd)); > - tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0)); > + uint32_t nf = FIELD_EX32(data, VDATA, NF); > + uint32_t vm = FIELD_EX32(data, VDATA, VM); > + > + /* Destination register and mask register */ > + tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd)); > + > + /* > + * Select the appropriate load/tore to retrieve data from the vector > + * register given a specific sew. > + */ > + static gen_tl_ldst * const ld_fns[4] = { > + tcg_gen_ld8u_tl, tcg_gen_ld16u_tl, > + tcg_gen_ld32u_tl, tcg_gen_ld_tl > + }; > + > + static gen_tl_ldst * const st_fns[4] = { > + tcg_gen_st8_tl, tcg_gen_st16_tl, > + tcg_gen_st32_tl, tcg_gen_st_tl > + }; > + > + gen_tl_ldst *ld_fn = ld_fns[s->sew]; > + gen_tl_ldst *st_fn = st_fns[s->sew]; > + > + if (ld_fn == NULL || st_fn == NULL) { > + return false; > + } > > mark_vs_dirty(s); > > - fn(dest, mask, base, stride, tcg_env, desc); > + gen_ldst_stride_main_loop(s, dest, rs1, rs2, vm, nf, ld_fn, st_fn, is_load); > + > + tcg_gen_movi_tl(cpu_vstart, 0); > + > + /* > + * Set the tail bytes to 1 if tail agnostic: > + */ > + if (s->vta != 0 && is_load) { > + gen_ldst_stride_tail_loop(s, dest, nf, st_fn); > + } > > finalize_rvv_inst(s); > return true; > @@ -898,16 +1155,6 @@ static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, > static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > { > uint32_t data = 0; > - gen_helper_ldst_stride *fn; > - static gen_helper_ldst_stride * const fns[4] = { > - gen_helper_vlse8_v, gen_helper_vlse16_v, > - gen_helper_vlse32_v, gen_helper_vlse64_v > - }; > - > - fn = fns[eew]; > - if (fn == NULL) { > - return false; > - } > > uint8_t emul = vext_get_emul(s, eew); > data = FIELD_DP32(data, VDATA, VM, a->vm); > @@ -915,7 +1162,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, NF, a->nf); > data = FIELD_DP32(data, VDATA, VTA, s->vta); > data = FIELD_DP32(data, VDATA, VMA, s->vma); > - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s); > + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, true); > } > > static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) > @@ -933,23 +1180,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check) > static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > { > uint32_t data = 0; > - gen_helper_ldst_stride *fn; > - static gen_helper_ldst_stride * const fns[4] = { > - /* masked stride store */ > - gen_helper_vsse8_v, gen_helper_vsse16_v, > - gen_helper_vsse32_v, gen_helper_vsse64_v > - }; > > uint8_t emul = vext_get_emul(s, eew); > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > - fn = fns[eew]; > - if (fn == NULL) { > - return false; > - } > > - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, 
fn, s); > + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, false); > } > > static bool st_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) > @@ -1300,17 +1537,6 @@ GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false) > *** Vector Integer Arithmetic Instructions > */ > > -/* > - * MAXSZ returns the maximum vector size can be operated in bytes, > - * which is used in GVEC IR when vl_eq_vlmax flag is set to true > - * to accelerate vector operation. > - */ > -static inline uint32_t MAXSZ(DisasContext *s) > -{ > - int max_sz = s->cfg_ptr->vlenb * 8; > - return max_sz >> (3 - s->lmul); > -} > - > static bool opivv_check(DisasContext *s, arg_rmrr *a) > { > return require_rvv(s) && ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v3 1/2] Generate strided vector loads/stores with tcg nodes. 2025-08-16 6:52 ` Richard Henderson @ 2025-08-16 8:39 ` Chao Liu 0 siblings, 0 replies; 5+ messages in thread From: Chao Liu @ 2025-08-16 8:39 UTC (permalink / raw) To: Richard Henderson, paolo.savini, ebiggers, dbarboza, palmer, alistair.francis, liwei1518, zhiwei_liu Cc: qemu-riscv, qemu-devel On 8/16/2025 2:52 PM, Richard Henderson wrote: > On 8/16/25 10:56, Chao Liu wrote: >> This commit improves the performance of QEMU when emulating strided vector >> loads and stores by substituting the call for the helper function with the >> generation of equivalent TCG operations. >> >> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> >> Signed-off-by: Chao Liu <chao.liu@yeah.net> >> Tested-by: Eric Biggers <ebiggers@kernel.org> >> Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> >> --- >> target/riscv/insn_trans/trans_rvv.c.inc | 326 ++++++++++++++++++++---- >> 1 file changed, 276 insertions(+), 50 deletions(-) >> >> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc >> index 71f98fb350..b5d74b0ec9 100644 >> --- a/target/riscv/insn_trans/trans_rvv.c.inc >> +++ b/target/riscv/insn_trans/trans_rvv.c.inc >> @@ -864,32 +864,289 @@ GEN_VEXT_TRANS(vlm_v, MO_8, vlm_v, ld_us_mask_op, ld_us_mask_check) >> GEN_VEXT_TRANS(vsm_v, MO_8, vsm_v, st_us_mask_op, st_us_mask_check) >> /* >> - *** stride load and store >> + * MAXSZ returns the maximum vector size can be operated in bytes, >> + * which is used in GVEC IR when vl_eq_vlmax flag is set to true >> + * to accelerate vector operation. >> + */ >> +static inline uint32_t MAXSZ(DisasContext *s) >> +{ >> + int max_sz = s->cfg_ptr->vlenb << 3; >> + return max_sz >> (3 - s->lmul); >> +} >> + >> +static inline uint32_t get_log2(uint32_t a) >> +{ >> + uint32_t i = 0; >> + if (a == 0) { >> + return i; >> + } >> + for (; a > 1;) { >> + a >>= 1; >> + i++; >> + } >> + return i; >> +} > > Since I didn't see v3 had been posted before I replied to v2, I'll repeat my suggestion of > > assert(is_power_of_2(a)); > return ctz32(a); > > as a better implementation of this function. > > > r~ > This is a great idea. I'll adopt it and release a new version of the patch. 
Best regards, Chao > >> + >> +typedef void gen_tl_ldst(TCGv, TCGv_ptr, tcg_target_long); >> + >> +/* >> + * Simulate the strided load/store main loop: >> + * >> + * for (i = env->vstart; i < env->vl; env->vstart = ++i) { >> + * k = 0; >> + * while (k < nf) { >> + * if (!vm && !vext_elem_mask(v0, i)) { >> + * vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz, >> + * (i + k * max_elems + 1) * esz); >> + * k++; >> + * continue; >> + * } >> + * target_ulong addr = base + stride * i + (k << log2_esz); >> + * ldst(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); >> + * k++; >> + * } >> + * } >> + */ >> +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t rs1, >> + uint32_t rs2, uint32_t vm, uint32_t nf, >> + gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn, >> + bool is_load) >> +{ >> + TCGv addr = tcg_temp_new(); >> + TCGv base = get_gpr(s, rs1, EXT_NONE); >> + TCGv stride = get_gpr(s, rs2, EXT_NONE); >> + >> + TCGv i = tcg_temp_new(); >> + TCGv i_esz = tcg_temp_new(); >> + TCGv k = tcg_temp_new(); >> + TCGv k_esz = tcg_temp_new(); >> + TCGv k_max = tcg_temp_new(); >> + TCGv mask = tcg_temp_new(); >> + TCGv mask_offs = tcg_temp_new(); >> + TCGv mask_offs_64 = tcg_temp_new(); >> + TCGv mask_elem = tcg_temp_new(); >> + TCGv mask_offs_rem = tcg_temp_new(); >> + TCGv vreg = tcg_temp_new(); >> + TCGv dest_offs = tcg_temp_new(); >> + TCGv stride_offs = tcg_temp_new(); >> + >> + uint32_t max_elems = MAXSZ(s) >> s->sew; >> + >> + TCGLabel *start = gen_new_label(); >> + TCGLabel *end = gen_new_label(); >> + TCGLabel *start_k = gen_new_label(); >> + TCGLabel *inc_k = gen_new_label(); >> + TCGLabel *end_k = gen_new_label(); >> + >> + MemOp atomicity = MO_ATOM_NONE; >> + if (s->sew == 0) { >> + atomicity = MO_ATOM_NONE; >> + } else { >> + atomicity = MO_ATOM_IFALIGN_PAIR; >> + } >> + >> + mark_vs_dirty(s); >> + >> + tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0)); >> + >> + /* Start of outer loop. */ >> + tcg_gen_mov_tl(i, cpu_vstart); >> + gen_set_label(start); >> + tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end); >> + tcg_gen_shli_tl(i_esz, i, s->sew); >> + /* Start of inner loop. */ >> + tcg_gen_movi_tl(k, 0); >> + gen_set_label(start_k); >> + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k); >> + /* >> + * If we are in mask agnostic regime and the operation is not unmasked we >> + * set the inactive elements to 1. >> + */ >> + if (!vm && s->vma) { >> + TCGLabel *active_element = gen_new_label(); >> + /* (i + k * max_elems) * esz */ >> + tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew)); >> + tcg_gen_add_tl(mask_offs, mask_offs, i_esz); >> + >> + /* >> + * Check whether the i bit of the mask is 0 or 1. >> + * >> + * static inline int vext_elem_mask(void *v0, int index) >> + * { >> + * int idx = index / 64; >> + * int pos = index % 64; >> + * return (((uint64_t *)v0)[idx] >> pos) & 1; >> + * } >> + */ >> + tcg_gen_shri_tl(mask_offs_64, mask_offs, 3); >> + tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask); >> + tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0); >> + tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8)); >> + tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem); >> + tcg_gen_andi_tl(mask_elem, mask_elem, 1); >> + tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0), >> + active_element); >> + /* >> + * Set masked-off elements in the destination vector register to 1s. >> + * Store instructions simply skip this bit as memory ops access memory >> + * only for active elements. 
>> + */ >> + if (is_load) { >> + tcg_gen_shli_tl(mask_offs, mask_offs, s->sew); >> + tcg_gen_add_tl(mask_offs, mask_offs, dest); >> + st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0); >> + } >> + tcg_gen_br(inc_k); >> + gen_set_label(active_element); >> + } >> + /* >> + * The element is active, calculate the address with stride: >> + * target_ulong addr = base + stride * i + (k << log2_esz); >> + */ >> + tcg_gen_mul_tl(stride_offs, stride, i); >> + tcg_gen_shli_tl(k_esz, k, s->sew); >> + tcg_gen_add_tl(stride_offs, stride_offs, k_esz); >> + tcg_gen_add_tl(addr, base, stride_offs); >> + /* Calculate the offset in the dst/src vector register. */ >> + tcg_gen_shli_tl(k_max, k, get_log2(max_elems)); >> + tcg_gen_add_tl(dest_offs, i, k_max); >> + tcg_gen_shli_tl(dest_offs, dest_offs, s->sew); >> + tcg_gen_add_tl(dest_offs, dest_offs, dest); >> + if (is_load) { >> + tcg_gen_qemu_ld_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); >> + st_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); >> + } else { >> + ld_fn((TCGv)vreg, (TCGv_ptr)dest_offs, 0); >> + tcg_gen_qemu_st_tl(vreg, addr, s->mem_idx, MO_LE | s->sew | atomicity); >> + } >> + /* >> + * We don't execute the load/store above if the element was inactive. >> + * We jump instead directly to incrementing k and continuing the loop. >> + */ >> + if (!vm && s->vma) { >> + gen_set_label(inc_k); >> + } >> + tcg_gen_addi_tl(k, k, 1); >> + tcg_gen_br(start_k); >> + /* End of the inner loop. */ >> + gen_set_label(end_k); >> + >> + tcg_gen_addi_tl(i, i, 1); >> + tcg_gen_mov_tl(cpu_vstart, i); >> + tcg_gen_br(start); >> + >> + /* End of the outer loop. */ >> + gen_set_label(end); >> + >> + return; >> +} >> + >> + >> +/* >> + * Set the tail bytes of the strided loads/stores to 1: >> + * >> + * for (k = 0; k < nf; ++k) { >> + * cnt = (k * max_elems + vl) * esz; >> + * tot = (k * max_elems + max_elems) * esz; >> + * for (i = cnt; i < tot; i += esz) { >> + * store_1s(-1, vd[vl+i]); >> + * } >> + * } >> */ >> -typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv, >> - TCGv, TCGv_env, TCGv_i32); >> +static void gen_ldst_stride_tail_loop(DisasContext *s, TCGv dest, uint32_t nf, >> + gen_tl_ldst *st_fn) >> +{ >> + TCGv i = tcg_temp_new(); >> + TCGv k = tcg_temp_new(); >> + TCGv tail_cnt = tcg_temp_new(); >> + TCGv tail_tot = tcg_temp_new(); >> + TCGv tail_addr = tcg_temp_new(); >> + >> + TCGLabel *start = gen_new_label(); >> + TCGLabel *end = gen_new_label(); >> + TCGLabel *start_i = gen_new_label(); >> + TCGLabel *end_i = gen_new_label(); >> + >> + uint32_t max_elems_b = MAXSZ(s); >> + uint32_t esz = 1 << s->sew; >> + >> + /* Start of the outer loop. */ >> + tcg_gen_movi_tl(k, 0); >> + tcg_gen_shli_tl(tail_cnt, cpu_vl, s->sew); >> + tcg_gen_movi_tl(tail_tot, max_elems_b); >> + tcg_gen_add_tl(tail_addr, dest, tail_cnt); >> + gen_set_label(start); >> + tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end); >> + /* Start of the inner loop. */ >> + tcg_gen_mov_tl(i, tail_cnt); >> + gen_set_label(start_i); >> + tcg_gen_brcond_tl(TCG_COND_GE, i, tail_tot, end_i); >> + /* store_1s(-1, vd[vl+i]); */ >> + st_fn(tcg_constant_tl(-1), (TCGv_ptr)tail_addr, 0); >> + tcg_gen_addi_tl(tail_addr, tail_addr, esz); >> + tcg_gen_addi_tl(i, i, esz); >> + tcg_gen_br(start_i); >> + /* End of the inner loop. */ >> + gen_set_label(end_i); >> + /* Update the counts */ >> + tcg_gen_addi_tl(tail_cnt, tail_cnt, max_elems_b); >> + tcg_gen_addi_tl(tail_tot, tail_cnt, max_elems_b); >> + tcg_gen_addi_tl(k, k, 1); >> + tcg_gen_br(start); >> + /* End of the outer loop. 
*/ >> + gen_set_label(end); >> + >> + return; >> +} >> static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, >> - uint32_t data, gen_helper_ldst_stride *fn, >> - DisasContext *s) >> + uint32_t data, DisasContext *s, bool is_load) >> { >> - TCGv_ptr dest, mask; >> - TCGv base, stride; >> - TCGv_i32 desc; >> + if (!s->vstart_eq_zero) { >> + return false; >> + } >> - dest = tcg_temp_new_ptr(); >> - mask = tcg_temp_new_ptr(); >> - base = get_gpr(s, rs1, EXT_NONE); >> - stride = get_gpr(s, rs2, EXT_NONE); >> - desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, >> - s->cfg_ptr->vlenb, data)); >> + TCGv dest = tcg_temp_new(); >> - tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd)); >> - tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0)); >> + uint32_t nf = FIELD_EX32(data, VDATA, NF); >> + uint32_t vm = FIELD_EX32(data, VDATA, VM); >> + >> + /* Destination register and mask register */ >> + tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd)); >> + >> + /* >> + * Select the appropriate load/tore to retrieve data from the vector >> + * register given a specific sew. >> + */ >> + static gen_tl_ldst * const ld_fns[4] = { >> + tcg_gen_ld8u_tl, tcg_gen_ld16u_tl, >> + tcg_gen_ld32u_tl, tcg_gen_ld_tl >> + }; >> + >> + static gen_tl_ldst * const st_fns[4] = { >> + tcg_gen_st8_tl, tcg_gen_st16_tl, >> + tcg_gen_st32_tl, tcg_gen_st_tl >> + }; >> + >> + gen_tl_ldst *ld_fn = ld_fns[s->sew]; >> + gen_tl_ldst *st_fn = st_fns[s->sew]; >> + >> + if (ld_fn == NULL || st_fn == NULL) { >> + return false; >> + } >> mark_vs_dirty(s); >> - fn(dest, mask, base, stride, tcg_env, desc); >> + gen_ldst_stride_main_loop(s, dest, rs1, rs2, vm, nf, ld_fn, st_fn, is_load); >> + >> + tcg_gen_movi_tl(cpu_vstart, 0); >> + >> + /* >> + * Set the tail bytes to 1 if tail agnostic: >> + */ >> + if (s->vta != 0 && is_load) { >> + gen_ldst_stride_tail_loop(s, dest, nf, st_fn); >> + } >> finalize_rvv_inst(s); >> return true; >> @@ -898,16 +1155,6 @@ static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2, >> static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) >> { >> uint32_t data = 0; >> - gen_helper_ldst_stride *fn; >> - static gen_helper_ldst_stride * const fns[4] = { >> - gen_helper_vlse8_v, gen_helper_vlse16_v, >> - gen_helper_vlse32_v, gen_helper_vlse64_v >> - }; >> - >> - fn = fns[eew]; >> - if (fn == NULL) { >> - return false; >> - } >> uint8_t emul = vext_get_emul(s, eew); >> data = FIELD_DP32(data, VDATA, VM, a->vm); >> @@ -915,7 +1162,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) >> data = FIELD_DP32(data, VDATA, NF, a->nf); >> data = FIELD_DP32(data, VDATA, VTA, s->vta); >> data = FIELD_DP32(data, VDATA, VMA, s->vma); >> - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s); >> + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, true); >> } >> static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) >> @@ -933,23 +1180,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check) >> static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) >> { >> uint32_t data = 0; >> - gen_helper_ldst_stride *fn; >> - static gen_helper_ldst_stride * const fns[4] = { >> - /* masked stride store */ >> - gen_helper_vsse8_v, gen_helper_vsse16_v, >> - gen_helper_vsse32_v, gen_helper_vsse64_v >> - }; >> uint8_t emul = vext_get_emul(s, eew); >> data = FIELD_DP32(data, VDATA, VM, a->vm); >> data = FIELD_DP32(data, VDATA, LMUL, emul); >> data = FIELD_DP32(data, VDATA, NF, a->nf); >> - fn = fns[eew]; >> - if (fn == NULL) { 
>> - return false; >> - } >> - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s); >> + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, s, false); >> } >> static bool st_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew) >> @@ -1300,17 +1537,6 @@ GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false) >> *** Vector Integer Arithmetic Instructions >> */ >> -/* >> - * MAXSZ returns the maximum vector size can be operated in bytes, >> - * which is used in GVEC IR when vl_eq_vlmax flag is set to true >> - * to accelerate vector operation. >> - */ >> -static inline uint32_t MAXSZ(DisasContext *s) >> -{ >> - int max_sz = s->cfg_ptr->vlenb * 8; >> - return max_sz >> (3 - s->lmul); >> -} >> - >> static bool opivv_check(DisasContext *s, arg_rmrr *a) >> { >> return require_rvv(s) && ^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v3 2/2] tests/tcg/riscv64: Add test for vlsseg8e32 instruction 2025-08-16 0:56 [PATCH v3 0/2] [RISCV/RVV] Generate strided vector loads/stores with tcg nodes Chao Liu 2025-08-16 0:56 ` [PATCH v3 1/2] " Chao Liu @ 2025-08-16 0:56 ` Chao Liu 1 sibling, 0 replies; 5+ messages in thread From: Chao Liu @ 2025-08-16 0:56 UTC (permalink / raw) To: paolo.savini, ebiggers, dbarboza, palmer, alistair.francis, liwei1518, zhiwei_liu Cc: qemu-riscv, qemu-devel, Chao Liu This case, it copied 64 bytes from a0 to a1 with vlsseg8e32. Signed-off-by: Chao Liu <chao.liu@yeah.net> --- tests/tcg/riscv64/Makefile.softmmu-target | 8 +- tests/tcg/riscv64/test-vlsseg8e32.S | 107 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 tests/tcg/riscv64/test-vlsseg8e32.S diff --git a/tests/tcg/riscv64/Makefile.softmmu-target b/tests/tcg/riscv64/Makefile.softmmu-target index 3ca595335d..384c291554 100644 --- a/tests/tcg/riscv64/Makefile.softmmu-target +++ b/tests/tcg/riscv64/Makefile.softmmu-target @@ -7,14 +7,14 @@ VPATH += $(TEST_SRC) LINK_SCRIPT = $(TEST_SRC)/semihost.ld LDFLAGS = -T $(LINK_SCRIPT) -CFLAGS += -g -Og +CFLAGS += -march=rv64gcv -mabi=lp64d -g -Og %.o: %.S $(CC) $(CFLAGS) $< -Wa,--noexecstack -c -o $@ %: %.o $(LINK_SCRIPT) $(LD) $(LDFLAGS) $< -o $@ -QEMU_OPTS += -M virt -display none -semihosting -device loader,file= +QEMU_OPTS += -M virt -cpu rv64,v=true -display none -semihosting -device loader,file= EXTRA_RUNS += run-issue1060 run-issue1060: issue1060 @@ -24,5 +24,9 @@ EXTRA_RUNS += run-test-mepc-masking run-test-mepc-masking: test-mepc-masking $(call run-test, $<, $(QEMU) $(QEMU_OPTS)$<) +EXTRA_RUNS += run-vlsseg8e32 +run-vlsseg8e32: test-vlsseg8e32 + $(call run-test, $<, $(QEMU) $(QEMU_OPTS)$<) + # We don't currently support the multiarch system tests undefine MULTIARCH_TESTS diff --git a/tests/tcg/riscv64/test-vlsseg8e32.S b/tests/tcg/riscv64/test-vlsseg8e32.S new file mode 100644 index 0000000000..bbc79d5e8d --- /dev/null +++ b/tests/tcg/riscv64/test-vlsseg8e32.S @@ -0,0 +1,107 @@ +# +# QEMU RISC-V Vector Strided Load Instruction testcase +# +# Copyright (c) 2025 Chao Liu chao.liu@yeah.net +# +# SPDX-License-Identifier: GPL-2.0-or-later +# + .option norvc + + .section .data + .align 4 +source_data: + .asciz "Test the vssseg8e32 insn by copy 64b and verifying correctness." + .equ source_len, 64 + + .text + .global _start +_start: + lla t0, trap + csrw mtvec, t0 + +enable_rvv: + + li x15, 0x800000000024112d + csrw 0x301, x15 + li x1, 0x2200 + csrr x2, mstatus + or x2, x2, x1 + csrw mstatus, x2 + +rvv_test_func: + la a0, source_data + li a1, 0x80020000 + vsetivli zero, 1, e32, m1, ta, ma + li t0, 64 + + vlsseg8e32.v v0, (a0), t0 + addi a0, a0, 32 + vlsseg8e32.v v8, (a0), t0 + + vssseg8e32.v v0, (a1), t0 + addi a1, a1, 32 + vssseg8e32.v v8, (a1), t0 + +compare_start: + la a0, source_data + li a1, 0x80020000 + li t0, 0 + li t1, source_len + +compare_loop: + # when t0 >= len, compare end + bge t0, t1, compare_done + + lb t2, 0(a0) + lb t3, 0(a1) + bne t2, t3, compare_fail + + addi a0, a0, 1 + addi a1, a1, 1 + addi t0, t0, 1 + j compare_loop + +compare_done: + # compare ok, return 0 + li a0, 0 + j _exit + +compare_fail: + # compare failed, return 2 + li a0, 2 + j _exit + +trap: + # When an instruction traps, compare it to the insn in memory. + csrr t0, mepc + csrr t1, mtval + lwu t2, 0(t0) + bne t1, t2, fail + + # Skip the insn and continue. 
+ addi t0, t0, 4 + csrw mepc, t0 + mret + +fail: + li a0, 1 + +# Exit code in a0 +_exit: + lla a1, semiargs + li t0, 0x20026 # ADP_Stopped_ApplicationExit + sd t0, 0(a1) + sd a0, 8(a1) + li a0, 0x20 # TARGET_SYS_EXIT_EXTENDED + + # Semihosting call sequence + .balign 16 + slli zero, zero, 0x1f + ebreak + srai zero, zero, 0x7 + j . + + .data + .balign 16 +semiargs: + .space 16 -- 2.50.1 ^ permalink raw reply related [flat|nested] 5+ messages in thread
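For readers new to segment strided loads, here is a rough C model of the vlsseg8e32.v semantics the test relies on (a sketch of the RVV behaviour only, not QEMU code; model_vlsseg8e32 and the vreg array are made-up names for illustration):

    #include <stdint.h>
    #include <string.h>

    /*
     * vlsseg8e32.v vd, (base), stride: nf = 8 fields of EEW = 32 bits each;
     * field f of element i is loaded from base + i * stride + f * 4 into
     * element i of vector register vd + f (EMUL = 1).
     */
    static void model_vlsseg8e32(uint32_t vreg[32][64], unsigned vd,
                                 const uint8_t *base, intptr_t stride,
                                 unsigned vstart, unsigned vl)
    {
        for (unsigned i = vstart; i < vl; i++) {
            for (unsigned f = 0; f < 8; f++) {
                memcpy(&vreg[vd + f][i], base + i * stride + f * 4,
                       sizeof(uint32_t));
            }
        }
    }

With vl = 1 only element 0 exists, so the stride never actually contributes to the address: each vlsseg8e32.v in the test reads 32 consecutive bytes (8 fields of 4 bytes) into element 0 of v0..v7 and then v8..v15, and the two vssseg8e32.v stores write them back out, moving all 64 bytes of source_data to 0x80020000 for the byte-by-byte compare loop to verify.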