qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/7] Add LoongArch v1.1 instructions
@ 2025-11-19 12:24 Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 1/7] target/loongarch: Require atomics to be aligned Jiajie Chen
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Latest revision of LoongArch ISA is out at
https://www.loongson.cn/uploads/images/2023102309132647981.%E9%BE%99%E8%8A%AF%E6%9E%B6%E6%9E%84%E5%8F%82%E8%80%83%E6%89%8B%E5%86%8C%E5%8D%B7%E4%B8%80_r1p10.pdf
(Chinese only). The revision includes the following updates:

- estimated fp reciprocal instructions: frecip -> frecipe, frsqrt ->
  frsqrte
- 128-bit width store-conditional instruction: sc.q
- ll.w/d with acquire semantics: llacq.w/d, sc.w/d with release semantics:
  screl.w/d
- compare and swap instructions: amcas[_db].b/w/h/d
- byte and half-word-wide amswap/add instructions: am{swap/add}[_db].{b/h}
- new definition for dbar hints
- clarify 32-bit division instruction behavior
- clarify load ordering when accessing the same address
- introduce message signaled interrupt
- introduce hardware page table walker

The new revision is implemented in the Loongson 3A6000 processor.

This patch series implements all the new instructions. The v1 version
can be found at
https://patchew.org/QEMU/20231023153029.269211-2-c@jia.je/.

A simple testcase to test the new fp and sc.q instructions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Run each existing single-precision instruction (frecip.s, frsqrt.s) next to
 * its estimated counterpart (frecipe.s, frsqrte.s) on the same input and print
 * the results so they can be compared by eye.  Nothing is asserted here.
 */
void test_fp(void) {
  const float input = 3.0f;
  float result;

  /* Reciprocal: existing instruction first, then the estimated variant. */
  asm volatile("frecip.s %0, %1" : "=f"(result) : "f"(input));
  printf("frecip: %f\n", result);
  asm volatile("frecipe.s %0, %1" : "=f"(result) : "f"(input));
  printf("frecipe: %f\n", result);

  /* Reciprocal square root: same pairing as above. */
  asm volatile("frsqrt.s %0, %1" : "=f"(result) : "f"(input));
  printf("frsqrt: %f\n", result);
  asm volatile("frsqrte.s %0, %1" : "=f"(result) : "f"(input));
  printf("frsqrte: %f\n", result);
}

/*
 * Build a 64-bit pseudo-random value out of two rand() draws.
 *
 * The first draw becomes bits 32 and up, the second draw is OR-ed into the
 * low bits.  The two calls are sequenced explicitly: as operands of a single
 * '|' expression their evaluation order would be unspecified, so the result
 * for a given seed could differ between compilers.
 *
 * rand() returns a non-negative int, so with a 32-bit int bit 63 of the
 * result is always clear (and bit 31 as well).
 */
uint64_t rand64(void) {
  uint64_t hi = (uint64_t)rand();
  uint64_t lo = (uint64_t)rand();

  return (hi << 32) | lo;
}

/*
 * Exercise the 128-bit store-conditional instruction sc.q.
 *
 * Case 1: ll.d/ld.d load both halves of a 128-bit value, a 64-bit addend is
 *         added to each half, and sc.q stores the pair back.  The store must
 *         succeed (result register == 1) and memory must hold the sum.
 * Case 2: the low doubleword is overwritten with st.d between ll.d and sc.q;
 *         sc.q is expected to fail (result register == 0).
 * Case 3: same as case 2, but the high doubleword (offset 8) is overwritten.
 *
 * The 128-bit arithmetic uses unsigned __int128: the two high halves can sum
 * past 2^63, which would overflow a signed __int128 (undefined behaviour).
 */
void test_sc_q(void) {
  unsigned __int128 val = rand64();
  val = (val << 64) | rand64();
  unsigned __int128 *ptr = &val;
  uint64_t add_lo = rand64();
  uint64_t add_hi = rand64();
  unsigned __int128 add = add_hi;
  add = (add << 64) | add_lo;
  /*
   * The asm below adds each 64-bit half independently, with no carry from
   * the low half into the high half.  Comparing against a full 128-bit sum
   * is only valid while the low halves cannot carry, which holds because
   * rand64() never sets bit 63 (assuming a 32-bit int).
   */
  unsigned __int128 expect = val + add;
  int res = 0;

  /* Case 1: undisturbed ll.d ... sc.q sequence, must succeed. */
  asm volatile("ll.d $t1, %1, 0\nld.d $t2, %1, 8\nadd.d $t1, $t1, %2\nadd.d "
               "$t2, $t2, %3\nsc.q $t1, $t2, %1\nmove %0, $t1"
               : "=r"(res), "+r"(ptr)
               : "r"(add_lo), "r"(add_hi)
               : "$t1", "$t2", "memory");
  assert(res == 1);
  assert(val == expect);

  /* Case 2: store to the low doubleword before sc.q so that sc.q fails. */
  res = 1;
  asm volatile("ll.d $t1, %1, 0\nld.d $t2, %1, 8\naddi.d $t1, $t1, 1\nst.d "
               "$t1, %1, 0\nsc.q $t1, $t2, %1\nmove %0, $t1"
               : "=r"(res), "+r"(ptr)
               :
               : "$t1", "$t2", "memory");
  assert(res == 0);

  /* Case 3: store to the high doubleword before sc.q so that sc.q fails. */
  res = 1;
  asm volatile("ll.d $t1, %1, 0\nld.d $t2, %1, 8\naddi.d $t2, $t2, 1\nst.d "
               "$t2, %1, 8\nsc.q $t1, $t2, %1\nmove %0, $t1"
               : "=r"(res), "+r"(ptr)
               :
               : "$t1", "$t2", "memory");
  assert(res == 0);

  printf("SC.Q passed\n");
}

/* Entry point: run the floating-point demo first, then the sc.q checks. */
int main(int argc, char *argv[]) {
  /* Command-line arguments are not used. */
  (void)argc;
  (void)argv;

  test_fp();
  test_sc_q();

  return 0;
}

Compile and test by:

loongarch64-linux-gnu-gcc test.c -o test -static && ./qemu-loongarch64 -cpu max test

Jiajie Chen (7):
  target/loongarch: Require atomics to be aligned
  target/loongarch: Add am{swap/add}[_db].{b/h}
  target/loongarch: Add amcas[_db].{b/h/w/d}
  target/loongarch: Add estimated reciprocal instructions
  target/loongarch: Add llacq/screl instructions
  target/loongarch: Add sc.q instructions
  target/loongarch: Add LA v1.1 instructions to max cpu

 target/loongarch/cpu.c                        |  11 +-
 target/loongarch/cpu.h                        |   7 +
 target/loongarch/disas.c                      |  33 ++++
 target/loongarch/insns.decode                 |  34 ++++
 .../tcg/insn_trans/trans_atomic.c.inc         | 145 ++++++++++++++++--
 .../tcg/insn_trans/trans_farith.c.inc         |   4 +
 .../tcg/insn_trans/trans_memory.c.inc         |  22 +++
 .../loongarch/tcg/insn_trans/trans_vec.c.inc  |   8 +
 target/loongarch/tcg/translate.c              |   6 +-
 target/loongarch/translate.h                  |  30 ++--
 10 files changed, 280 insertions(+), 20 deletions(-)

-- 
2.51.0



^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2 1/7] target/loongarch: Require atomics to be aligned
  2025-11-19 12:24 [PATCH v2 0/7] Add LoongArch v1.1 instructions Jiajie Chen
@ 2025-11-19 12:24 ` Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 2/7] target/loongarch: Add am{swap/add}[_db].{b/h} Jiajie Chen
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Currently, all atomic instructions in LoongArch require the address to
be aligned.

Signed-off-by: Jiajie Chen <c@jia.je>
---
 target/loongarch/tcg/insn_trans/trans_atomic.c.inc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
index 77eeedbc42..5622202a67 100644
--- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
@@ -9,7 +9,7 @@ static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop)
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
     TCGv t0 = make_address_i(ctx, src1, a->imm);
 
-    tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop);
+    tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop | MO_ALIGN);
     tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr));
     tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval));
     gen_set_gpr(a->rd, t1, EXT_NONE);
@@ -37,7 +37,7 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop)
     tcg_gen_mov_tl(val, src2);
     /* generate cmpxchg */
     tcg_gen_atomic_cmpxchg_tl(t0, cpu_lladdr, cpu_llval,
-                              val, ctx->mem_idx, mop);
+                              val, ctx->mem_idx, mop | MO_ALIGN);
     tcg_gen_setcond_tl(TCG_COND_EQ, dest, t0, cpu_llval);
     gen_set_label(done);
     gen_set_gpr(a->rd, dest, EXT_NONE);
@@ -63,7 +63,7 @@ static bool gen_am(DisasContext *ctx, arg_rrr *a,
 
     addr = make_address_i(ctx, addr, 0);
 
-    func(dest, addr, val, ctx->mem_idx, mop);
+    func(dest, addr, val, ctx->mem_idx, mop | MO_ALIGN);
     gen_set_gpr(a->rd, dest, EXT_NONE);
 
     return true;
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 2/7] target/loongarch: Add am{swap/add}[_db].{b/h}
  2025-11-19 12:24 [PATCH v2 0/7] Add LoongArch v1.1 instructions Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 1/7] target/loongarch: Require atomics to be aligned Jiajie Chen
@ 2025-11-19 12:24 ` Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 3/7] target/loongarch: Add amcas[_db].{b/h/w/d} Jiajie Chen
  2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
  3 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

The new instructions are introduced in LoongArch v1.1:

- amswap.b
- amswap.h
- amadd.b
- amadd.h
- amswap_db.b
- amswap_db.h
- amadd_db.b
- amadd_db.h

The instructions are gated by CPUCFG2.LAM_BH.

Signed-off-by: Jiajie Chen <c@jia.je>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/cpu.h                        |  1 +
 target/loongarch/disas.c                      |  8 ++++++++
 target/loongarch/insns.decode                 |  8 ++++++++
 .../tcg/insn_trans/trans_atomic.c.inc         |  8 ++++++++
 target/loongarch/translate.h                  | 19 ++++++++++---------
 5 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index 1a14469b3b..aa3d976875 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -138,6 +138,7 @@ FIELD(CPUCFG2, LBT_ALL, 18, 3)
 FIELD(CPUCFG2, LSPW, 21, 1)
 FIELD(CPUCFG2, LAM, 22, 1)
 FIELD(CPUCFG2, HPTW, 24, 1)
+FIELD(CPUCFG2, LAM_BH, 27, 1)
 
 /* cpucfg[3] bits */
 FIELD(CPUCFG3, CCDMA, 0, 1)
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 63989a6282..1a0f527cb1 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -580,6 +580,14 @@ INSN(fldx_s,       frr)
 INSN(fldx_d,       frr)
 INSN(fstx_s,       frr)
 INSN(fstx_d,       frr)
+INSN(amswap_b,     rrr)
+INSN(amswap_h,     rrr)
+INSN(amadd_b,      rrr)
+INSN(amadd_h,      rrr)
+INSN(amswap_db_b,  rrr)
+INSN(amswap_db_h,  rrr)
+INSN(amadd_db_b,   rrr)
+INSN(amadd_db_h,   rrr)
 INSN(amswap_w,     rrr)
 INSN(amswap_d,     rrr)
 INSN(amadd_w,      rrr)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index 62f58cc541..678ce42038 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -261,6 +261,14 @@ ll_w            0010 0000 .............. ..... .....     @rr_i14s2
 sc_w            0010 0001 .............. ..... .....     @rr_i14s2
 ll_d            0010 0010 .............. ..... .....     @rr_i14s2
 sc_d            0010 0011 .............. ..... .....     @rr_i14s2
+amswap_b        0011 10000101 11000 ..... ..... .....    @rrr
+amswap_h        0011 10000101 11001 ..... ..... .....    @rrr
+amadd_b         0011 10000101 11010 ..... ..... .....    @rrr
+amadd_h         0011 10000101 11011 ..... ..... .....    @rrr
+amswap_db_b     0011 10000101 11100 ..... ..... .....    @rrr
+amswap_db_h     0011 10000101 11101 ..... ..... .....    @rrr
+amadd_db_b      0011 10000101 11110 ..... ..... .....    @rrr
+amadd_db_h      0011 10000101 11111 ..... ..... .....    @rrr
 amswap_w        0011 10000110 00000 ..... ..... .....    @rrr
 amswap_d        0011 10000110 00001 ..... ..... .....    @rrr
 amadd_w         0011 10000110 00010 ..... ..... .....    @rrr
diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
index 5622202a67..0d837d08b6 100644
--- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
@@ -73,6 +73,14 @@ TRANS(ll_w, ALL, gen_ll, MO_TESL)
 TRANS(sc_w, ALL, gen_sc, MO_TESL)
 TRANS(ll_d, 64, gen_ll, MO_TEUQ)
 TRANS(sc_d, 64, gen_sc, MO_TEUQ)
+TRANS(amswap_b, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_SB)
+TRANS(amswap_h, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_TESW)
+TRANS(amadd_b, LAM_BH, gen_am, tcg_gen_atomic_fetch_add_tl, MO_SB)
+TRANS(amadd_h, LAM_BH, gen_am, tcg_gen_atomic_fetch_add_tl, MO_TESW)
+TRANS(amswap_db_b, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_SB)
+TRANS(amswap_db_h, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_TESW)
+TRANS(amadd_db_b, LAM_BH, gen_am, tcg_gen_atomic_fetch_add_tl, MO_SB)
+TRANS(amadd_db_h, LAM_BH, gen_am, tcg_gen_atomic_fetch_add_tl, MO_TESW)
 TRANS(amswap_w, LAM, gen_am, tcg_gen_atomic_xchg_tl, MO_TESL)
 TRANS64(amswap_d, LAM, gen_am, tcg_gen_atomic_xchg_tl, MO_TEUQ)
 TRANS(amadd_w, LAM, gen_am, tcg_gen_atomic_fetch_add_tl, MO_TESL)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index bbe015ba57..eb424bb0da 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -21,15 +21,16 @@
 #define avail_ALL(C)   true
 #define avail_64(C)    (FIELD_EX32((C)->cpucfg1, CPUCFG1, ARCH) == \
                         CPUCFG1_ARCH_LA64)
-#define avail_FP(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP))
-#define avail_FP_SP(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP_SP))
-#define avail_FP_DP(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP_DP))
-#define avail_LSPW(C)  (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSPW))
-#define avail_LAM(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAM))
-#define avail_LSX(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSX))
-#define avail_LASX(C)  (FIELD_EX32((C)->cpucfg2, CPUCFG2, LASX))
-#define avail_IOCSR(C) (FIELD_EX32((C)->cpucfg1, CPUCFG1, IOCSR))
-#define avail_CRC(C)   (FIELD_EX32((C)->cpucfg1, CPUCFG1, CRC))
+#define avail_FP(C)     (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP))
+#define avail_FP_SP(C)  (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP_SP))
+#define avail_FP_DP(C)  (FIELD_EX32((C)->cpucfg2, CPUCFG2, FP_DP))
+#define avail_LSPW(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSPW))
+#define avail_LAM(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAM))
+#define avail_LAM_BH(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAM_BH))
+#define avail_LSX(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSX))
+#define avail_LASX(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LASX))
+#define avail_IOCSR(C)  (FIELD_EX32((C)->cpucfg1, CPUCFG1, IOCSR))
+#define avail_CRC(C)    (FIELD_EX32((C)->cpucfg1, CPUCFG1, CRC))
 
 /*
  * If an operation is being performed on less than TARGET_LONG_BITS,
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 3/7] target/loongarch: Add amcas[_db].{b/h/w/d}
  2025-11-19 12:24 [PATCH v2 0/7] Add LoongArch v1.1 instructions Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 1/7] target/loongarch: Require atomics to be aligned Jiajie Chen
  2025-11-19 12:24 ` [PATCH v2 2/7] target/loongarch: Add am{swap/add}[_db].{b/h} Jiajie Chen
@ 2025-11-19 12:24 ` Jiajie Chen
  2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
  3 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

The new instructions are introduced in LoongArch v1.1:

- amcas.b
- amcas.h
- amcas.w
- amcas.d
- amcas_db.b
- amcas_db.h
- amcas_db.w
- amcas_db.d

The new instructions are gated by CPUCFG2.LAMCAS.

Signed-off-by: Jiajie Chen <c@jia.je>
---
 target/loongarch/cpu.h                        |  1 +
 target/loongarch/disas.c                      |  8 ++++++
 target/loongarch/insns.decode                 |  8 ++++++
 .../tcg/insn_trans/trans_atomic.c.inc         | 25 +++++++++++++++++++
 target/loongarch/translate.h                  |  1 +
 5 files changed, 43 insertions(+)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index aa3d976875..9ca7af9b4a 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -139,6 +139,7 @@ FIELD(CPUCFG2, LSPW, 21, 1)
 FIELD(CPUCFG2, LAM, 22, 1)
 FIELD(CPUCFG2, HPTW, 24, 1)
 FIELD(CPUCFG2, LAM_BH, 27, 1)
+FIELD(CPUCFG2, LAMCAS, 28, 1)
 
 /* cpucfg[3] bits */
 FIELD(CPUCFG3, CCDMA, 0, 1)
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 1a0f527cb1..66c0cae5a9 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -580,6 +580,14 @@ INSN(fldx_s,       frr)
 INSN(fldx_d,       frr)
 INSN(fstx_s,       frr)
 INSN(fstx_d,       frr)
+INSN(amcas_b,      rrr)
+INSN(amcas_h,      rrr)
+INSN(amcas_w,      rrr)
+INSN(amcas_d,      rrr)
+INSN(amcas_db_b,   rrr)
+INSN(amcas_db_h,   rrr)
+INSN(amcas_db_w,   rrr)
+INSN(amcas_db_d,   rrr)
 INSN(amswap_b,     rrr)
 INSN(amswap_h,     rrr)
 INSN(amadd_b,      rrr)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index 678ce42038..cf4123cd46 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -261,6 +261,14 @@ ll_w            0010 0000 .............. ..... .....     @rr_i14s2
 sc_w            0010 0001 .............. ..... .....     @rr_i14s2
 ll_d            0010 0010 .............. ..... .....     @rr_i14s2
 sc_d            0010 0011 .............. ..... .....     @rr_i14s2
+amcas_b         0011 10000101 10000 ..... ..... .....    @rrr
+amcas_h         0011 10000101 10001 ..... ..... .....    @rrr
+amcas_w         0011 10000101 10010 ..... ..... .....    @rrr
+amcas_d         0011 10000101 10011 ..... ..... .....    @rrr
+amcas_db_b      0011 10000101 10100 ..... ..... .....    @rrr
+amcas_db_h      0011 10000101 10101 ..... ..... .....    @rrr
+amcas_db_w      0011 10000101 10110 ..... ..... .....    @rrr
+amcas_db_d      0011 10000101 10111 ..... ..... .....    @rrr
 amswap_b        0011 10000101 11000 ..... ..... .....    @rrr
 amswap_h        0011 10000101 11001 ..... ..... .....    @rrr
 amadd_b         0011 10000101 11010 ..... ..... .....    @rrr
diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
index 0d837d08b6..1b2673b82d 100644
--- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
@@ -45,6 +45,23 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop)
     return true;
 }
 
+static bool gen_cas(DisasContext *ctx, arg_rrr *a,
+                    void (*func)(TCGv, TCGv, TCGv, TCGv, TCGArg, MemOp),
+                    MemOp mop)
+{
+    TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
+    TCGv addr = gpr_src(ctx, a->rj, EXT_NONE);
+    TCGv val = gpr_src(ctx, a->rk, EXT_NONE);
+    TCGv old = gpr_src(ctx, a->rd, EXT_NONE);
+
+    addr = make_address_i(ctx, addr, 0);
+
+    func(dest, addr, old, val, ctx->mem_idx, mop | MO_ALIGN);
+    gen_set_gpr(a->rd, dest, EXT_NONE);
+
+    return true;
+}
+
 static bool gen_am(DisasContext *ctx, arg_rrr *a,
                    void (*func)(TCGv, TCGv, TCGv, TCGArg, MemOp),
                    MemOp mop)
@@ -73,6 +90,14 @@ TRANS(ll_w, ALL, gen_ll, MO_TESL)
 TRANS(sc_w, ALL, gen_sc, MO_TESL)
 TRANS(ll_d, 64, gen_ll, MO_TEUQ)
 TRANS(sc_d, 64, gen_sc, MO_TEUQ)
+TRANS(amcas_b, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_SB)
+TRANS(amcas_h, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESW)
+TRANS(amcas_w, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESL)
+TRANS(amcas_d, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TEUQ)
+TRANS(amcas_db_b, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_SB)
+TRANS(amcas_db_h, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESW)
+TRANS(amcas_db_w, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESL)
+TRANS(amcas_db_d, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TEUQ)
 TRANS(amswap_b, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_SB)
 TRANS(amswap_h, LAM_BH, gen_am, tcg_gen_atomic_xchg_tl, MO_TESW)
 TRANS(amadd_b, LAM_BH, gen_am, tcg_gen_atomic_fetch_add_tl, MO_SB)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index eb424bb0da..9ba3b425c1 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -27,6 +27,7 @@
 #define avail_LSPW(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSPW))
 #define avail_LAM(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAM))
 #define avail_LAM_BH(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAM_BH))
+#define avail_LAMCAS(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAMCAS))
 #define avail_LSX(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSX))
 #define avail_LASX(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LASX))
 #define avail_IOCSR(C)  (FIELD_EX32((C)->cpucfg1, CPUCFG1, IOCSR))
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions
  2025-11-19 12:24 [PATCH v2 0/7] Add LoongArch v1.1 instructions Jiajie Chen
                   ` (2 preceding siblings ...)
  2025-11-19 12:24 ` [PATCH v2 3/7] target/loongarch: Add amcas[_db].{b/h/w/d} Jiajie Chen
@ 2025-11-19 12:30 ` Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 5/7] target/loongarch: Add llacq/screl instructions Jiajie Chen
                     ` (2 more replies)
  3 siblings, 3 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Add the following new instructions in LoongArch v1.1:

- frecipe.s
- frecipe.d
- frsqrte.s
- frsqrte.d
- vfrecipe.s
- vfrecipe.d
- vfrsqrte.s
- vfrsqrte.d
- xvfrecipe.s
- xvfrecipe.d
- xvfrsqrte.s
- xvfrsqrte.d

They are guarded by CPUCFG2.FRECIPE. Although the instructions allow
implementations to improve performance by reducing precision, we use the
existing softfloat implementation.

Signed-off-by: Jiajie Chen <c@jia.je>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/cpu.h                             |  1 +
 target/loongarch/disas.c                           | 12 ++++++++++++
 target/loongarch/insns.decode                      | 12 ++++++++++++
 target/loongarch/tcg/insn_trans/trans_farith.c.inc |  4 ++++
 target/loongarch/tcg/insn_trans/trans_vec.c.inc    |  8 ++++++++
 target/loongarch/translate.h                       |  6 ++++++
 6 files changed, 43 insertions(+)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index 9ca7af9b4a..740e474d79 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -138,6 +138,7 @@ FIELD(CPUCFG2, LBT_ALL, 18, 3)
 FIELD(CPUCFG2, LSPW, 21, 1)
 FIELD(CPUCFG2, LAM, 22, 1)
 FIELD(CPUCFG2, HPTW, 24, 1)
+FIELD(CPUCFG2, FRECIPE, 25, 1)
 FIELD(CPUCFG2, LAM_BH, 27, 1)
 FIELD(CPUCFG2, LAMCAS, 28, 1)
 
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 66c0cae5a9..e5e1b37ce0 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -478,6 +478,10 @@ INSN(frecip_s,     ff)
 INSN(frecip_d,     ff)
 INSN(frsqrt_s,     ff)
 INSN(frsqrt_d,     ff)
+INSN(frecipe_s,    ff)
+INSN(frecipe_d,    ff)
+INSN(frsqrte_s,    ff)
+INSN(frsqrte_d,    ff)
 INSN(fmov_s,       ff)
 INSN(fmov_d,       ff)
 INSN(movgr2fr_w,   fr)
@@ -1429,6 +1433,10 @@ INSN_LSX(vfrecip_s,        vv)
 INSN_LSX(vfrecip_d,        vv)
 INSN_LSX(vfrsqrt_s,        vv)
 INSN_LSX(vfrsqrt_d,        vv)
+INSN_LSX(vfrecipe_s,       vv)
+INSN_LSX(vfrecipe_d,       vv)
+INSN_LSX(vfrsqrte_s,       vv)
+INSN_LSX(vfrsqrte_d,       vv)
 
 INSN_LSX(vfcvtl_s_h,       vv)
 INSN_LSX(vfcvth_s_h,       vv)
@@ -2343,6 +2351,10 @@ INSN_LASX(xvfrecip_s,        vv)
 INSN_LASX(xvfrecip_d,        vv)
 INSN_LASX(xvfrsqrt_s,        vv)
 INSN_LASX(xvfrsqrt_d,        vv)
+INSN_LASX(xvfrecipe_s,       vv)
+INSN_LASX(xvfrecipe_d,       vv)
+INSN_LASX(xvfrsqrte_s,       vv)
+INSN_LASX(xvfrsqrte_d,       vv)
 
 INSN_LASX(xvfcvtl_s_h,       vv)
 INSN_LASX(xvfcvth_s_h,       vv)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index cf4123cd46..92078f0f9f 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -371,6 +371,10 @@ frecip_s        0000 00010001 01000 10101 ..... .....    @ff
 frecip_d        0000 00010001 01000 10110 ..... .....    @ff
 frsqrt_s        0000 00010001 01000 11001 ..... .....    @ff
 frsqrt_d        0000 00010001 01000 11010 ..... .....    @ff
+frecipe_s       0000 00010001 01000 11101 ..... .....    @ff
+frecipe_d       0000 00010001 01000 11110 ..... .....    @ff
+frsqrte_s       0000 00010001 01001 00001 ..... .....    @ff
+frsqrte_d       0000 00010001 01001 00010 ..... .....    @ff
 fscaleb_s       0000 00010001 00001 ..... ..... .....    @fff
 fscaleb_d       0000 00010001 00010 ..... ..... .....    @fff
 flogb_s         0000 00010001 01000 01001 ..... .....    @ff
@@ -1115,6 +1119,10 @@ vfrecip_s        0111 00101001 11001 11101 ..... .....    @vv
 vfrecip_d        0111 00101001 11001 11110 ..... .....    @vv
 vfrsqrt_s        0111 00101001 11010 00001 ..... .....    @vv
 vfrsqrt_d        0111 00101001 11010 00010 ..... .....    @vv
+vfrecipe_s       0111 00101001 11010 00101 ..... .....    @vv
+vfrecipe_d       0111 00101001 11010 00110 ..... .....    @vv
+vfrsqrte_s       0111 00101001 11010 01001 ..... .....    @vv
+vfrsqrte_d       0111 00101001 11010 01010 ..... .....    @vv
 
 vfcvtl_s_h       0111 00101001 11011 11010 ..... .....    @vv
 vfcvth_s_h       0111 00101001 11011 11011 ..... .....    @vv
@@ -1879,6 +1887,10 @@ xvfrecip_s       0111 01101001 11001 11101 ..... .....    @vv
 xvfrecip_d       0111 01101001 11001 11110 ..... .....    @vv
 xvfrsqrt_s       0111 01101001 11010 00001 ..... .....    @vv
 xvfrsqrt_d       0111 01101001 11010 00010 ..... .....    @vv
+xvfrecipe_s      0111 01101001 11010 00101 ..... .....    @vv
+xvfrecipe_d      0111 01101001 11010 00110 ..... .....    @vv
+xvfrsqrte_s      0111 01101001 11010 01001 ..... .....    @vv
+xvfrsqrte_d      0111 01101001 11010 01010 ..... .....    @vv
 
 xvfcvtl_s_h      0111 01101001 11011 11010 ..... .....    @vv
 xvfcvth_s_h      0111 01101001 11011 11011 ..... .....    @vv
diff --git a/target/loongarch/tcg/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc
index ff6cf3448e..eed6ab7312 100644
--- a/target/loongarch/tcg/insn_trans/trans_farith.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_farith.c.inc
@@ -191,6 +191,10 @@ TRANS(frecip_s, FP_SP, gen_ff, gen_helper_frecip_s)
 TRANS(frecip_d, FP_DP, gen_ff, gen_helper_frecip_d)
 TRANS(frsqrt_s, FP_SP, gen_ff, gen_helper_frsqrt_s)
 TRANS(frsqrt_d, FP_DP, gen_ff, gen_helper_frsqrt_d)
+TRANS(frecipe_s, FRECIPE_FP_SP, gen_ff, gen_helper_frecip_s)
+TRANS(frecipe_d, FRECIPE_FP_DP, gen_ff, gen_helper_frecip_d)
+TRANS(frsqrte_s, FRECIPE_FP_SP, gen_ff, gen_helper_frsqrt_s)
+TRANS(frsqrte_d, FRECIPE_FP_DP, gen_ff, gen_helper_frsqrt_d)
 TRANS64(flogb_s, FP_SP, gen_ff, gen_helper_flogb_s)
 TRANS64(flogb_d, FP_DP, gen_ff, gen_helper_flogb_d)
 TRANS(fclass_s, FP_SP, gen_ff, gen_helper_fclass_s)
diff --git a/target/loongarch/tcg/insn_trans/trans_vec.c.inc b/target/loongarch/tcg/insn_trans/trans_vec.c.inc
index 38bccf2838..ef57abe408 100644
--- a/target/loongarch/tcg/insn_trans/trans_vec.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_vec.c.inc
@@ -4407,12 +4407,20 @@ TRANS(vfrecip_s, LSX, gen_vv_ptr, gen_helper_vfrecip_s)
 TRANS(vfrecip_d, LSX, gen_vv_ptr, gen_helper_vfrecip_d)
 TRANS(vfrsqrt_s, LSX, gen_vv_ptr, gen_helper_vfrsqrt_s)
 TRANS(vfrsqrt_d, LSX, gen_vv_ptr, gen_helper_vfrsqrt_d)
+TRANS(vfrecipe_s, FRECIPE_LSX, gen_vv_ptr, gen_helper_vfrecip_s)
+TRANS(vfrecipe_d, FRECIPE_LSX, gen_vv_ptr, gen_helper_vfrecip_d)
+TRANS(vfrsqrte_s, FRECIPE_LSX, gen_vv_ptr, gen_helper_vfrsqrt_s)
+TRANS(vfrsqrte_d, FRECIPE_LSX, gen_vv_ptr, gen_helper_vfrsqrt_d)
 TRANS(xvfsqrt_s, LASX, gen_xx_ptr, gen_helper_vfsqrt_s)
 TRANS(xvfsqrt_d, LASX, gen_xx_ptr, gen_helper_vfsqrt_d)
 TRANS(xvfrecip_s, LASX, gen_xx_ptr, gen_helper_vfrecip_s)
 TRANS(xvfrecip_d, LASX, gen_xx_ptr, gen_helper_vfrecip_d)
 TRANS(xvfrsqrt_s, LASX, gen_xx_ptr, gen_helper_vfrsqrt_s)
 TRANS(xvfrsqrt_d, LASX, gen_xx_ptr, gen_helper_vfrsqrt_d)
+TRANS(xvfrecipe_s, FRECIPE_LASX, gen_xx_ptr, gen_helper_vfrecip_s)
+TRANS(xvfrecipe_d, FRECIPE_LASX, gen_xx_ptr, gen_helper_vfrecip_d)
+TRANS(xvfrsqrte_s, FRECIPE_LASX, gen_xx_ptr, gen_helper_vfrsqrt_s)
+TRANS(xvfrsqrte_d, FRECIPE_LASX, gen_xx_ptr, gen_helper_vfrsqrt_d)
 
 TRANS(vfcvtl_s_h, LSX, gen_vv_ptr, gen_helper_vfcvtl_s_h)
 TRANS(vfcvth_s_h, LSX, gen_vv_ptr, gen_helper_vfcvth_s_h)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index 9ba3b425c1..331f79c8f2 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -33,6 +33,12 @@
 #define avail_IOCSR(C)  (FIELD_EX32((C)->cpucfg1, CPUCFG1, IOCSR))
 #define avail_CRC(C)    (FIELD_EX32((C)->cpucfg1, CPUCFG1, CRC))
 
+#define avail_FRECIPE(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, FRECIPE))
+#define avail_FRECIPE_FP_SP(C) (avail_FRECIPE(C) && avail_FP_SP(C))
+#define avail_FRECIPE_FP_DP(C) (avail_FRECIPE(C) && avail_FP_DP(C))
+#define avail_FRECIPE_LSX(C)   (avail_FRECIPE(C) && avail_LSX(C))
+#define avail_FRECIPE_LASX(C)   (avail_FRECIPE(C) && avail_LASX(C))
+
 /*
  * If an operation is being performed on less than TARGET_LONG_BITS,
  * it may require the inputs to be sign- or zero-extended; which will
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 5/7] target/loongarch: Add llacq/screl instructions
  2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
@ 2025-11-19 12:30   ` Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 6/7] target/loongarch: Add sc.q instructions Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 7/7] target/loongarch: Add LA v1.1 instructions to max cpu Jiajie Chen
  2 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Add the following instructions in LoongArch v1.1:

- llacq.w
- screl.w
- llacq.d
- screl.d

They are guarded by CPUCFG2.LLACQ_SCREL.

Signed-off-by: Jiajie Chen <c@jia.je>
Co-developed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/loongarch/cpu.h                        |  1 +
 target/loongarch/disas.c                      |  4 ++++
 target/loongarch/insns.decode                 |  5 ++++
 .../tcg/insn_trans/trans_atomic.c.inc         | 24 ++++++++++++++-----
 target/loongarch/translate.h                  |  3 +++
 5 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index 740e474d79..5cab02ad6f 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -141,6 +141,7 @@ FIELD(CPUCFG2, HPTW, 24, 1)
 FIELD(CPUCFG2, FRECIPE, 25, 1)
 FIELD(CPUCFG2, LAM_BH, 27, 1)
 FIELD(CPUCFG2, LAMCAS, 28, 1)
+FIELD(CPUCFG2, LLACQ_SCREL, 29, 1)
 
 /* cpucfg[3] bits */
 FIELD(CPUCFG3, CCDMA, 0, 1)
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index e5e1b37ce0..3164fade9b 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -584,6 +584,10 @@ INSN(fldx_s,       frr)
 INSN(fldx_d,       frr)
 INSN(fstx_s,       frr)
 INSN(fstx_d,       frr)
+INSN(llacq_w,      rr_i)
+INSN(screl_w,      rr_i)
+INSN(llacq_d,      rr_i)
+INSN(screl_d,      rr_i)
 INSN(amcas_b,      rrr)
 INSN(amcas_h,      rrr)
 INSN(amcas_w,      rrr)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index 92078f0f9f..7898f5f719 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -69,6 +69,7 @@
 @rr_i14s2         .... ....  .............. rj:5 rd:5    &rr_i imm=%i14s2
 @rr_i16                     .... .. imm:s16 rj:5 rd:5    &rr_i
 @rr_i16s2         .... ..  ................ rj:5 rd:5    &rr_i imm=%offs16
+@rr_i0            .... ..  ................ rj:5 rd:5    &rr_i imm=0
 @hint_r_i12           .... ...... imm:s12 rj:5 hint:5    &hint_r_i
 @hint_rr         .... ........ ..... rk:5 rj:5 hint:5    &hint_rr
 @rrr_sa2p1        .... ........ ... .. rk:5 rj:5 rd:5    &rrr_sa  sa=%sa2p1
@@ -261,6 +262,10 @@ ll_w            0010 0000 .............. ..... .....     @rr_i14s2
 sc_w            0010 0001 .............. ..... .....     @rr_i14s2
 ll_d            0010 0010 .............. ..... .....     @rr_i14s2
 sc_d            0010 0011 .............. ..... .....     @rr_i14s2
+llacq_w         0011 10000101 01111 00000 ..... .....    @rr_i0
+screl_w         0011 10000101 01111 00001 ..... .....    @rr_i0
+llacq_d         0011 10000101 01111 00010 ..... .....    @rr_i0
+screl_d         0011 10000101 01111 00011 ..... .....    @rr_i0
 amcas_b         0011 10000101 10000 ..... ..... .....    @rrr
 amcas_h         0011 10000101 10001 ..... ..... .....    @rrr
 amcas_w         0011 10000101 10010 ..... ..... .....    @rrr
diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
index 1b2673b82d..c9a6dcfdeb 100644
--- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
@@ -3,7 +3,7 @@
  * Copyright (c) 2021 Loongson Technology Corporation Limited
  */
 
-static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop)
+static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool acq)
 {
     TCGv t1 = tcg_temp_new();
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
@@ -14,10 +14,14 @@ static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop)
     tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval));
     gen_set_gpr(a->rd, t1, EXT_NONE);
 
+    if (acq) {
+        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+    }
+
     return true;
 }
 
-static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop)
+static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool rel)
 {
     TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
@@ -29,6 +33,10 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop)
     TCGLabel *done = gen_new_label();
 
     tcg_gen_addi_tl(t0, src1, a->imm);
+
+    if (rel) {
+        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+    }
     tcg_gen_brcond_tl(TCG_COND_EQ, t0, cpu_lladdr, l1);
     tcg_gen_movi_tl(dest, 0);
     tcg_gen_br(done);
@@ -86,10 +94,14 @@ static bool gen_am(DisasContext *ctx, arg_rrr *a,
     return true;
 }
 
-TRANS(ll_w, ALL, gen_ll, MO_TESL)
-TRANS(sc_w, ALL, gen_sc, MO_TESL)
-TRANS(ll_d, 64, gen_ll, MO_TEUQ)
-TRANS(sc_d, 64, gen_sc, MO_TEUQ)
+TRANS(ll_w, ALL, gen_ll, MO_TESL, false)
+TRANS(sc_w, ALL, gen_sc, MO_TESL, false)
+TRANS(ll_d, 64, gen_ll, MO_TEUQ, false)
+TRANS(sc_d, 64, gen_sc, MO_TEUQ, false)
+TRANS(llacq_w, LLACQ_SCREL, gen_ll, MO_TESL, true)
+TRANS(screl_w, LLACQ_SCREL, gen_sc, MO_TESL, true)
+TRANS(llacq_d, LLACQ_SCREL_64, gen_ll, MO_TEUQ, true)
+TRANS(screl_d, LLACQ_SCREL_64, gen_sc, MO_TEUQ, true)
 TRANS(amcas_b, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_SB)
 TRANS(amcas_h, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESW)
 TRANS(amcas_w, LAMCAS, gen_cas, tcg_gen_atomic_cmpxchg_tl, MO_TESL)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index 331f79c8f2..76bceedf98 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -39,6 +39,9 @@
 #define avail_FRECIPE_LSX(C)   (avail_FRECIPE(C) && avail_LSX(C))
 #define avail_FRECIPE_LASX(C)   (avail_FRECIPE(C) && avail_LASX(C))
 
+#define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
+#define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
+
 /*
  * If an operation is being performed on less than TARGET_LONG_BITS,
  * it may require the inputs to be sign- or zero-extended; which will
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 6/7] target/loongarch: Add sc.q instructions
  2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 5/7] target/loongarch: Add llacq/screl instructions Jiajie Chen
@ 2025-11-19 12:30   ` Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 7/7] target/loongarch: Add LA v1.1 instructions to max cpu Jiajie Chen
  2 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Add the sc.q instruction from LoongArch v1.1, guarded by CPUCFG2.SCQ.
It is implemented by having ll.d read 128 bits of data (stored in llval
and llval_high) when its address is aligned to a 16-byte boundary, and
by performing a 128-bit cmpxchg in sc.q. If a subsequent ld.d accesses
the upper half of that 128-bit block, its data is taken from llval_high.

Expected assembly sequence:

ll.d lo, base, 0
ld.d hi, base, 8
sc.q lo, hi, base

Signed-off-by: Jiajie Chen <c@jia.je>
---
 target/loongarch/cpu.h                        |  3 +
 target/loongarch/disas.c                      |  1 +
 target/loongarch/insns.decode                 |  1 +
 .../tcg/insn_trans/trans_atomic.c.inc         | 82 +++++++++++++++++++
 .../tcg/insn_trans/trans_memory.c.inc         | 22 +++++
 target/loongarch/tcg/translate.c              |  6 +-
 target/loongarch/translate.h                  |  1 +
 7 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index 5cab02ad6f..0a89c06b01 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -142,6 +142,7 @@ FIELD(CPUCFG2, FRECIPE, 25, 1)
 FIELD(CPUCFG2, LAM_BH, 27, 1)
 FIELD(CPUCFG2, LAMCAS, 28, 1)
 FIELD(CPUCFG2, LLACQ_SCREL, 29, 1)
+FIELD(CPUCFG2, SCQ, 30, 1)
 
 /* cpucfg[3] bits */
 FIELD(CPUCFG3, CCDMA, 0, 1)
@@ -377,6 +378,8 @@ typedef struct CPUArchState {
     uint32_t fcsr0_mask;
     uint64_t lladdr; /* LL virtual address compared against SC */
     uint64_t llval;
+    uint64_t llval_high; /* For 128-bit atomic SC.Q */
+    uint64_t llbit_scq; /* Potential LL.D+LD.D+SC.Q sequence in effect */
 #endif
 #ifndef CONFIG_USER_ONLY
 #ifdef CONFIG_TCG
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 3164fade9b..3249ab7ac6 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -584,6 +584,7 @@ INSN(fldx_s,       frr)
 INSN(fldx_d,       frr)
 INSN(fstx_s,       frr)
 INSN(fstx_d,       frr)
+INSN(sc_q,         rrr)
 INSN(llacq_w,      rr_i)
 INSN(screl_w,      rr_i)
 INSN(llacq_d,      rr_i)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index 7898f5f719..3089d42044 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -262,6 +262,7 @@ ll_w            0010 0000 .............. ..... .....     @rr_i14s2
 sc_w            0010 0001 .............. ..... .....     @rr_i14s2
 ll_d            0010 0010 .............. ..... .....     @rr_i14s2
 sc_d            0010 0011 .............. ..... .....     @rr_i14s2
+sc_q            0011 10000101 01110 ..... ..... .....    @rrr
 llacq_w         0011 10000101 01111 00000 ..... .....    @rr_i0
 screl_w         0011 10000101 01111 00001 ..... .....    @rr_i0
 llacq_d         0011 10000101 01111 00010 ..... .....    @rr_i0
diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
index c9a6dcfdeb..565daa7219 100644
--- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc
@@ -6,14 +6,48 @@
 static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool acq)
 {
     TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
     TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
     TCGv t0 = make_address_i(ctx, src1, a->imm);
+    TCGv_i128 t16 = tcg_temp_new_i128();
+    TCGv mask = tcg_constant_tl(0xf);
+    TCGv one = tcg_constant_tl(1);
+    TCGv zero = tcg_constant_tl(0);
+    TCGLabel *l1 = gen_new_label();
+    TCGLabel *done = gen_new_label();
+
+    if (avail_SCQ(ctx) && mop == MO_TEUQ) {
+        /*
+         * The LL.D+LD.D may be paired with SC.Q,
+         * load 128-bit if aligned: (t0 & 0xf) == 0
+         */
+        tcg_gen_and_tl(t1, t0, mask);
+        tcg_gen_brcond_tl(TCG_COND_EQ, t1, zero, l1);
+        /* fallthrough if not aligned to 16B */
+    }
 
     tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop | MO_ALIGN);
     tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr));
     tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval));
     gen_set_gpr(a->rd, t1, EXT_NONE);
 
+    if (avail_SCQ(ctx) && mop == MO_TEUQ) {
+        tcg_gen_br(done);
+
+        gen_set_label(l1);
+
+        /* Load 16B data and save into llval/llval_high */
+        tcg_gen_qemu_ld_i128(t16, t0, ctx->mem_idx, MO_128 | MO_ALIGN);
+        tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr));
+        tcg_gen_extr_i128_i64(t1, t2, t16);
+        tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval));
+        tcg_gen_st_tl(t2, tcg_env, offsetof(CPULoongArchState, llval_high));
+        tcg_gen_st_tl(one, tcg_env, offsetof(CPULoongArchState, llbit_scq));
+        gen_set_gpr(a->rd, t1, EXT_NONE);
+
+        gen_set_label(done);
+    }
+
     if (acq) {
         tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
     }
@@ -28,6 +62,7 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool rel)
     TCGv src2 = gpr_src(ctx, a->rd, EXT_NONE);
     TCGv t0 = tcg_temp_new();
     TCGv val = tcg_temp_new();
+    TCGv zero = tcg_constant_tl(0);
 
     TCGLabel *l1 = gen_new_label();
     TCGLabel *done = gen_new_label();
@@ -37,6 +72,11 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool rel)
     if (rel) {
         tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
     }
+
+    if (avail_SCQ(ctx)) {
+        tcg_gen_st_tl(zero, tcg_env, offsetof(CPULoongArchState, llbit_scq));
+    }
+
     tcg_gen_brcond_tl(TCG_COND_EQ, t0, cpu_lladdr, l1);
     tcg_gen_movi_tl(dest, 0);
     tcg_gen_br(done);
@@ -53,6 +93,47 @@ static bool gen_sc(DisasContext *ctx, arg_rr_i *a, MemOp mop, bool rel)
     return true;
 }
 
+static bool gen_sc_q(DisasContext *ctx, arg_rrr *a, MemOp mop)
+{
+    TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
+    TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE);
+    TCGv src2 = gpr_src(ctx, a->rd, EXT_NONE);
+    TCGv src3 = gpr_src(ctx, a->rk, EXT_NONE);
+    TCGv_i128 t16 = tcg_temp_new_i128();
+    TCGv_i128 c16 = tcg_temp_new_i128();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv zero = tcg_constant_tl(0);
+
+    TCGLabel *l1 = gen_new_label();
+    TCGLabel *done = gen_new_label();
+
+    tcg_gen_st_tl(zero, tcg_env, offsetof(CPULoongArchState, llbit_scq));
+    tcg_gen_brcond_tl(TCG_COND_EQ, src1, cpu_lladdr, l1);
+    tcg_gen_movi_tl(dest, 0);
+    tcg_gen_br(done);
+
+    gen_set_label(l1);
+    tcg_gen_concat_i64_i128(t16, src2, src3);
+    tcg_gen_concat_i64_i128(c16, cpu_llval,
+                            cpu_llval_high);
+
+    /* generate cmpxchg */
+    tcg_gen_atomic_cmpxchg_i128(t16, cpu_lladdr, c16,
+                              t16, ctx->mem_idx, mop | MO_ALIGN);
+
+    /* check if success */
+    tcg_gen_extr_i128_i64(t1, t2, t16);
+    tcg_gen_xor_i64(t1, t1, cpu_llval);
+    tcg_gen_xor_i64(t2, t2, cpu_llval_high);
+    tcg_gen_or_i64(t1, t1, t2);
+    tcg_gen_setcondi_i64(TCG_COND_EQ, dest, t1, 0);
+    gen_set_label(done);
+    gen_set_gpr(a->rd, dest, EXT_NONE);
+
+    return true;
+}
+
 static bool gen_cas(DisasContext *ctx, arg_rrr *a,
                     void (*func)(TCGv, TCGv, TCGv, TCGv, TCGArg, MemOp),
                     MemOp mop)
@@ -98,6 +179,7 @@ TRANS(ll_w, ALL, gen_ll, MO_TESL, false)
 TRANS(sc_w, ALL, gen_sc, MO_TESL, false)
 TRANS(ll_d, 64, gen_ll, MO_TEUQ, false)
 TRANS(sc_d, 64, gen_sc, MO_TEUQ, false)
+TRANS(sc_q, SCQ, gen_sc_q, MO_128) /* gated on CPUCFG2.SCQ */
 TRANS(llacq_w, LLACQ_SCREL, gen_ll, MO_TESL, true)
 TRANS(screl_w, LLACQ_SCREL, gen_sc, MO_TESL, true)
 TRANS(llacq_d, LLACQ_SCREL_64, gen_ll, MO_TEUQ, true)
diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
index 42f4e74012..8b3c1b037c 100644
--- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
@@ -7,11 +7,33 @@ static bool gen_load(DisasContext *ctx, arg_rr_i *a, MemOp mop)
 {
     TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE);
     TCGv addr = gpr_src(ctx, a->rj, EXT_NONE);
+    TCGv t1 = tcg_temp_new();
+    TCGv mask = tcg_constant_tl(0x8);
+    TCGv zero = tcg_constant_tl(0);
+    TCGLabel *done = gen_new_label();
+    TCGLabel *l1 = gen_new_label();
 
     addr = make_address_i(ctx, addr, a->imm);
 
+    if (avail_SCQ(ctx) && mop == MO_TEUQ) {
+        /*
+         * The LL.D+LD.D may be paired with SC.Q,
+         * use llval_high if llbit_scq && (addr == lladdr ^ 0x8)
+         */
+        tcg_gen_brcond_tl(TCG_COND_EQ, cpu_llbit_scq, zero, l1);
+        tcg_gen_xor_tl(t1, addr, mask);
+        tcg_gen_brcond_tl(TCG_COND_NE, cpu_lladdr, t1, l1);
+        gen_set_gpr(a->rd, cpu_llval_high, EXT_NONE);
+        tcg_gen_br(done);
+        gen_set_label(l1);
+    }
+
     tcg_gen_qemu_ld_tl(dest, addr, ctx->mem_idx, mop);
     gen_set_gpr(a->rd, dest, EXT_NONE);
+
+    if (avail_SCQ(ctx) && mop == MO_TEUQ) {
+        gen_set_label(done);
+    }
     return true;
 }
 
diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
index 055f6fb604..2023f892be 100644
--- a/target/loongarch/tcg/translate.c
+++ b/target/loongarch/tcg/translate.c
@@ -24,7 +24,7 @@
 
 /* Global register indices */
 TCGv cpu_gpr[32], cpu_pc;
-static TCGv cpu_lladdr, cpu_llval;
+static TCGv cpu_lladdr, cpu_llval, cpu_llval_high, cpu_llbit_scq;
 
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
@@ -360,6 +360,10 @@ void loongarch_translate_init(void)
                     offsetof(CPULoongArchState, lladdr), "lladdr");
     cpu_llval = tcg_global_mem_new(tcg_env,
                     offsetof(CPULoongArchState, llval), "llval");
+    cpu_llval_high = tcg_global_mem_new(tcg_env,
+                    offsetof(CPULoongArchState, llval_high), "llval_high");
+    cpu_llbit_scq = tcg_global_mem_new(tcg_env,
+                    offsetof(CPULoongArchState, llbit_scq), "llbit_scq");
 
 #ifndef CONFIG_USER_ONLY
     loongarch_csr_translate_init();
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index 76bceedf98..ba1c89e57b 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -30,6 +30,7 @@
 #define avail_LAMCAS(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LAMCAS))
 #define avail_LSX(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LSX))
 #define avail_LASX(C)   (FIELD_EX32((C)->cpucfg2, CPUCFG2, LASX))
+#define avail_SCQ(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, SCQ))
 #define avail_IOCSR(C)  (FIELD_EX32((C)->cpucfg1, CPUCFG1, IOCSR))
 #define avail_CRC(C)    (FIELD_EX32((C)->cpucfg1, CPUCFG1, CRC))
 
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 7/7] target/loongarch: Add LA v1.1 instructions to max cpu
  2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 5/7] target/loongarch: Add llacq/screl instructions Jiajie Chen
  2025-11-19 12:30   ` [PATCH v2 6/7] target/loongarch: Add sc.q instructions Jiajie Chen
@ 2025-11-19 12:30   ` Jiajie Chen
  2 siblings, 0 replies; 8+ messages in thread
From: Jiajie Chen @ 2025-11-19 12:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson, gaosong, git, Jiajie Chen

Add the new LA v1.1 instructions to the max cpu by enabling the
corresponding feature bits in CPUCFG2.

Signed-off-by: Jiajie Chen <c@jia.je>
---
 target/loongarch/cpu.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index d74c3c3766..2c357f9342 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -413,7 +413,16 @@ static void loongarch_max_initfn(Object *obj)
     if (tcg_enabled()) {
         cpu->env.cpucfg[1] = FIELD_DP32(cpu->env.cpucfg[1], CPUCFG1, MSG_INT, 1);
         cpu->msgint = ON_OFF_AUTO_AUTO;
-        cpu->env.cpucfg[2] = FIELD_DP32(cpu->env.cpucfg[2], CPUCFG2, HPTW, 1);
+
+        uint32_t data = cpu->env.cpucfg[2];
+        data = FIELD_DP32(data, CPUCFG2, HPTW, 1);
+        /* Enable LA v1.1 instructions */
+        data = FIELD_DP32(data, CPUCFG2, FRECIPE, 1);
+        data = FIELD_DP32(data, CPUCFG2, LAM_BH, 1);
+        data = FIELD_DP32(data, CPUCFG2, LAMCAS, 1);
+        data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
+        data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
+        cpu->env.cpucfg[2] = data;
     }
 }
 
-- 
2.51.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-11-19 12:33 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-19 12:24 [PATCH v2 0/7] Add LoongArch v1.1 instructions Jiajie Chen
2025-11-19 12:24 ` [PATCH v2 1/7] target/loongarch: Require atomics to be aligned Jiajie Chen
2025-11-19 12:24 ` [PATCH v2 2/7] target/loongarch: Add am{swap/add}[_db].{b/h} Jiajie Chen
2025-11-19 12:24 ` [PATCH v2 3/7] target/loongarch: Add amcas[_db].{b/h/w/d} Jiajie Chen
2025-11-19 12:30 ` [PATCH v2 4/7] target/loongarch: Add estimated reciprocal instructions Jiajie Chen
2025-11-19 12:30   ` [PATCH v2 5/7] target/loongarch: Add llacq/screl instructions Jiajie Chen
2025-11-19 12:30   ` [PATCH v2 6/7] target/loongarch: Add sc.q instructions Jiajie Chen
2025-11-19 12:30   ` [PATCH v2 7/7] target/loongarch: Add LA v1.1 instructions to max cpu Jiajie Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).