[PULL 26/41] target/arm: Implement v8.1M low-overhead-loop instructions

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PULL 26/41] target/arm: Implement v8.1M low-overhead-loop instructions
Date: Tue, 20 Oct 2020 16:56:41 +0100	[thread overview]
Message-ID: <20201020155656.8045-27-peter.maydell@linaro.org> (raw)
In-Reply-To: <20201020155656.8045-1-peter.maydell@linaro.org>

v8.1M's "low-overhead-loop" extension has three instructions
for looping:
 * DLS (start of a do-loop)
 * WLS (start of a while-loop)
 * LE (end of a loop)

The loop-start instructions are both simple operations to start a
loop whose iteration count (if any) is in LR.  The loop-end
instruction handles "decrement iteration count and jump back to loop
start"; it also caches the information about the branch back to the
start of the loop to improve performance of the branch on subsequent
iterations.

As with the branch-future instructions, the architecture permits an
implementation to discard the LO_BRANCH_INFO cache at any time, and
QEMU takes the IMPDEF option to never set it in the first place
(equivalent to discarding it immediately), because for us a "real"
implementation would be unnecessary complexity.

(This implementation only provides the simple looping constructs; the
vector extension MVE (Helium) adds some extra variants to handle
looping across vectors.  We'll add those later when we implement
MVE.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20201019151301.2046-8-peter.maydell@linaro.org
---
 target/arm/t32.decode  |  8 ++++
 target/arm/translate.c | 93 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index 3015731a8d0..8152739b52b 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -659,4 +659,12 @@ BL               1111 0. .......... 11.1 ............         @branch24
     BF           1111 0 boff:4 10 ----- 1110 - ---------- 1    # BF
     BF           1111 0 boff:4 11 ----- 1110 0 0000000000 1    # BFX, BFLX
   ]
+  [
+    # LE and WLS immediate
+    %lob_imm 1:10 11:1 !function=times_2
+
+    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
+    WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm
+    LE           1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
+  ]
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index a5ebe568804..38371db5401 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -2490,17 +2490,23 @@ static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static inline void gen_jmp (DisasContext *s, uint32_t dest)
+/* Jump, specifying which TB number to use if we gen_goto_tb() */
+static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
 {
     if (unlikely(is_singlestepping(s))) {
         /* An indirect jump so that we still trigger the debug exception.  */
         gen_set_pc_im(s, dest);
         s->base.is_jmp = DISAS_JUMP;
     } else {
-        gen_goto_tb(s, 0, dest);
+        gen_goto_tb(s, tbno, dest);
     }
 }
 
+static inline void gen_jmp(DisasContext *s, uint32_t dest)
+{
+    gen_jmp_tb(s, dest, 0);
+}
+
 static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y)
 {
     if (x)
@@ -7991,6 +7997,89 @@ static bool trans_BF(DisasContext *s, arg_BF *a)
     return true;
 }
 
+static bool trans_DLS(DisasContext *s, arg_DLS *a)
+{
+    /* M-profile low-overhead loop start */
+    TCGv_i32 tmp;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+    if (a->rn == 13 || a->rn == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+        return false;
+    }
+
+    /* Not a while loop, no tail predication: just set LR to the count */
+    tmp = load_reg(s, a->rn);
+    store_reg(s, 14, tmp);
+    return true;
+}
+
+static bool trans_WLS(DisasContext *s, arg_WLS *a)
+{
+    /* M-profile low-overhead while-loop start */
+    TCGv_i32 tmp;
+    TCGLabel *nextlabel;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+    if (a->rn == 13 || a->rn == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+        return false;
+    }
+    if (s->condexec_mask) {
+        /*
+         * WLS in an IT block is CONSTRAINED UNPREDICTABLE;
+         * we choose to UNDEF, because otherwise our use of
+         * gen_goto_tb(1) would clash with the use of TB exit 1
+         * in the dc->condjmp condition-failed codepath in
+         * arm_tr_tb_stop() and we'd get an assertion.
+         */
+        return false;
+    }
+    nextlabel = gen_new_label();
+    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel);
+    tmp = load_reg(s, a->rn);
+    store_reg(s, 14, tmp);
+    gen_jmp_tb(s, s->base.pc_next, 1);
+
+    gen_set_label(nextlabel);
+    gen_jmp(s, read_pc(s) + a->imm);
+    return true;
+}
+
+static bool trans_LE(DisasContext *s, arg_LE *a)
+{
+    /*
+     * M-profile low-overhead loop end. The architecture permits an
+     * implementation to discard the LO_BRANCH_INFO cache at any time,
+     * and we take the IMPDEF option to never set it in the first place
+     * (equivalent to always discarding it immediately), because for QEMU
+     * a "real" implementation would be complicated and wouldn't execute
+     * any faster.
+     */
+    TCGv_i32 tmp;
+
+    if (!dc_isar_feature(aa32_lob, s)) {
+        return false;
+    }
+
+    if (!a->f) {
+        /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
+        arm_gen_condlabel(s);
+        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel);
+        /* Decrement LR */
+        tmp = load_reg(s, 14);
+        tcg_gen_addi_i32(tmp, tmp, -1);
+        store_reg(s, 14, tmp);
+    }
+    /* Jump back to the loop start */
+    gen_jmp(s, read_pc(s) - a->imm);
+    return true;
+}
+
 static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
 {
     TCGv_i32 addr, tmp;
-- 
2.20.1

next prev parent reply	other threads:[~2020-10-20 16:18 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-20 15:56 [PULL 00/41] target-arm queue Peter Maydell
2020-10-20 15:56 ` [PULL 01/41] target/arm: Fix SMLAD incorrect setting of Q bit Peter Maydell
2020-10-20 15:56 ` [PULL 02/41] target/arm: AArch32 VCVT fixed-point to float is always round-to-nearest Peter Maydell
2020-10-20 15:56 ` [PULL 03/41] hw/arm/strongarm: Fix 'time to transmit a char' unit comment Peter Maydell
2020-10-20 15:56 ` [PULL 04/41] hw/arm: Restrict APEI tables generation to the 'virt' machine Peter Maydell
2020-10-20 15:56 ` [PULL 05/41] hw/timer/bcm2835: Introduce BCM2835_SYSTIMER_COUNT definition Peter Maydell
2020-10-20 15:56 ` [PULL 06/41] hw/timer/bcm2835: Rename variable holding CTRL_STATUS register Peter Maydell
2020-10-20 15:56 ` [PULL 07/41] hw/timer/bcm2835: Support the timer COMPARE registers Peter Maydell
2020-10-20 15:56 ` [PULL 08/41] hw/arm/bcm2835_peripherals: Correctly wire the SYS_timer IRQs Peter Maydell
2020-10-20 15:56 ` [PULL 09/41] accel/tcg: Add tlb_flush_page_bits_by_mmuidx* Peter Maydell
2020-10-20 15:56 ` [PULL 10/41] target/arm: Use tlb_flush_page_bits_by_mmuidx* Peter Maydell
2020-10-20 15:56 ` [PULL 11/41] tests/qtest: Add npcm7xx timer test Peter Maydell
2020-10-20 15:56 ` [PULL 12/41] loads-stores.rst: add footnote that clarifies GETPC usage Peter Maydell
2020-10-20 15:56 ` [PULL 13/41] hw/intc/bcm2835_ic: Trace GPU/CPU IRQ handlers Peter Maydell
2020-10-20 15:56 ` [PULL 14/41] hw/intc/bcm2836_control: Use IRQ definitions instead of magic numbers Peter Maydell
2020-10-20 15:56 ` [PULL 15/41] target/arm: Remove redundant mmu_idx lookup Peter Maydell
2020-10-20 15:56 ` [PULL 16/41] target/arm: Fix reported EL for mte_check_fail Peter Maydell
2020-10-20 15:56 ` [PULL 17/41] target/arm: Ignore HCR_EL2.ATA when {E2H,TGE} != 11 Peter Maydell
2020-10-20 15:56 ` [PULL 18/41] microbit_i2c: Fix coredump when dump-vmstate Peter Maydell
2020-10-20 15:56 ` [PULL 19/41] hw/arm/nseries: Fix loading kernel image on n8x0 machines Peter Maydell
2020-10-20 15:56 ` [PULL 20/41] decodetree: Fix codegen for non-overlapping group inside overlapping group Peter Maydell
2020-10-20 15:56 ` [PULL 21/41] target/arm: Implement v8.1M NOCP handling Peter Maydell
2020-10-20 15:56 ` [PULL 22/41] target/arm: Implement v8.1M conditional-select insns Peter Maydell
2020-10-20 15:56 ` [PULL 23/41] target/arm: Make the t32 insn[25:23]=111 group non-overlapping Peter Maydell
2020-10-20 15:56 ` [PULL 24/41] target/arm: Don't allow BLX imm for M-profile Peter Maydell
2020-10-20 15:56 ` [PULL 25/41] target/arm: Implement v8.1M branch-future insns (as NOPs) Peter Maydell
2020-10-20 15:56 ` Peter Maydell [this message]
2020-10-20 15:56 ` [PULL 27/41] target/arm: Fix has_vfp/has_neon ID reg squashing for M-profile Peter Maydell
2020-10-20 15:56 ` [PULL 28/41] target/arm: Allow M-profile CPUs with FP16 to set FPSCR.FP16 Peter Maydell
2020-10-20 15:56 ` [PULL 29/41] target/arm: Implement FPSCR.LTPSIZE for M-profile LOB extension Peter Maydell
2020-10-20 15:56 ` [PULL 30/41] linux-user/aarch64: Reset btype for signals Peter Maydell
2020-10-20 15:56 ` [PULL 31/41] linux-user: Set PAGE_TARGET_1 for TARGET_PROT_BTI Peter Maydell
2020-10-20 15:56 ` [PULL 32/41] include/elf: Add defines related to GNU property notes for AArch64 Peter Maydell
2020-10-20 15:56 ` [PULL 33/41] linux-user/elfload: Avoid leaking interp_name using GLib memory API Peter Maydell
2020-10-20 15:56 ` [PULL 34/41] linux-user/elfload: Fix coding style in load_elf_image Peter Maydell
2020-10-20 15:56 ` [PULL 35/41] linux-user/elfload: Adjust iteration over phdr Peter Maydell
2020-10-20 15:56 ` [PULL 36/41] linux-user/elfload: Move PT_INTERP detection to first loop Peter Maydell
2020-10-20 15:56 ` [PULL 37/41] linux-user/elfload: Use Error for load_elf_image Peter Maydell
2020-10-20 15:56 ` [PULL 38/41] linux-user/elfload: Use Error for load_elf_interp Peter Maydell
2020-10-20 15:56 ` [PULL 39/41] linux-user/elfload: Parse NT_GNU_PROPERTY_TYPE_0 notes Peter Maydell
2020-10-20 15:56 ` [PULL 40/41] linux-user/elfload: Parse GNU_PROPERTY_AARCH64_FEATURE_1_AND Peter Maydell
2020-10-20 15:56 ` [PULL 41/41] tests/tcg/aarch64: Add bti smoke tests Peter Maydell
2020-10-20 16:36 ` [PULL 00/41] target-arm queue Philippe Mathieu-Daudé
2020-10-20 16:36 ` no-reply

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:3015731a8d dfblob:8152739b52 dfblob:a5ebe56880
dfblob:38371db540 )
 OR (
bs:"[PULL 26/41] target/arm: Implement v8.1M low-overhead-loop instructions" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201020155656.8045-27-peter.maydell@linaro.org \
    --to=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).