qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif)
@ 2025-05-08  9:48 Jim Shu
  2025-06-02  8:55 ` Jim Shu
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Jim Shu @ 2025-05-08  9:48 UTC (permalink / raw)
  To: qemu-devel, qemu-riscv
  Cc: Palmer Dabbelt, Alistair Francis, Weiwei Li,
	Daniel Henrique Barboza, Liu Zhiwei, Jim Shu

Support 4-byte atomic instruction fetch when instruction is natural
aligned.

Current implementation is not atomic because it loads instruction twice
for first and last 2 bytes. We load 4 bytes at once to keep the
atomicity. This instruction preload method only applys when instruction
is 4-byte aligned. If instruction is unaligned, it could be across pages
so that preload will trigger additional page fault.

We encounter this issue when doing pressure test of enabling & disabling
Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
modification and execution of instruction, so non-atomic instruction
fetch will cause the race condition. We may fetch the wrong instruction
which is the mixing of 2 instructions.

Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
Ziccif protects the atomicity of instruction fetch when it is
natural aligned.

This commit depends on the atomic read support of translator_ld in
the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6.

Signed-off-by: Jim Shu <jim.shu@sifive.com>
Reviewed-by: Frank Chang <frank.chang@sifive.com>
---
 target/riscv/translate.c | 46 +++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 85128f997b..77edf04803 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = {
 
 const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
 
-static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
+static void decode_opc(CPURISCVState *env, DisasContext *ctx)
 {
+    uint32_t opcode;
+    bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0);
+
     ctx->virt_inst_excp = false;
-    ctx->cur_insn_len = insn_len(opcode);
+    if (pc_is_4byte_align) {
+        /*
+         * Load 4 bytes at once to make instruction fetch atomically.
+         *
+         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
+         * across pages. We could preload 4 bytes instruction no matter
+         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
+         * additional page fault.
+         */
+        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
+    } else {
+        /*
+         * For unaligned pc, instruction preload may trigger additional
+         * page fault so we only load 2 bytes here.
+         */
+        opcode = (uint32_t) translator_lduw(env, &ctx->base, ctx->base.pc_next);
+    }
+    ctx->ol = ctx->xl;
+
+    ctx->cur_insn_len = insn_len((uint16_t)opcode);
     /* Check for compressed insn */
     if (ctx->cur_insn_len == 2) {
-        ctx->opcode = opcode;
+        ctx->opcode = (uint16_t)opcode;
         /*
          * The Zca extension is added as way to refer to instructions in the C
          * extension that do not include the floating-point loads and stores
@@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
             return;
         }
     } else {
-        uint32_t opcode32 = opcode;
-        opcode32 = deposit32(opcode32, 16, 16,
-                             translator_lduw(env, &ctx->base,
-                                             ctx->base.pc_next + 2));
-        ctx->opcode = opcode32;
+        if (!pc_is_4byte_align) {
+            /* Load last 2 bytes of instruction here */
+            opcode = deposit32(opcode, 16, 16,
+                               translator_lduw(env, &ctx->base,
+                                               ctx->base.pc_next + 2));
+        }
+        ctx->opcode = opcode;
 
         for (guint i = 0; i < ctx->decoders->len; ++i) {
             riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
-            if (func(ctx, opcode32)) {
+            if (func(ctx, opcode)) {
                 return;
             }
         }
@@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *ctx = container_of(dcbase, DisasContext, base);
     CPURISCVState *env = cpu_env(cpu);
-    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
 
-    ctx->ol = ctx->xl;
-    decode_opc(env, ctx, opcode16);
+    decode_opc(env, ctx);
     ctx->base.pc_next += ctx->cur_insn_len;
 
     /*
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif)
  2025-05-08  9:48 [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif) Jim Shu
@ 2025-06-02  8:55 ` Jim Shu
  2025-06-05  4:21 ` Alistair Francis
  2025-06-05  6:38 ` Alistair Francis
  2 siblings, 0 replies; 4+ messages in thread
From: Jim Shu @ 2025-06-02  8:55 UTC (permalink / raw)
  To: qemu-devel, qemu-riscv
  Cc: Palmer Dabbelt, Alistair Francis, Weiwei Li,
	Daniel Henrique Barboza, Liu Zhiwei

Hi,

Gentle ping on this patch.

Thanks,
Jim Shu


On Thu, May 8, 2025 at 5:48 PM Jim Shu <jim.shu@sifive.com> wrote:
>
> Support 4-byte atomic instruction fetch when instruction is natural
> aligned.
>
> Current implementation is not atomic because it loads instruction twice
> for first and last 2 bytes. We load 4 bytes at once to keep the
> atomicity. This instruction preload method only applys when instruction
> is 4-byte aligned. If instruction is unaligned, it could be across pages
> so that preload will trigger additional page fault.
>
> We encounter this issue when doing pressure test of enabling & disabling
> Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> modification and execution of instruction, so non-atomic instruction
> fetch will cause the race condition. We may fetch the wrong instruction
> which is the mixing of 2 instructions.
>
> Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> Ziccif protects the atomicity of instruction fetch when it is
> natural aligned.
>
> This commit depends on the atomic read support of translator_ld in
> the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6.
>
> Signed-off-by: Jim Shu <jim.shu@sifive.com>
> Reviewed-by: Frank Chang <frank.chang@sifive.com>
> ---
>  target/riscv/translate.c | 46 +++++++++++++++++++++++++++++-----------
>  1 file changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 85128f997b..77edf04803 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = {
>
>  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
>
> -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
> +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
>  {
> +    uint32_t opcode;
> +    bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0);
> +
>      ctx->virt_inst_excp = false;
> -    ctx->cur_insn_len = insn_len(opcode);
> +    if (pc_is_4byte_align) {
> +        /*
> +         * Load 4 bytes at once to make instruction fetch atomically.
> +         *
> +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> +         * across pages. We could preload 4 bytes instruction no matter
> +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> +         * additional page fault.
> +         */
> +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> +    } else {
> +        /*
> +         * For unaligned pc, instruction preload may trigger additional
> +         * page fault so we only load 2 bytes here.
> +         */
> +        opcode = (uint32_t) translator_lduw(env, &ctx->base, ctx->base.pc_next);
> +    }
> +    ctx->ol = ctx->xl;
> +
> +    ctx->cur_insn_len = insn_len((uint16_t)opcode);
>      /* Check for compressed insn */
>      if (ctx->cur_insn_len == 2) {
> -        ctx->opcode = opcode;
> +        ctx->opcode = (uint16_t)opcode;
>          /*
>           * The Zca extension is added as way to refer to instructions in the C
>           * extension that do not include the floating-point loads and stores
> @@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>              return;
>          }
>      } else {
> -        uint32_t opcode32 = opcode;
> -        opcode32 = deposit32(opcode32, 16, 16,
> -                             translator_lduw(env, &ctx->base,
> -                                             ctx->base.pc_next + 2));
> -        ctx->opcode = opcode32;
> +        if (!pc_is_4byte_align) {
> +            /* Load last 2 bytes of instruction here */
> +            opcode = deposit32(opcode, 16, 16,
> +                               translator_lduw(env, &ctx->base,
> +                                               ctx->base.pc_next + 2));
> +        }
> +        ctx->opcode = opcode;
>
>          for (guint i = 0; i < ctx->decoders->len; ++i) {
>              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> -            if (func(ctx, opcode32)) {
> +            if (func(ctx, opcode)) {
>                  return;
>              }
>          }
> @@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
>      CPURISCVState *env = cpu_env(cpu);
> -    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
>
> -    ctx->ol = ctx->xl;
> -    decode_opc(env, ctx, opcode16);
> +    decode_opc(env, ctx);
>      ctx->base.pc_next += ctx->cur_insn_len;
>
>      /*
> --
> 2.17.1
>


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif)
  2025-05-08  9:48 [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif) Jim Shu
  2025-06-02  8:55 ` Jim Shu
@ 2025-06-05  4:21 ` Alistair Francis
  2025-06-05  6:38 ` Alistair Francis
  2 siblings, 0 replies; 4+ messages in thread
From: Alistair Francis @ 2025-06-05  4:21 UTC (permalink / raw)
  To: Jim Shu
  Cc: qemu-devel, qemu-riscv, Palmer Dabbelt, Alistair Francis,
	Weiwei Li, Daniel Henrique Barboza, Liu Zhiwei

On Thu, May 8, 2025 at 7:49 PM Jim Shu <jim.shu@sifive.com> wrote:
>
> Support 4-byte atomic instruction fetch when instruction is natural
> aligned.
>
> Current implementation is not atomic because it loads instruction twice
> for first and last 2 bytes. We load 4 bytes at once to keep the
> atomicity. This instruction preload method only applys when instruction
> is 4-byte aligned. If instruction is unaligned, it could be across pages
> so that preload will trigger additional page fault.
>
> We encounter this issue when doing pressure test of enabling & disabling
> Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> modification and execution of instruction, so non-atomic instruction
> fetch will cause the race condition. We may fetch the wrong instruction
> which is the mixing of 2 instructions.
>
> Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> Ziccif protects the atomicity of instruction fetch when it is
> natural aligned.
>
> This commit depends on the atomic read support of translator_ld in
> the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6.
>
> Signed-off-by: Jim Shu <jim.shu@sifive.com>
> Reviewed-by: Frank Chang <frank.chang@sifive.com>

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

> ---
>  target/riscv/translate.c | 46 +++++++++++++++++++++++++++++-----------
>  1 file changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 85128f997b..77edf04803 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = {
>
>  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
>
> -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
> +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
>  {
> +    uint32_t opcode;
> +    bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0);
> +
>      ctx->virt_inst_excp = false;
> -    ctx->cur_insn_len = insn_len(opcode);
> +    if (pc_is_4byte_align) {
> +        /*
> +         * Load 4 bytes at once to make instruction fetch atomically.
> +         *
> +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> +         * across pages. We could preload 4 bytes instruction no matter
> +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> +         * additional page fault.
> +         */
> +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> +    } else {
> +        /*
> +         * For unaligned pc, instruction preload may trigger additional
> +         * page fault so we only load 2 bytes here.
> +         */
> +        opcode = (uint32_t) translator_lduw(env, &ctx->base, ctx->base.pc_next);
> +    }
> +    ctx->ol = ctx->xl;
> +
> +    ctx->cur_insn_len = insn_len((uint16_t)opcode);
>      /* Check for compressed insn */
>      if (ctx->cur_insn_len == 2) {
> -        ctx->opcode = opcode;
> +        ctx->opcode = (uint16_t)opcode;
>          /*
>           * The Zca extension is added as way to refer to instructions in the C
>           * extension that do not include the floating-point loads and stores
> @@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>              return;
>          }
>      } else {
> -        uint32_t opcode32 = opcode;
> -        opcode32 = deposit32(opcode32, 16, 16,
> -                             translator_lduw(env, &ctx->base,
> -                                             ctx->base.pc_next + 2));
> -        ctx->opcode = opcode32;
> +        if (!pc_is_4byte_align) {
> +            /* Load last 2 bytes of instruction here */
> +            opcode = deposit32(opcode, 16, 16,
> +                               translator_lduw(env, &ctx->base,
> +                                               ctx->base.pc_next + 2));
> +        }
> +        ctx->opcode = opcode;
>
>          for (guint i = 0; i < ctx->decoders->len; ++i) {
>              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> -            if (func(ctx, opcode32)) {
> +            if (func(ctx, opcode)) {
>                  return;
>              }
>          }
> @@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
>      CPURISCVState *env = cpu_env(cpu);
> -    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
>
> -    ctx->ol = ctx->xl;
> -    decode_opc(env, ctx, opcode16);
> +    decode_opc(env, ctx);
>      ctx->base.pc_next += ctx->cur_insn_len;
>
>      /*
> --
> 2.17.1
>
>


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif)
  2025-05-08  9:48 [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif) Jim Shu
  2025-06-02  8:55 ` Jim Shu
  2025-06-05  4:21 ` Alistair Francis
@ 2025-06-05  6:38 ` Alistair Francis
  2 siblings, 0 replies; 4+ messages in thread
From: Alistair Francis @ 2025-06-05  6:38 UTC (permalink / raw)
  To: Jim Shu
  Cc: qemu-devel, qemu-riscv, Palmer Dabbelt, Alistair Francis,
	Weiwei Li, Daniel Henrique Barboza, Liu Zhiwei

On Thu, May 8, 2025 at 7:49 PM Jim Shu <jim.shu@sifive.com> wrote:
>
> Support 4-byte atomic instruction fetch when instruction is natural
> aligned.
>
> Current implementation is not atomic because it loads instruction twice
> for first and last 2 bytes. We load 4 bytes at once to keep the
> atomicity. This instruction preload method only applys when instruction
> is 4-byte aligned. If instruction is unaligned, it could be across pages
> so that preload will trigger additional page fault.
>
> We encounter this issue when doing pressure test of enabling & disabling
> Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> modification and execution of instruction, so non-atomic instruction
> fetch will cause the race condition. We may fetch the wrong instruction
> which is the mixing of 2 instructions.
>
> Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> Ziccif protects the atomicity of instruction fetch when it is
> natural aligned.
>
> This commit depends on the atomic read support of translator_ld in
> the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6.
>
> Signed-off-by: Jim Shu <jim.shu@sifive.com>
> Reviewed-by: Frank Chang <frank.chang@sifive.com>

Thanks!

Applied to riscv-to-apply.next

Alistair

> ---
>  target/riscv/translate.c | 46 +++++++++++++++++++++++++++++-----------
>  1 file changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 85128f997b..77edf04803 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = {
>
>  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
>
> -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
> +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
>  {
> +    uint32_t opcode;
> +    bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0);
> +
>      ctx->virt_inst_excp = false;
> -    ctx->cur_insn_len = insn_len(opcode);
> +    if (pc_is_4byte_align) {
> +        /*
> +         * Load 4 bytes at once to make instruction fetch atomically.
> +         *
> +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> +         * across pages. We could preload 4 bytes instruction no matter
> +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> +         * additional page fault.
> +         */
> +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> +    } else {
> +        /*
> +         * For unaligned pc, instruction preload may trigger additional
> +         * page fault so we only load 2 bytes here.
> +         */
> +        opcode = (uint32_t) translator_lduw(env, &ctx->base, ctx->base.pc_next);
> +    }
> +    ctx->ol = ctx->xl;
> +
> +    ctx->cur_insn_len = insn_len((uint16_t)opcode);
>      /* Check for compressed insn */
>      if (ctx->cur_insn_len == 2) {
> -        ctx->opcode = opcode;
> +        ctx->opcode = (uint16_t)opcode;
>          /*
>           * The Zca extension is added as way to refer to instructions in the C
>           * extension that do not include the floating-point loads and stores
> @@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
>              return;
>          }
>      } else {
> -        uint32_t opcode32 = opcode;
> -        opcode32 = deposit32(opcode32, 16, 16,
> -                             translator_lduw(env, &ctx->base,
> -                                             ctx->base.pc_next + 2));
> -        ctx->opcode = opcode32;
> +        if (!pc_is_4byte_align) {
> +            /* Load last 2 bytes of instruction here */
> +            opcode = deposit32(opcode, 16, 16,
> +                               translator_lduw(env, &ctx->base,
> +                                               ctx->base.pc_next + 2));
> +        }
> +        ctx->opcode = opcode;
>
>          for (guint i = 0; i < ctx->decoders->len; ++i) {
>              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> -            if (func(ctx, opcode32)) {
> +            if (func(ctx, opcode)) {
>                  return;
>              }
>          }
> @@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
>      CPURISCVState *env = cpu_env(cpu);
> -    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
>
> -    ctx->ol = ctx->xl;
> -    decode_opc(env, ctx, opcode16);
> +    decode_opc(env, ctx);
>      ctx->base.pc_next += ctx->cur_insn_len;
>
>      /*
> --
> 2.17.1
>
>


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-06-05  6:39 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-08  9:48 [PATCH v2] target/riscv: support atomic instruction fetch (Ziccif) Jim Shu
2025-06-02  8:55 ` Jim Shu
2025-06-05  4:21 ` Alistair Francis
2025-06-05  6:38 ` Alistair Francis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).