qemu-devel.nongnu.org archive mirror
* [Qemu-devel] [mttcg RFC v4 0/6] Atomic slow-path for mttcg
@ 2015-08-14 15:55 Alvise Rigo
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

This is just an update with the relevant patches needed to port v4 of
"Slow-path for atomic instruction translation" to mttcg v7.
The full source code is available at the following GIT repository:
https://git.virtualopensystems.com/dev/qemu-mt.git
branch:
slowpath-for-atomic-v4-mttcg

Patch 6/6 is the first step toward using runtime helpers in place of the
TCG load/store instructions, as suggested by Aurelien Jarno.

The patch series also addresses some of Paolo's comments; however, the
exclusive bitmap still uses one bit per vCPU.

Alvise Rigo (6):
  cpus: async_run_on_cpu: kick only if needed
  cputlb: wrap tlb_flush with a new function
  exec: ram_addr: Fix exclusive bitmap accessor
  softmmu_llsc_template.h: move to multithreading
  softmmu_template.h: move to multithreading
  target-arm: Use a runtime helper for excl accesses

 cpus.c                  |  4 +++-
 cputlb.c                | 25 ++++++++++++++++----
 include/exec/exec-all.h |  1 +
 include/exec/ram_addr.h | 61 +++++++++++++++++++++++++++++++++++++++++--------
 include/qom/cpu.h       |  4 ++++
 softmmu_llsc_template.h | 59 +++++++++++++++++++++++++++++++++++------------
 softmmu_template.h      | 36 ++++++++++++++++++++++++-----
 target-arm/helper.h     |  2 ++
 target-arm/op_helper.c  | 11 +++++++++
 target-arm/translate.c  | 12 +++++++++-
 10 files changed, 177 insertions(+), 38 deletions(-)

-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 1/6] cpus: async_run_on_cpu: kick only if needed
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

In some rare situations a vCPU can be kicked even if it is not ready to
execute TCG code, i.e. when current_tb has never been set.
This can happen with the atomic stress test (the one not based on
kvm-unit-tests), where a vCPU can queue work for a vCPU that has not
started yet.
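
For reference, a minimal sketch of how cpu->tcg_exec_flag is meant to be
used (the flag is managed elsewhere in the mttcg tree, not in this hunk;
the function names below are made up for illustration):

/* Editorial sketch, not code from this series. */
static void tcg_vcpu_thread_ready(CPUState *cpu)
{
    /* Assumed: the vCPU thread flips this once it can run TCG code. */
    atomic_set(&cpu->tcg_exec_flag, 1);
}

static void kick_if_running_tcg(CPUState *cpu)
{
    /* Mirrors the hunk below: never kick a vCPU that has not started;
     * the queued work is simply picked up once the vCPU begins to run. */
    if (tcg_enabled() && atomic_read(&cpu->tcg_exec_flag) == 1) {
        qemu_cpu_kick(cpu);
    }
}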

Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 cpus.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpus.c b/cpus.c
index f61530c..3d90142 100644
--- a/cpus.c
+++ b/cpus.c
@@ -935,7 +935,9 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     wi->done = false;
     qemu_mutex_unlock(&cpu->work_mutex);
 
-    qemu_cpu_kick(cpu);
+    if (tcg_enabled() && (atomic_read(&cpu->tcg_exec_flag) == 1)) {
+        qemu_cpu_kick(cpu);
+    }
 }
 
 void async_run_safe_work_on_cpu(CPUState *cpu, void (*func)(void *data),
-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 2/6] cputlb: wrap tlb_flush with a new function
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

Introduce a new function, tlb_query_flush_cpu, to request a TLB flush
of a given vCPU.
The function checks and sets a new flag (pending_tlb_flush) to avoid
issuing unnecessary flushes.
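
An illustrative caller (flush_all_other_vcpus() is a made-up name; the
pattern matches how tlb_flush_all() uses the new function below):

static void flush_all_other_vcpus(CPUState *current, int flush_global)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu != current) {
            /* If a flush is already pending for this vCPU, the
             * pending_tlb_flush flag turns this call into a no-op. */
            tlb_query_flush_cpu(cpu, flush_global);
        }
    }
}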

Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 cputlb.c                | 21 ++++++++++++++++-----
 include/exec/exec-all.h |  1 +
 include/qom/cpu.h       |  4 ++++
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/cputlb.c b/cputlb.c
index 538c92d..7cbaaca 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -79,13 +79,27 @@ static void tlb_flush_async_work(void *opaque)
     struct TLBFlushParams *params = opaque;
 
     tlb_flush(params->cpu, params->flush_global);
+    atomic_set(&params->cpu->pending_tlb_flush, 0);
+
     g_free(params);
 }
 
+void tlb_query_flush_cpu(CPUState *cpu, int flush_global) {
+    struct TLBFlushParams *params;
+
+    if (!atomic_read(&cpu->pending_tlb_flush)) {
+        params = g_malloc(sizeof(struct TLBFlushParams));
+        params->cpu = cpu;
+        params->flush_global = flush_global;
+
+        atomic_set(&cpu->pending_tlb_flush, 1);
+        async_run_on_cpu(cpu, tlb_flush_async_work, params);
+    }
+}
+
 void tlb_flush_all(int flush_global)
 {
     CPUState *cpu;
-    struct TLBFlushParams *params;
 
 #if 0 /* MTTCG */
     CPU_FOREACH(cpu) {
@@ -99,10 +113,7 @@ void tlb_flush_all(int flush_global)
              */
             tlb_flush(cpu, flush_global);
         } else {
-            params = g_malloc(sizeof(struct TLBFlushParams));
-            params->cpu = cpu;
-            params->flush_global = flush_global;
-            async_run_on_cpu(cpu, tlb_flush_async_work, params);
+            tlb_query_flush_cpu(cpu, flush_global);
         }
     }
 #endif /* MTTCG */
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 246df68..3c36724 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -99,6 +99,7 @@ void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as);
 /* cputlb.c */
 void tlb_flush_page_all(target_ulong addr);
 void tlb_flush_page(CPUState *cpu, target_ulong addr);
+void tlb_query_flush_cpu(CPUState *cpu, int flush_global);
 void tlb_flush_all(int flush_global);
 void tlb_flush(CPUState *cpu, int flush_global);
 void tlb_set_page(CPUState *cpu, target_ulong vaddr,
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 23418c0..62abf6e 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -299,6 +299,10 @@ struct CPUState {
 
     void *opaque;
 
+    /* True if the CPU has a pending request for a TLB flush. While this value
+     * is true, any flush request will be ignored. */
+    int pending_tlb_flush;
+
     /* In order to avoid passing too many arguments to the MMIO helpers,
      * we store some rarely used information in the CPU context.
      */
-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 3/6] exec: ram_addr: Fix exclusive bitmap accessor
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien
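
The accessors below now handle an exclusive-bitmap range that crosses a
long boundary. A small standalone example of the arithmetic (editorial;
BITS_PER_LONG == 64, smp_cpus == 8 and the offset are assumed values):

#include <stdio.h>

int main(void)
{
    const int bits_per_long = 64, smp_cpus = 8;
    const int shift = 60;     /* EXCL_BITMAP_GET_OFFSET(addr) % 64 */

    int bits_left = (shift + smp_cpus) - bits_per_long;        /* 4 > 0 */
    unsigned long mask2 = (1UL << bits_left) - 1;              /* next long */
    unsigned long mask1 = (1UL << (smp_cpus - bits_left)) - 1; /* first long */

    /* bits_left > 0, so the 8 vCPU bits span bits 60..63 of one long
     * (covered by mask1 after the shift) and bits 0..3 of the next
     * (covered by mask2). */
    printf("bits_left=%d mask1=%#lx mask2=%#lx\n", bits_left, mask1, mask2);
    return 0;
}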

Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 include/exec/ram_addr.h | 61 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 6b678d6..34bb486 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -269,11 +269,28 @@ static inline int cpu_physical_memory_excl_atleast_one_clean(ram_addr_t addr)
     unsigned long next, end;
 
     if (likely(smp_cpus <= BITS_PER_LONG)) {
-        unsigned long mask = (1 << smp_cpus) - 1;
-
-        return
-            (mask & (bitmap[BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr))] >>
-            (EXCL_BITMAP_GET_OFFSET(addr) & (BITS_PER_LONG-1)))) != mask;
+        unsigned long mask1;
+        uint32_t shift, first_off;
+        /* Number of vCPUs bits in the next long. */
+        int bits_left;
+
+        first_off = BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr));
+        shift = (EXCL_BITMAP_GET_OFFSET(addr) & (BITS_PER_LONG-1));
+        bits_left = (shift + smp_cpus) - BITS_PER_LONG;
+
+        if (bits_left <= 0) {
+            mask1 = (1 << smp_cpus) - 1;
+            return (mask1 & (bitmap[first_off] >> shift)) != mask1;
+        } else {
+            /* The bits we need to access span two different longs. */
+            unsigned long mask2;
+
+            mask2 = (1 << bits_left) - 1;
+            mask1 = (1 << (smp_cpus - bits_left)) - 1;
+
+            return !(((mask2 & bitmap[first_off + 1]) == mask2) &&
+                   ((mask1 & (bitmap[first_off] >> shift)) == mask1));
+        }
     }
 
     end = BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr)) + smp_cpus;
@@ -288,15 +305,41 @@ static inline int cpu_physical_memory_excl_is_dirty(ram_addr_t addr,
 {
     unsigned long *bitmap = ram_list.dirty_memory[DIRTY_MEMORY_EXCLUSIVE];
     unsigned long end, next;
-    uint32_t add;
+    uint32_t add, first_off;
 
     assert(cpu <= smp_cpus);
 
     if (likely(smp_cpus <= BITS_PER_LONG)) {
-        cpu = (cpu == smp_cpus) ? (1 << cpu) - 1 : (1 << cpu);
+        uint32_t shift = 0;
+
+        if (cpu == smp_cpus) {
+            unsigned long mask1, mask2;
+            int bits_left;
+
+            first_off = BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr));
+            shift = (EXCL_BITMAP_GET_OFFSET(addr) & (BITS_PER_LONG-1));
+            bits_left = (shift + cpu) - BITS_PER_LONG;
+
+            if (bits_left <= 0) {
+                mask1 = (1 << cpu) - 1;
+
+                return mask1 & (bitmap[first_off] >> shift);
+            }
+
+            mask2 = (1 << bits_left) - 1;
+            mask1 = (1 << (cpu - bits_left)) - 1;
+
+            return (mask1 & (bitmap[first_off] >> shift)) |
+                   (mask2 & (bitmap[first_off + 1]));
+        } else {
+            first_off = BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr) + cpu);
+            shift = ((EXCL_BITMAP_GET_OFFSET(addr) + cpu) & (BITS_PER_LONG-1));
+
+            return 1 & (bitmap[first_off] >> shift);
+        }
 
         return cpu & (bitmap[BIT_WORD(EXCL_BITMAP_GET_OFFSET(addr))] >>
-                     (EXCL_BITMAP_GET_OFFSET(addr) & (BITS_PER_LONG-1)));
+                     (shift));
     }
 
     add = (cpu == smp_cpus) ? 0 : 1;
@@ -315,7 +358,5 @@ static inline bool cpu_physical_memory_clear_excl_dirty(ram_addr_t addr,
                                 EXCL_BITMAP_GET_OFFSET(addr) + cpu_index, 1);
 }
 
-
-
 #endif
 #endif
-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 4/6] softmmu_llsc_template.h: move to multithreading
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

Update the TCG LL/SC helpers to work with multi-threading.
The basic idea remains untouched, but the whole mechanism now takes into
account the concurrent execution of multiple vCPUs.

In essence, when a vCPU performs a LL it checks which vCPUs do not yet
have the EXCL bit set for the accessed page. For each of those vCPUs it
then:
- sets the EXCL bit
- requests a TLB flush

This way we make sure that all the vCPUs will have the EXCL flag in the
TLB entry for that specific page *before* entering the next TB.
Changes from v3:
- The rendezvous mechanism has been removed, since the reworked
  TLB flush request serves the same purpose.

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 cputlb.c                |  4 ++++
 softmmu_llsc_template.h | 59 ++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/cputlb.c b/cputlb.c
index 7cbaaca..08949df 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -36,6 +36,10 @@
 /* statistics */
 int tlb_flush_count;
 
+/* For atomic instruction handling. */
+int exit_flush_request = 0;
+QemuMutex tcg_excl_access_lock;
+
 /* NOTE:
  * If flush_global is true (the usual case), flush all tlb entries.
  * If flush_global is false, flush (at least) all tlb entries not
diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
index d2e92b4..9486385 100644
--- a/softmmu_llsc_template.h
+++ b/softmmu_llsc_template.h
@@ -33,25 +33,39 @@
 
 #define helper_ldlink_name  glue(glue(helper_be_ldlink, USUFFIX), MMUSUFFIX)
 #define helper_stcond_name  glue(glue(helper_be_stcond, SUFFIX), MMUSUFFIX)
-#define helper_ld_legacy glue(glue(helper_be_ld, USUFFIX), MMUSUFFIX)
-#define helper_st_legacy glue(glue(helper_be_st, SUFFIX), MMUSUFFIX)
+#define helper_ld glue(glue(helper_be_ld, USUFFIX), MMUSUFFIX)
+#define helper_st glue(glue(helper_be_st, SUFFIX), MMUSUFFIX)
 
 #else /* LE helpers + 8bit helpers (generated only once for both LE end BE) */
 
 #if DATA_SIZE > 1
 #define helper_ldlink_name  glue(glue(helper_le_ldlink, USUFFIX), MMUSUFFIX)
 #define helper_stcond_name  glue(glue(helper_le_stcond, SUFFIX), MMUSUFFIX)
-#define helper_ld_legacy glue(glue(helper_le_ld, USUFFIX), MMUSUFFIX)
-#define helper_st_legacy glue(glue(helper_le_st, SUFFIX), MMUSUFFIX)
+#define helper_ld glue(glue(helper_le_ld, USUFFIX), MMUSUFFIX)
+#define helper_st glue(glue(helper_le_st, SUFFIX), MMUSUFFIX)
 #else /* DATA_SIZE <= 1 */
 #define helper_ldlink_name  glue(glue(helper_ret_ldlink, USUFFIX), MMUSUFFIX)
 #define helper_stcond_name  glue(glue(helper_ret_stcond, SUFFIX), MMUSUFFIX)
-#define helper_ld_legacy glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX)
-#define helper_st_legacy glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)
+#define helper_ld glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX)
+#define helper_st glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)
 #endif
 
 #endif
 
+#define is_read_tlb_entry_set(env, page, index)                              \
+({                                                                           \
+    (addr & TARGET_PAGE_MASK)                                                \
+         == ((env->tlb_table[mmu_idx][index].addr_read) &                    \
+                 (TARGET_PAGE_MASK | TLB_INVALID_MASK));                     \
+})
+/* Whenever a SC operation fails, we add a small delay to reduce the
+ * concurrency among the atomic instruction emulation code. Without this delay,
+ * in very congested situation where plain stores make all the pending LLs
+ * fail, the code could reach a stalling situation in which all the SCs happen
+ * to fail.
+ * */
+#define TCG_ATOMIC_INSN_EMUL_DELAY 100
+
 WORD_TYPE helper_ldlink_name(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr)
 {
@@ -61,11 +75,13 @@ WORD_TYPE helper_ldlink_name(CPUArchState *env, target_ulong addr,
     hwaddr hw_addr;
     unsigned mmu_idx = get_mmuidx(oi);
 
-    /* Use the proper load helper from cpu_ldst.h */
-    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
-
     index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
 
+    if (!is_read_tlb_entry_set(env, addr, index) ||
+                        !VICTIM_TLB_HIT(addr_read)) {
+        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+    }
+
     /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
      * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
     hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
@@ -73,22 +89,34 @@ WORD_TYPE helper_ldlink_name(CPUArchState *env, target_ulong addr,
     cpu_physical_memory_clear_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
     /* If all the vCPUs have the EXCL bit set for this page there is no need
      * to request any flush. */
-    if (cpu_physical_memory_excl_is_dirty(hw_addr, smp_cpus)) {
+    if (unlikely(!atomic_xchg(&exit_flush_request, 1) &&
+        cpu_physical_memory_excl_is_dirty(hw_addr, smp_cpus))) {
         CPU_FOREACH(cpu) {
-            if (current_cpu != cpu) {
+            if (cpu->thread_id != qemu_get_thread_id()) {
                 if (cpu_physical_memory_excl_is_dirty(hw_addr,
                                                     cpu->cpu_index)) {
                     cpu_physical_memory_clear_excl_dirty(hw_addr,
                                                          cpu->cpu_index);
-                    tlb_flush(cpu, 1);
+                    tlb_query_flush_cpu(cpu, 1);
                 }
             }
         }
+
+        atomic_set(&exit_flush_request, 0);
     }
 
+    env->ll_sc_context = true;
+
+    qemu_mutex_lock(&tcg_excl_access_lock);
+
+    /* Use the proper load helper from cpu_ldst.h */
+    ret = helper_ld(env, addr, mmu_idx, retaddr);
+
     env->excl_protected_range.begin = hw_addr;
     env->excl_protected_range.end = hw_addr + DATA_SIZE;
 
+    qemu_mutex_unlock(&tcg_excl_access_lock);
+
     /* For this vCPU, just update the TLB entry, no need to flush. */
     env->tlb_table[mmu_idx][index].addr_write |= TLB_EXCL;
 
@@ -106,12 +134,13 @@ WORD_TYPE helper_stcond_name(CPUArchState *env, target_ulong addr,
      * access as one made by the store conditional wrapper. If the store
      * conditional does not succeed, the value will be set to 0.*/
     env->excl_succeeded = 1;
-    helper_st_legacy(env, addr, val, mmu_idx, retaddr);
+    helper_st(env, addr, val, mmu_idx, retaddr);
 
     if (env->excl_succeeded) {
         env->excl_succeeded = 0;
         ret = 0;
     } else {
+        g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY);
         ret = 1;
     }
 
@@ -120,5 +149,5 @@ WORD_TYPE helper_stcond_name(CPUArchState *env, target_ulong addr,
 
 #undef helper_ldlink_name
 #undef helper_stcond_name
-#undef helper_ld_legacy
-#undef helper_st_legacy
+#undef helper_ld
+#undef helper_st
-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 5/6] softmmu_template.h: move to multithreading
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

Using tcg_excl_access_lock, port the helper_{le,be}_st_name helpers to
work under real multi-threading.
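
Both helpers gain the same critical section; condensed into one editorial
sketch (range_matches() and do_store() are placeholder names for the
explicit begin/end comparison and the st*_p call in the patch):

    qemu_mutex_lock(&tcg_excl_access_lock);
    if (env->excl_succeeded) {            /* called from an SC helper */
        if (!range_matches(env, hw_addr, DATA_SIZE)) {
            /* SC to an address that is no longer fully protected: fail. */
            env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
            env->excl_succeeded = 0;
            qemu_mutex_unlock(&tcg_excl_access_lock);
            return;
        }
        cpu_physical_memory_set_excl_dirty(hw_addr,
                                           ENV_GET_CPU(env)->cpu_index);
    }
    do_store(haddr, val);                              /* the actual store */
    lookup_and_reset_cpus_ll_addr(hw_addr, DATA_SIZE); /* break others' LLs */
    qemu_mutex_unlock(&tcg_excl_access_lock);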

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 softmmu_template.h | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/softmmu_template.h b/softmmu_template.h
index ad65d20..514aeb7 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -418,20 +418,29 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
              * exclusive-protected memory. */
             hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
 
+            qemu_mutex_lock(&tcg_excl_access_lock);
             /* The function lookup_and_reset_cpus_ll_addr could have reset the
              * exclusive address. Fail the SC in this case.
              * N.B.: Here excl_succeeded == 0 means that helper_le_st_name has
              * not been called by a softmmu_llsc_template.h. */
             if(env->excl_succeeded) {
-                if (env->excl_protected_range.begin != hw_addr) {
-                    /* The vCPU is SC-ing to an unprotected address. */
+                if (!((env->excl_protected_range.begin == hw_addr) &&
+                  env->excl_protected_range.end == (hw_addr + DATA_SIZE))) {
+                    /* The vCPU is SC-ing to an unprotected address. This
+                     * can also happen when a vCPU stores to the address.
+                     * */
                     env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
                     env->excl_succeeded = 0;
 
+                    qemu_mutex_unlock(&tcg_excl_access_lock);
+
                     return;
                 }
 
-                cpu_physical_memory_set_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
+                /* Now we are going for sure to complete the access. Set the
+                 * bit to dirty. */
+                cpu_physical_memory_set_excl_dirty(hw_addr,
+                                                  ENV_GET_CPU(env)->cpu_index);
             }
 
             haddr = addr + env->tlb_table[mmu_idx][index].addend;
@@ -441,8 +450,11 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
         #endif
 
+            /* This will reset the excl address also for the current vCPU. */
             lookup_and_reset_cpus_ll_addr(hw_addr, DATA_SIZE);
 
+            qemu_mutex_unlock(&tcg_excl_access_lock);
+
             return;
         } else {
             if ((addr & (DATA_SIZE - 1)) != 0) {
@@ -532,20 +544,29 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
              * exclusive-protected memory. */
             hwaddr hw_addr = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
 
+            qemu_mutex_lock(&tcg_excl_access_lock);
             /* The function lookup_and_reset_cpus_ll_addr could have reset the
              * exclusive address. Fail the SC in this case.
              * N.B.: Here excl_succeeded == 0 means that helper_le_st_name has
              * not been called by a softmmu_llsc_template.h. */
             if(env->excl_succeeded) {
-                if (env->excl_protected_range.begin != hw_addr) {
-                    /* The vCPU is SC-ing to an unprotected address. */
+                if (!((env->excl_protected_range.begin == hw_addr) &&
+                  env->excl_protected_range.end == (hw_addr + DATA_SIZE))) {
+                    /* The vCPU is SC-ing to an unprotected address. This
+                     * can also happen when a vCPU stores to the address.
+                     * */
                     env->excl_protected_range.begin = EXCLUSIVE_RESET_ADDR;
                     env->excl_succeeded = 0;
 
+                    qemu_mutex_unlock(&tcg_excl_access_lock);
+
                     return;
                 }
 
-                cpu_physical_memory_set_excl_dirty(hw_addr, ENV_GET_CPU(env)->cpu_index);
+                /* Now we are going for sure to complete the access. Set the
+                 * bit to dirty. */
+                cpu_physical_memory_set_excl_dirty(hw_addr,
+                                                  ENV_GET_CPU(env)->cpu_index);
             }
 
             haddr = addr + env->tlb_table[mmu_idx][index].addend;
@@ -555,8 +576,11 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
         #endif
 
+            /* This will reset the excl address also for the current vCPU. */
             lookup_and_reset_cpus_ll_addr(hw_addr, DATA_SIZE);
 
+            qemu_mutex_unlock(&tcg_excl_access_lock);
+
             return;
         } else {
             if ((addr & (DATA_SIZE - 1)) != 0) {
-- 
2.5.0

* [Qemu-devel] [mttcg RFC v4 6/6] target-arm: Use a runtime helper for excl accesses
From: Alvise Rigo @ 2015-08-14 15:55 UTC (permalink / raw)
  To: qemu-devel, mttcg
  Cc: claudio.fontana, pbonzini, jani.kokkonen, tech, alex.bennee,
	aurelien

Instead of using TCG's load and store instructions, use a runtime helper
as a hook into the slow path.

This is a proof of concept to verify that the approach actually works.
At the moment only the 32-bit STREX relies on this new code path, and it
works as expected.
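
For context, how the three pieces below fit together (editorial note; the
prototype is the one added in op_helper.c):

/* DEF_HELPER_4(stcond_aa32_i32, i32, env, i32, i32, i32) in helper.h
 * declares the C prototype implemented in op_helper.c:
 *
 *     uint32_t helper_stcond_aa32_i32(CPUARMState *env, uint32_t val,
 *                                     uint32_t addr, uint32_t index);
 *
 * and generates gen_helper_stcond_aa32_i32(), which the new
 * gen_aa32_stex32() in translate.c uses to emit a call into the
 * store-conditional slow path (helper_le_stcondl_mmu) instead of a
 * TCG qemu_st opcode. */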

Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 target-arm/helper.h    |  2 ++
 target-arm/op_helper.c | 11 +++++++++++
 target-arm/translate.c | 12 +++++++++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/target-arm/helper.h b/target-arm/helper.h
index c77bf04..c4da74a 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -534,6 +534,8 @@ DEF_HELPER_4(atomic_cmpxchg64, i32, env, i32, i64, i32)
 DEF_HELPER_1(atomic_clear, void, env)
 DEF_HELPER_3(atomic_claim, void, env, i32, i64)
 
+DEF_HELPER_4(stcond_aa32_i32, i32, env, i32, i32, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #endif
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index ba8c5f5..53dcdde 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -1095,3 +1095,14 @@ uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i)
         return ((uint32_t)x >> shift) | (x << (32 - shift));
     }
 }
+
+uint32_t HELPER(stcond_aa32_i32)(CPUARMState *env, uint32_t val, uint32_t addr,
+                                                            uint32_t index)
+{
+    CPUArchState *state = env;
+    TCGMemOpIdx op;
+
+    op = make_memop_idx(MO_LEUL, index);
+
+    return helper_le_stcondl_mmu(state, addr, val, op, 0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index d90a27b..591ce97 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -1006,6 +1006,17 @@ static inline void gen_aa32_stex64(TCGv_i32 is_dirty, TCGv_i64 val,
 
 #endif
 
+/* Use the runtime helper for 32bit exclusive stores. */
+static inline void gen_aa32_stex32(TCGv_i32 is_dirty, TCGv_i32 val,
+                                   TCGv_i32 addr, int index)
+{
+    TCGv index_tmp = tcg_temp_new_i32();
+
+    tcg_gen_movi_i32(index_tmp, index);
+    gen_helper_stcond_aa32_i32(is_dirty, cpu_env, val, addr, index_tmp);
+    tcg_temp_free_i32(index_tmp);
+}
+
 DO_GEN_LD(8s, MO_SB)
 DO_GEN_LD(8u, MO_UB)
 DO_GEN_LD(8uex, MO_UB | MO_EXCL)
@@ -1021,7 +1032,6 @@ DO_GEN_ST(32, MO_TEUL)
 /* Load/Store exclusive generators (always unsigned) */
 DO_GEN_STEX(8, MO_UB)
 DO_GEN_STEX(16, MO_TEUW)
-DO_GEN_STEX(32, MO_TEUL)
 
 static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
-- 
2.5.0
