From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:53474) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZCSk3-0007ri-UA for qemu-devel@nongnu.org; Tue, 07 Jul 2015 09:16:58 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ZCSjv-000843-Sy for qemu-devel@nongnu.org; Tue, 07 Jul 2015 09:16:55 -0400 Received: from greensocs.com ([193.104.36.180]:50575) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZCSjv-00083t-HF for qemu-devel@nongnu.org; Tue, 07 Jul 2015 09:16:47 -0400 Message-ID: <559BD13B.4080804@greensocs.com> Date: Tue, 07 Jul 2015 15:16:43 +0200 From: Frederic Konrad MIME-Version: 1.0 References: <1435330053-18733-1-git-send-email-fred.konrad@greensocs.com> <1435330053-18733-6-git-send-email-fred.konrad@greensocs.com> <87pp449my7.fsf@linaro.org> In-Reply-To: <87pp449my7.fsf@linaro.org> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [RFC PATCH V6 05/18] protect TBContext with tb_lock. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: =?UTF-8?B?QWxleCBCZW5uw6ll?= Cc: mttcg@listserver.greensocs.com, peter.maydell@linaro.org, a.spyridakis@virtualopensystems.com, mark.burton@greensocs.com, agraf@suse.de, qemu-devel@nongnu.org, guillaume.delbergue@greensocs.com, pbonzini@redhat.com, alistair.francis@xilinx.com On 07/07/2015 14:22, Alex Benn=C3=A9e wrote: > fred.konrad@greensocs.com writes: > >> From: KONRAD Frederic >> >> This protects TBContext with tb_lock to make tb_* thread safe. >> >> We can still have issue with tb_flush in case of multithread TCG: >> An other CPU can be executing code during a flush. >> >> This can be fixed later by making all other TCG thread exiting before = calling >> tb_flush(). >> >> tb_find_slow is separated into tb_find_slow and tb_find_physical as th= e whole >> tb_find_slow doesn't require to lock the tb. >> >> Signed-off-by: KONRAD Frederic > So my comments from earlier about the different locking between > CONFIG_USER and system emulation still stand. Ultimately we need a good > reason (or an abstraction) before sprinkling #ifdefs in the code if onl= y > for ease of reading. True. >> Changes: >> V1 -> V2: >> * Drop a tb_lock arround tb_find_fast in cpu-exec.c. >> --- >> cpu-exec.c | 60 ++++++++++++++-------- >> target-arm/translate.c | 5 ++ >> tcg/tcg.h | 7 +++ >> translate-all.c | 137 ++++++++++++++++++++++++++++++++++++++-= ---------- >> 4 files changed, 158 insertions(+), 51 deletions(-) >> >> diff --git a/cpu-exec.c b/cpu-exec.c >> index d6336d9..5d9b518 100644 >> --- a/cpu-exec.c >> +++ b/cpu-exec.c >> @@ -130,6 +130,8 @@ static void init_delay_params(SyncClocks *sc, cons= t CPUState *cpu) >> void cpu_loop_exit(CPUState *cpu) >> { >> cpu->current_tb =3D NULL; >> + /* Release those mutex before long jump so other thread can work.= */ >> + tb_lock_reset(); >> siglongjmp(cpu->jmp_env, 1); >> } >> =20 >> @@ -142,6 +144,8 @@ void cpu_resume_from_signal(CPUState *cpu, void *p= uc) >> /* XXX: restore cpu registers saved in host registers */ >> =20 >> cpu->exception_index =3D -1; >> + /* Release those mutex before long jump so other thread can work.= */ >> + tb_lock_reset(); >> siglongjmp(cpu->jmp_env, 1); >> } >> =20 >> @@ -253,12 +257,9 @@ static void cpu_exec_nocache(CPUArchState *env, i= nt max_cycles, >> tb_free(tb); >> } >> =20 >> -static TranslationBlock *tb_find_slow(CPUArchState *env, >> - target_ulong pc, >> - target_ulong cs_base, >> - uint64_t flags) >> +static TranslationBlock *tb_find_physical(CPUArchState *env, target_u= long pc, >> + target_ulong cs_base, uint6= 4_t flags) >> { > As Paolo has already mentioned comments on functions expecting to have > locks held when called. Ok, will do that. >> - CPUState *cpu =3D ENV_GET_CPU(env); >> TranslationBlock *tb, **ptb1; >> unsigned int h; >> tb_page_addr_t phys_pc, phys_page1; >> @@ -273,8 +274,9 @@ static TranslationBlock *tb_find_slow(CPUArchState= *env, >> ptb1 =3D &tcg_ctx.tb_ctx.tb_phys_hash[h]; >> for(;;) { >> tb =3D *ptb1; >> - if (!tb) >> - goto not_found; >> + if (!tb) { >> + return tb; >> + } >> if (tb->pc =3D=3D pc && >> tb->page_addr[0] =3D=3D phys_page1 && >> tb->cs_base =3D=3D cs_base && >> @@ -282,28 +284,43 @@ static TranslationBlock *tb_find_slow(CPUArchSta= te *env, >> /* check next page if needed */ >> if (tb->page_addr[1] !=3D -1) { >> tb_page_addr_t phys_page2; >> - >> virt_page2 =3D (pc & TARGET_PAGE_MASK) + >> TARGET_PAGE_SIZE; >> phys_page2 =3D get_page_addr_code(env, virt_page2); >> - if (tb->page_addr[1] =3D=3D phys_page2) >> - goto found; >> + if (tb->page_addr[1] =3D=3D phys_page2) { >> + return tb; >> + } >> } else { >> - goto found; >> + return tb; >> } >> } >> ptb1 =3D &tb->phys_hash_next; >> } >> - not_found: >> - /* if no translated code available, then translate it now */ >> - tb =3D tb_gen_code(cpu, pc, cs_base, flags, 0); >> - >> - found: >> - /* Move the last found TB to the head of the list */ >> - if (likely(*ptb1)) { >> - *ptb1 =3D tb->phys_hash_next; >> - tb->phys_hash_next =3D tcg_ctx.tb_ctx.tb_phys_hash[h]; >> - tcg_ctx.tb_ctx.tb_phys_hash[h] =3D tb; >> + return tb; >> +} >> + >> +static TranslationBlock *tb_find_slow(CPUArchState *env, target_ulong= pc, >> + target_ulong cs_base, uint64_t = flags) >> +{ >> + /* >> + * First try to get the tb if we don't find it we need to lock an= d compile >> + * it. >> + */ >> + CPUState *cpu =3D ENV_GET_CPU(env); >> + TranslationBlock *tb; >> + >> + tb =3D tb_find_physical(env, pc, cs_base, flags); >> + if (!tb) { >> + tb_lock(); >> + /* >> + * Retry to get the TB in case a CPU just translate it to avo= id having >> + * duplicated TB in the pool. >> + */ >> + tb =3D tb_find_physical(env, pc, cs_base, flags); >> + if (!tb) { >> + tb =3D tb_gen_code(cpu, pc, cs_base, flags, 0); >> + } >> + tb_unlock(); >> } >> /* we add the TB in the virtual pc hash table */ >> cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] =3D tb; >> @@ -326,6 +343,7 @@ static inline TranslationBlock *tb_find_fast(CPUAr= chState *env) >> tb->flags !=3D flags)) { >> tb =3D tb_find_slow(env, pc, cs_base, flags); >> } >> + >> return tb; >> } >> =20 >> diff --git a/target-arm/translate.c b/target-arm/translate.c >> index 971b6db..47345aa 100644 >> --- a/target-arm/translate.c >> +++ b/target-arm/translate.c >> @@ -11162,6 +11162,8 @@ static inline void gen_intermediate_code_inter= nal(ARMCPU *cpu, >> =20 >> dc->tb =3D tb; >> =20 >> + tb_lock(); >> + >> dc->is_jmp =3D DISAS_NEXT; >> dc->pc =3D pc_start; >> dc->singlestep_enabled =3D cs->singlestep_enabled; >> @@ -11499,6 +11501,7 @@ done_generating: >> tb->size =3D dc->pc - pc_start; >> tb->icount =3D num_insns; >> } >> + tb_unlock(); >> } >> =20 >> void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb) >> @@ -11567,6 +11570,7 @@ void arm_cpu_dump_state(CPUState *cs, FILE *f,= fprintf_function cpu_fprintf, >> =20 >> void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb, in= t pc_pos) >> { >> + tb_lock(); >> if (is_a64(env)) { >> env->pc =3D tcg_ctx.gen_opc_pc[pc_pos]; >> env->condexec_bits =3D 0; >> @@ -11574,4 +11578,5 @@ void restore_state_to_opc(CPUARMState *env, Tr= anslationBlock *tb, int pc_pos) >> env->regs[15] =3D tcg_ctx.gen_opc_pc[pc_pos]; >> env->condexec_bits =3D gen_opc_condexec_bits[pc_pos]; >> } >> + tb_unlock(); >> } >> diff --git a/tcg/tcg.h b/tcg/tcg.h >> index 41e4869..032fe10 100644 >> --- a/tcg/tcg.h >> +++ b/tcg/tcg.h >> @@ -592,17 +592,24 @@ void *tcg_malloc_internal(TCGContext *s, int siz= e); >> void tcg_pool_reset(TCGContext *s); >> void tcg_pool_delete(TCGContext *s); >> =20 >> +void tb_lock(void); >> +void tb_unlock(void); >> +void tb_lock_reset(void); >> + >> static inline void *tcg_malloc(int size) >> { >> TCGContext *s =3D &tcg_ctx; >> uint8_t *ptr, *ptr_end; >> + tb_lock(); >> size =3D (size + sizeof(long) - 1) & ~(sizeof(long) - 1); >> ptr =3D s->pool_cur; >> ptr_end =3D ptr + size; >> if (unlikely(ptr_end > s->pool_end)) { >> + tb_unlock(); >> return tcg_malloc_internal(&tcg_ctx, size); > If the purpose of the lock is to protect the global tcg_ctx then we > shouldn't be unlocking before calling the _internal function which also > messes with the context. > Good point! I missed that. >> } else { >> s->pool_cur =3D ptr_end; >> + tb_unlock(); >> return ptr; >> } >> } >> diff --git a/translate-all.c b/translate-all.c >> index b6b0e1c..c25b79b 100644 >> --- a/translate-all.c >> +++ b/translate-all.c >> @@ -127,6 +127,34 @@ static void *l1_map[V_L1_SIZE]; >> /* code generation context */ >> TCGContext tcg_ctx; >> =20 >> +/* translation block context */ >> +__thread volatile int have_tb_lock; >> + >> +void tb_lock(void) >> +{ >> + if (!have_tb_lock) { >> + qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); >> + } >> + have_tb_lock++; >> +} >> + >> +void tb_unlock(void) >> +{ >> + assert(have_tb_lock > 0); >> + have_tb_lock--; >> + if (!have_tb_lock) { >> + qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); >> + } >> +} >> + >> +void tb_lock_reset(void) >> +{ >> + if (have_tb_lock) { >> + qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); >> + } >> + have_tb_lock =3D 0; >> +} >> + >> static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_p= c, >> tb_page_addr_t phys_page2); >> static TranslationBlock *tb_find_pc(uintptr_t tc_ptr); >> @@ -215,6 +243,7 @@ static int cpu_restore_state_from_tb(CPUState *cpu= , TranslationBlock *tb, >> #ifdef CONFIG_PROFILER >> ti =3D profile_getclock(); >> #endif >> + tb_lock(); >> tcg_func_start(s); >> =20 >> gen_intermediate_code_pc(env, tb); >> @@ -228,8 +257,10 @@ static int cpu_restore_state_from_tb(CPUState *cp= u, TranslationBlock *tb, >> =20 >> /* find opc index corresponding to search_pc */ >> tc_ptr =3D (uintptr_t)tb->tc_ptr; >> - if (searched_pc < tc_ptr) >> + if (searched_pc < tc_ptr) { >> + tb_unlock(); >> return -1; >> + } >> =20 >> s->tb_next_offset =3D tb->tb_next_offset; >> #ifdef USE_DIRECT_JUMP >> @@ -241,8 +272,10 @@ static int cpu_restore_state_from_tb(CPUState *cp= u, TranslationBlock *tb, >> #endif >> j =3D tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr, >> searched_pc - tc_ptr); >> - if (j < 0) >> + if (j < 0) { >> + tb_unlock(); >> return -1; >> + } >> /* now find start of instruction before */ >> while (s->gen_opc_instr_start[j] =3D=3D 0) { >> j--; >> @@ -255,6 +288,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu= , TranslationBlock *tb, >> s->restore_time +=3D profile_getclock() - ti; >> s->restore_count++; >> #endif >> + >> + tb_unlock(); >> return 0; >> } >> =20 >> @@ -672,6 +707,7 @@ static inline void code_gen_alloc(size_t tb_size) >> CODE_GEN_AVG_BLOCK_SIZE; >> tcg_ctx.tb_ctx.tbs =3D >> g_malloc(tcg_ctx.code_gen_max_blocks * sizeof(Translatio= nBlock)); >> + qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); >> } >> =20 >> /* Must be called before using the QEMU cpus. 'tb_size' is the size >> @@ -696,16 +732,22 @@ bool tcg_enabled(void) >> return tcg_ctx.code_gen_buffer !=3D NULL; >> } >> =20 >> -/* Allocate a new translation block. Flush the translation buffer if >> - too many translation blocks or too much generated code. */ >> +/* >> + * Allocate a new translation block. Flush the translation buffer if >> + * too many translation blocks or too much generated code. >> + * tb_alloc is not thread safe but tb_gen_code is protected by a mute= x so this >> + * function is called only by one thread. > maybe: "..is not thread safe but tb_gen_code is protected by tb_lock so > only one thread calls it at a time."? Yes. >> + */ >> static TranslationBlock *tb_alloc(target_ulong pc) >> { >> - TranslationBlock *tb; >> + TranslationBlock *tb =3D NULL; >> =20 >> if (tcg_ctx.tb_ctx.nb_tbs >=3D tcg_ctx.code_gen_max_blocks || >> (tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer) >=3D >> tcg_ctx.code_gen_buffer_max_size) { >> - return NULL; >> + tb =3D &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++]; >> + tb->pc =3D pc; >> + tb->cflags =3D 0; >> } >> tb =3D &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++]; >> tb->pc =3D pc; > That looks weird. > > if (cond) return > &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++] then return > &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];? > > Also rendering the setting of tb =3D NULL pointless as it will always b= e > from the array? Oops yes sorry, this is definitely a mistake, those changes should have disappeared. Thanks, Fred > >> @@ -718,11 +760,16 @@ void tb_free(TranslationBlock *tb) >> /* In pr actice this is mostly used for single use temporar= y TB >> Ignore the hard cases and just back up if this TB happens to >> be the last one generated. */ >> + >> + tb_lock(); >> + >> if (tcg_ctx.tb_ctx.nb_tbs > 0 && >> tb =3D=3D &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]= ) { >> tcg_ctx.code_gen_ptr =3D tb->tc_ptr; >> tcg_ctx.tb_ctx.nb_tbs--; >> } >> + >> + tb_unlock(); >> } >> =20 >> static inline void invalidate_page_bitmap(PageDesc *p) >> @@ -773,6 +820,8 @@ void tb_flush(CPUArchState *env1) >> { >> CPUState *cpu =3D ENV_GET_CPU(env1); >> =20 >> + tb_lock(); >> + >> #if defined(DEBUG_FLUSH) >> printf("qemu: flush code_size=3D%ld nb_tbs=3D%d avg_tb_size=3D%l= d\n", >> (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_b= uffer), >> @@ -797,6 +846,8 @@ void tb_flush(CPUArchState *env1) >> /* XXX: flush processor icache at this point if cache flush is >> expensive */ >> tcg_ctx.tb_ctx.tb_flush_count++; >> + >> + tb_unlock(); >> } >> =20 >> #ifdef DEBUG_TB_CHECK >> @@ -806,6 +857,8 @@ static void tb_invalidate_check(target_ulong addre= ss) >> TranslationBlock *tb; >> int i; >> =20 >> + tb_lock(); >> + >> address &=3D TARGET_PAGE_MASK; >> for (i =3D 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) { >> for (tb =3D tb_ctx.tb_phys_hash[i]; tb !=3D NULL; tb =3D tb-= >phys_hash_next) { >> @@ -817,6 +870,8 @@ static void tb_invalidate_check(target_ulong addre= ss) >> } >> } >> } >> + >> + tb_unlock(); >> } >> =20 >> /* verify that all the pages have correct rights for code */ >> @@ -825,6 +880,8 @@ static void tb_page_check(void) >> TranslationBlock *tb; >> int i, flags1, flags2; >> =20 >> + tb_lock(); >> + >> for (i =3D 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) { >> for (tb =3D tcg_ctx.tb_ctx.tb_phys_hash[i]; tb !=3D NULL; >> tb =3D tb->phys_hash_next) { >> @@ -836,6 +893,8 @@ static void tb_page_check(void) >> } >> } >> } >> + >> + tb_unlock(); >> } >> =20 >> #endif >> @@ -916,6 +975,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_p= age_addr_t page_addr) >> tb_page_addr_t phys_pc; >> TranslationBlock *tb1, *tb2; >> =20 >> + tb_lock(); >> + >> /* remove the TB from the hash list */ >> phys_pc =3D tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); >> h =3D tb_phys_hash_func(phys_pc); >> @@ -963,6 +1024,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_= page_addr_t page_addr) >> tb->jmp_first =3D (TranslationBlock *)((uintptr_t)tb | 2); /* fa= il safe */ >> =20 >> tcg_ctx.tb_ctx.tb_phys_invalidate_count++; >> + tb_unlock(); >> } >> =20 >> static void build_page_bitmap(PageDesc *p) >> @@ -1004,6 +1066,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu, >> target_ulong virt_page2; >> int code_gen_size; >> =20 >> + tb_lock(); >> + >> phys_pc =3D get_page_addr_code(env, pc); >> if (use_icount) { >> cflags |=3D CF_USE_ICOUNT; >> @@ -1032,6 +1096,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu, >> phys_page2 =3D get_page_addr_code(env, virt_page2); >> } >> tb_link_page(tb, phys_pc, phys_page2); >> + >> + tb_unlock(); >> return tb; >> } >> =20 >> @@ -1330,13 +1396,15 @@ static inline void tb_alloc_page(TranslationBl= ock *tb, >> } >> =20 >> /* add a new TB and link it to the physical page tables. phys_page2 = is >> - (-1) to indicate that only one page contains the TB. */ >> + * (-1) to indicate that only one page contains the TB. */ >> static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_p= c, >> tb_page_addr_t phys_page2) >> { >> unsigned int h; >> TranslationBlock **ptb; >> =20 >> + tb_lock(); >> + >> /* Grab the mmap lock to stop another thread invalidating this T= B >> before we are done. */ >> mmap_lock(); >> @@ -1370,6 +1438,8 @@ static void tb_link_page(TranslationBlock *tb, t= b_page_addr_t phys_pc, >> tb_page_check(); >> #endif >> mmap_unlock(); >> + >> + tb_unlock(); >> } >> =20 >> /* find the TB 'tb' such that tb[0].tc_ptr <=3D tc_ptr < >> @@ -1378,31 +1448,34 @@ static TranslationBlock *tb_find_pc(uintptr_t = tc_ptr) >> { >> int m_min, m_max, m; >> uintptr_t v; >> - TranslationBlock *tb; >> - >> - if (tcg_ctx.tb_ctx.nb_tbs <=3D 0) { >> - return NULL; >> - } >> - if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer || >> - tc_ptr >=3D (uintptr_t)tcg_ctx.code_gen_ptr) { >> - return NULL; >> - } >> - /* binary search (cf Knuth) */ >> - m_min =3D 0; >> - m_max =3D tcg_ctx.tb_ctx.nb_tbs - 1; >> - while (m_min <=3D m_max) { >> - m =3D (m_min + m_max) >> 1; >> - tb =3D &tcg_ctx.tb_ctx.tbs[m]; >> - v =3D (uintptr_t)tb->tc_ptr; >> - if (v =3D=3D tc_ptr) { >> - return tb; >> - } else if (tc_ptr < v) { >> - m_max =3D m - 1; >> - } else { >> - m_min =3D m + 1; >> + TranslationBlock *tb =3D NULL; >> + >> + tb_lock(); >> + >> + if ((tcg_ctx.tb_ctx.nb_tbs > 0) >> + && (tc_ptr >=3D (uintptr_t)tcg_ctx.code_gen_buffer && >> + tc_ptr < (uintptr_t)tcg_ctx.code_gen_ptr)) { >> + /* binary search (cf Knuth) */ >> + m_min =3D 0; >> + m_max =3D tcg_ctx.tb_ctx.nb_tbs - 1; >> + while (m_min <=3D m_max) { >> + m =3D (m_min + m_max) >> 1; >> + tb =3D &tcg_ctx.tb_ctx.tbs[m]; >> + v =3D (uintptr_t)tb->tc_ptr; >> + if (v =3D=3D tc_ptr) { >> + tb_unlock(); >> + return tb; >> + } else if (tc_ptr < v) { >> + m_max =3D m - 1; >> + } else { >> + m_min =3D m + 1; >> + } >> } >> + tb =3D &tcg_ctx.tb_ctx.tbs[m_max]; >> } >> - return &tcg_ctx.tb_ctx.tbs[m_max]; >> + >> + tb_unlock(); >> + return tb; >> } >> =20 >> #if !defined(CONFIG_USER_ONLY) >> @@ -1564,6 +1637,8 @@ void dump_exec_info(FILE *f, fprintf_function cp= u_fprintf) >> int direct_jmp_count, direct_jmp2_count, cross_page; >> TranslationBlock *tb; >> =20 >> + tb_lock(); >> + >> target_code_size =3D 0; >> max_target_code_size =3D 0; >> cross_page =3D 0; >> @@ -1619,6 +1694,8 @@ void dump_exec_info(FILE *f, fprintf_function cp= u_fprintf) >> tcg_ctx.tb_ctx.tb_phys_invalidate_count); >> cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count); >> tcg_dump_info(f, cpu_fprintf); >> + >> + tb_unlock(); >> } >> =20 >> void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)