From: "Emilio G. Cota" <cota@braap.org>
To: qemu-devel@nongnu.org
Cc: Richard Henderson <rth@twiddle.net>
Subject: [Qemu-devel] [PATCH v3 43/43] tcg: enable multiple TCG contexts in softmmu
Date: Wed, 19 Jul 2017 23:09:29 -0400 [thread overview]
Message-ID: <1500520169-23367-44-git-send-email-cota@braap.org> (raw)
In-Reply-To: <1500520169-23367-1-git-send-email-cota@braap.org>
This enables parallel TCG code generation. However, we do not take
advantage of it yet since tb_lock is still held during tb_gen_code.
In user-mode we use a single TCG context; see the documentation
added to tcg_region_init for the rationale.
Note that targets do not need any conversion: targets initialize a
TCGContext (e.g. defining TCG globals), and after this initialization
has finished, the context is cloned by the vCPU threads, each of
them keeping a separate copy.
TCG threads claim one entry in tcg_ctxs[] by atomically increasing
n_tcg_ctxs. Do not be too annoyed by the subsequent atomic_read's
of that variable; they are there just to play nice with analysis
tools such as thread sanitizer.
Note that we do not allocate an array of contexts (we allocate
an array of pointers instead) because when tcg_context_init
is called, we do not know yet how many contexts we'll use since
the bool behind qemu_tcg_mttcg_enabled() isn't set yet.
Previous patches folded some TCG globals into TCGContext. The non-const
globals remaining are only set at init time, i.e. before the TCG
threads are spawned. Here is a list of these set-at-init-time globals
under tcg/:
Only written by tcg_context_init:
- indirect_reg_alloc_order
- tcg_op_defs
Only written by tcg_target_init (called from tcg_context_init):
- tcg_target_available_regs
- tcg_target_call_clobber_regs
- arm: arm_arch, use_idiv_instructions
- i386: have_cmov, have_bmi1, have_bmi2, have_lzcnt,
have_movbe, have_popcnt
- mips: use_movnz_instructions, use_mips32_instructions,
use_mips32r2_instructions, got_sigill (tcg_target_detect_isa)
- ppc: have_isa_2_06, have_isa_3_00, tb_ret_addr
- s390: tb_ret_addr, s390_facilities
- sparc: qemu_ld_trampoline, qemu_st_trampoline (build_trampolines),
use_vis3_instructions
Only written by tcg_prologue_init:
- 'struct jit_code_entry one_entry'
- aarch64: tb_ret_addr
- arm: tb_ret_addr
- i386: tb_ret_addr, guest_base_flags
- ia64: tb_ret_addr
- mips: tb_ret_addr, bswap32_addr, bswap32u_addr, bswap64_addr
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/tcg.h | 7 ++-
accel/tcg/translate-all.c | 2 +-
cpus.c | 2 +
linux-user/syscall.c | 1 +
tcg/tcg.c | 141 ++++++++++++++++++++++++++++++++++++++++++++--
5 files changed, 143 insertions(+), 10 deletions(-)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 3365da8..68cd14e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -733,7 +733,7 @@ struct TCGContext {
};
extern TCGContext tcg_init_ctx;
-extern TCGContext *tcg_ctx;
+extern __thread TCGContext *tcg_ctx;
static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
{
@@ -755,7 +755,7 @@ static inline bool tcg_op_buf_full(void)
/* pool based memory allocation */
-/* tb_lock must be held for tcg_malloc_internal. */
+/* user-mode: tb_lock must be held for tcg_malloc_internal. */
void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
TranslationBlock *tcg_tb_alloc(TCGContext *s);
@@ -766,7 +766,7 @@ void tcg_region_reset_all(void);
size_t tcg_code_size(void);
size_t tcg_code_capacity(void);
-/* Called with tb_lock held. */
+/* user-mode: Called with tb_lock held. */
static inline void *tcg_malloc(int size)
{
TCGContext *s = tcg_ctx;
@@ -783,6 +783,7 @@ static inline void *tcg_malloc(int size)
}
void tcg_context_init(TCGContext *s);
+void tcg_register_thread(void);
void tcg_prologue_init(TCGContext *s);
void tcg_func_start(TCGContext *s);
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 623b9e7..2e810b9 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -154,7 +154,7 @@ static void *l1_map[V_L1_MAX_SIZE];
/* code generation context */
TCGContext tcg_init_ctx;
-TCGContext *tcg_ctx;
+__thread TCGContext *tcg_ctx;
TBContext tb_ctx;
bool parallel_cpus;
diff --git a/cpus.c b/cpus.c
index 6022d40..74ddd49 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1307,6 +1307,7 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
CPUState *cpu = arg;
rcu_register_thread();
+ tcg_register_thread();
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
@@ -1454,6 +1455,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
g_assert(!use_icount);
rcu_register_thread();
+ tcg_register_thread();
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 003943b..bbf7913 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6214,6 +6214,7 @@ static void *clone_func(void *arg)
TaskState *ts;
rcu_register_thread();
+ tcg_register_thread();
env = info->env;
cpu = ENV_GET_CPU(env);
thread_cpu = cpu;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 22a949f..a5c01be 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -58,6 +58,7 @@
#include "elf.h"
#include "exec/log.h"
+#include "sysemu/sysemu.h"
/* Forward declarations for functions declared in tcg-target.inc.c and
used here. */
@@ -324,13 +325,14 @@ static inline bool tcg_region_initial_alloc__locked(TCGContext *s)
/* Call from a safe-work context */
void tcg_region_reset_all(void)
{
+ unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
unsigned int i;
qemu_mutex_lock(&region.lock);
region.current = 0;
region.n_full = 0;
- for (i = 0; i < n_tcg_ctxs; i++) {
+ for (i = 0; i < n_ctxs; i++) {
bool err = tcg_region_initial_alloc__locked(tcg_ctxs[i]);
g_assert(!err);
@@ -338,11 +340,71 @@ void tcg_region_reset_all(void)
qemu_mutex_unlock(&region.lock);
}
+#ifdef CONFIG_SOFTMMU
+/*
+ * It is likely that some vCPUs will translate more code than others, so we
+ * first try to set more regions than smp_cpus, with those regions being of
+ * reasonable size. If that's not possible we make do by evenly dividing
+ * the code_gen_buffer among the vCPUs.
+ */
+static size_t tcg_n_regions(void)
+{
+ size_t i;
+
+ /* Use a single region if all we have is one vCPU thread */
+ if (smp_cpus == 1 || !qemu_tcg_mttcg_enabled()) {
+ return 1;
+ }
+
+ /* Try to have more regions than smp_cpus, with each region being >= 2 MB */
+ for (i = 8; i > 0; i--) {
+ size_t regions_per_thread = i;
+ size_t region_size;
+
+ region_size = tcg_init_ctx.code_gen_buffer_size;
+ region_size /= smp_cpus * regions_per_thread;
+
+ if (region_size >= 2 * 1024u * 1024) {
+ return smp_cpus * regions_per_thread;
+ }
+ }
+ /* If we can't, then just allocate one region per vCPU thread */
+ return smp_cpus;
+}
+#else /* user-mode */
+static size_t tcg_n_regions(void)
+{
+ return 1;
+}
+#endif
+
/*
* Initializes region partitioning.
*
* Called at init time from the parent thread (i.e. the one calling
* tcg_context_init), after the target's TCG globals have been set.
+ *
+ * Region partitioning works by splitting code_gen_buffer into separate regions,
+ * and then assigning regions to TCG threads so that the threads can translate
+ * code in parallel without synchronization.
+ *
+ * In softmmu the number of TCG threads is bounded by smp_cpus, so we use at
+ * least smp_cpus regions in MTTCG. In !MTTCG we use a single region.
+ * Note that the TCG options from the command-line (i.e. -accel accel=tcg,[...])
+ * must have been parsed before calling this function, since it calls
+ * qemu_tcg_mttcg_enabled().
+ *
+ * In user-mode we use a single region. Having multiple regions in user-mode
+ * is not supported, because the number of vCPU threads (recall that each thread
+ * spawned by the guest corresponds to a vCPU thread) is only bounded by the
+ * OS, and usually this number is huge (tens of thousands is not uncommon).
+ * Thus, given this large bound on the number of vCPU threads and the fact
+ * that code_gen_buffer is allocated at compile-time, we cannot guarantee
+ * the availability of at least one region per vCPU thread.
+ *
+ * However, this user-mode limitation is unlikely to be a significant problem
+ * in practice. Multi-threaded guests share most if not all of their translated
+ * code, which makes parallel code generation less appealing than in softmmu.
*/
void tcg_region_init(void)
{
@@ -352,8 +414,7 @@ void tcg_region_init(void)
size_t n_regions;
size_t i;
- /* We do not yet support multiple TCG contexts, so use one region for now */
- n_regions = 1;
+ n_regions = tcg_n_regions();
/* start on a page-aligned address */
buf = QEMU_ALIGN_PTR_UP(buf, qemu_real_host_page_size);
@@ -387,13 +448,69 @@ void tcg_region_init(void)
g_assert(!rc);
}
- /* We do not yet support multiple TCG contexts so allocate the region now */
+ /* In user-mode we support only one ctx, so do the initial allocation now */
+#ifdef CONFIG_USER_ONLY
{
bool err = tcg_region_initial_alloc__locked(tcg_ctx);
g_assert(!err);
}
+#endif
+}
+
+/*
+ * All TCG threads except the parent (i.e. the one that called tcg_context_init
+ * and registered the target's TCG globals) must register with this function
+ * before initiating translation.
+ *
+ * In user-mode we just point tcg_ctx to tcg_init_ctx. See the documentation
+ * of tcg_region_init() for the reasoning behind this.
+ *
+ * In softmmu each caller registers its context in tcg_ctxs[]. Note that in
+ * softmmu tcg_ctxs[] does not track tcg_init_ctx, since the initial context
+ * is not used anymore for translation once this function is called.
+ *
+ * Not tracking tcg_init_ctx in tcg_ctxs[] in softmmu keeps code that iterates
+ * over the array (e.g. tcg_code_size()) the same for both softmmu and user-mode.
+ */
+#ifdef CONFIG_USER_ONLY
+void tcg_register_thread(void)
+{
+ tcg_ctx = &tcg_init_ctx;
+}
+#else
+void tcg_register_thread(void)
+{
+ TCGContext *s = g_malloc(sizeof(*s));
+ unsigned int n;
+ bool err;
+
+ memcpy(s, &tcg_init_ctx, sizeof(*s));
+
+ /* claim an entry in tcg_ctxs */
+ n = atomic_fetch_inc(&n_tcg_ctxs);
+ g_assert(n < smp_cpus);
+ tcg_ctxs[n] = s;
+
+ /*
+ * Zero out s->prof in all contexts but the first.
+ * This ensures that we correctly account for the profiling info
+ * generated during initialization, since tcg_init_ctx is not
+ * tracked by the array.
+ */
+ if (n != 0) {
+#ifdef CONFIG_PROFILER
+ memset(&s->prof, 0, sizeof(s->prof));
+#endif
+ }
+
+ tcg_ctx = s;
+ qemu_mutex_lock(&region.lock);
+ err = tcg_region_initial_alloc__locked(tcg_ctx);
+ g_assert(!err);
+ qemu_mutex_unlock(&region.lock);
}
+#endif /* !CONFIG_USER_ONLY */
/*
* Returns the size (in bytes) of all translated code (i.e. from all regions)
@@ -404,12 +521,13 @@ void tcg_region_init(void)
*/
size_t tcg_code_size(void)
{
+ unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
unsigned int i;
size_t total;
qemu_mutex_lock(&region.lock);
total = region.n_full * (region.size - TCG_HIGHWATER);
- for (i = 0; i < n_tcg_ctxs; i++) {
+ for (i = 0; i < n_ctxs; i++) {
const TCGContext *s = tcg_ctxs[i];
size_t size;
@@ -561,8 +679,18 @@ void tcg_context_init(TCGContext *s)
}
tcg_ctx = s;
+ /*
+ * In user-mode we simply share the init context among threads, since we
+ * use a single region. See the documentation of tcg_region_init() for the
+ * reasoning behind this.
+ * In softmmu we will have at most smp_cpus TCG threads.
+ */
+#ifdef CONFIG_USER_ONLY
tcg_ctxs = &tcg_ctx;
n_tcg_ctxs = 1;
+#else
+ tcg_ctxs = g_new(TCGContext *, smp_cpus);
+#endif
}
/*
@@ -2714,9 +2842,10 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
static inline
void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
{
+ unsigned int n_ctxs = atomic_read(&n_tcg_ctxs);
unsigned int i;
- for (i = 0; i < n_tcg_ctxs; i++) {
+ for (i = 0; i < n_ctxs; i++) {
const TCGProfile *orig = &tcg_ctxs[i]->prof;
if (counters) {
--
2.7.4
next prev parent reply other threads:[~2017-07-20 3:09 UTC|newest]
Thread overview: 63+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-07-20 3:08 [Qemu-devel] [PATCH v3 00/43] tcg: support for multiple TCG contexts Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 01/43] cputlb: bring back tlb_flush_count under !TLB_DEBUG Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 02/43] tcg: fix corruption of code_time profiling counter upon tb_flush Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 03/43] exec-all: fix typos in TranslationBlock's documentation Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 04/43] translate-all: make have_tb_lock static Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 05/43] cpu-exec: rename have_tb_lock to acquired_tb_lock in tb_find Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 06/43] tcg/i386: constify tcg_target_callee_save_regs Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 07/43] tcg/mips: " Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 08/43] tcg: remove addr argument from lookup_tb_ptr Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 09/43] tcg: consolidate TB lookups in tb_lookup__cpu_state Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 10/43] exec-all: bring tb->invalid into tb->cflags Emilio G. Cota
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 11/43] tcg: define CF_PARALLEL and use it for TB hashing Emilio G. Cota
2017-07-20 8:45 ` Richard Henderson
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 12/43] tcg: convert tb->cflags reads to tb_cflags(tb) Emilio G. Cota
2017-07-20 7:22 ` Richard Henderson
2017-07-20 3:08 ` [Qemu-devel] [PATCH v3 13/43] target/arm: check CF_PARALLEL instead of parallel_cpus Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 14/43] target/hppa: " Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 15/43] target/i386: " Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 16/43] target/m68k: " Emilio G. Cota
2017-07-20 7:23 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 17/43] target/s390x: " Emilio G. Cota
2017-07-20 7:25 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 18/43] target/sh4: " Emilio G. Cota
2017-07-20 7:26 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 19/43] target/sparc: " Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 20/43] tcg: " Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 21/43] cpu-exec: lookup/generate TB outside exclusive region during step_atomic Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 22/43] translate-all: define and use DEBUG_TB_FLUSH_GATE Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 23/43] exec-all: introduce TB_PAGE_ADDR_FMT Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 24/43] translate-all: define and use DEBUG_TB_INVALIDATE_GATE Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 25/43] translate-all: define and use DEBUG_TB_CHECK_GATE Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 26/43] exec-all: extract tb->tc_* into a separate struct tc_tb Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 27/43] translate-all: use a binary search tree to track TBs in TBContext Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 28/43] exec-all: rename tb_free to tb_remove Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 29/43] translate-all: report correct avg host TB size Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 30/43] tci: move tci_regs to tcg_qemu_tb_exec's stack Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 31/43] tcg: take tb_ctx out of TCGContext Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 32/43] tcg: take .helpers " Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 33/43] tcg: define tcg_init_ctx and make tcg_ctx a pointer Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 34/43] gen-icount: fold exitreq_label into TCGContext Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 35/43] tcg: dynamically allocate optimizer temps Emilio G. Cota
2017-07-20 7:39 ` Richard Henderson
2017-07-20 23:53 ` Emilio G. Cota
2017-07-21 0:02 ` Richard Henderson
2017-07-21 5:04 ` Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 36/43] tcg: introduce **tcg_ctxs to keep track of all TCGContext's Emilio G. Cota
2017-07-20 7:47 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 37/43] tcg: distribute profiling counters across TCGContext's Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 38/43] util: move qemu_real_host_page_size/mask to osdep.h Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 39/43] osdep: introduce qemu_mprotect_rwx/none Emilio G. Cota
2017-07-20 7:49 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 40/43] translate-all: use qemu_protect_rwx/none helpers Emilio G. Cota
2017-07-20 7:51 ` Richard Henderson
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 41/43] tcg: define TCG_HIGHWATER Emilio G. Cota
2017-07-20 3:09 ` [Qemu-devel] [PATCH v3 42/43] tcg: introduce regions to split code_gen_buffer Emilio G. Cota
2017-07-20 8:04 ` Richard Henderson
2017-07-20 20:50 ` Emilio G. Cota
2017-07-20 21:22 ` Richard Henderson
2017-07-20 23:23 ` Emilio G. Cota
2017-07-21 0:07 ` Richard Henderson
2017-07-20 3:09 ` Emilio G. Cota [this message]
2017-07-20 8:17 ` [Qemu-devel] [PATCH v3 43/43] tcg: enable multiple TCG contexts in softmmu Richard Henderson
2017-07-20 4:05 ` [Qemu-devel] [PATCH v3 00/43] tcg: support for multiple TCG contexts no-reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1500520169-23367-44-git-send-email-cota@braap.org \
--to=cota@braap.org \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).