From: Paolo Bonzini
Date: Mon, 30 Jun 2014 19:08:52 +0200
Subject: Re: [Qemu-devel] [RFC PATCH V3 4/6] cpu_exec: Add sleeping algorithm
Message-ID: <53B199A4.9090304@redhat.com>
In-Reply-To: <1404136749-523-5-git-send-email-sebastian.tanase@openwide.fr>
References: <1404136749-523-1-git-send-email-sebastian.tanase@openwide.fr>
 <1404136749-523-5-git-send-email-sebastian.tanase@openwide.fr>
To: Sebastian Tanase, qemu-devel@nongnu.org
Cc: kwolf@redhat.com, peter.maydell@linaro.org, aliguori@amazon.com,
 wenchaoqemu@gmail.com, quintela@redhat.com, mjt@tls.msk.ru, mst@redhat.com,
 stefanha@redhat.com, armbru@redhat.com, lcapitulino@redhat.com,
 michael@walle.cc, alex@alex.org.uk, crobinso@redhat.com, afaerber@suse.de,
 rth@twiddle.net

On 30/06/2014 15:59, Sebastian Tanase wrote:
> The goal is to sleep QEMU whenever the guest clock is ahead of the
> host clock (we use the monotonic clocks). The amount of time to sleep
> is calculated in the execution loop in cpu_exec.
>
> At first, we tried to approximate at each for loop the real time
> elapsed while searching for a TB (generating or retrieving from cache)
> and executing it. We would then approximate the virtual time
> corresponding to the number of virtual instructions executed. The
> difference between these 2 values would allow us to know if the guest
> is in advance or delayed. However, the function used for measuring the
> real time (qemu_clock_get_ns(QEMU_CLOCK_REALTIME)) proved to be very
> expensive. We had an added overhead of 13% of the total run time.
>
> Therefore, we modified the algorithm and only take into account the
> difference between the 2 clocks at the beginning of the cpu_exec
> function. During the for loop we try to reduce the advance of the
> guest only by computing the virtual time elapsed and sleeping if
> necessary. The overhead is thus reduced to 3%. Even though this method
> still has a noticeable overhead, it is no longer a bottleneck in
> trying to achieve a better guest frequency for which the guest clock
> is faster than the host one.
>
> As for the alignment of the 2 clocks, with the first algorithm the
> guest clock was oscillating between -1 and 1 ms compared to the host
> clock. Using the second algorithm we notice that the guest is 5 ms
> behind the host, which is still acceptable for our use case.
>
> The tests were conducted using fio and stress. The host machine is an
> i5 CPU at 3.10GHz running Debian Jessie (kernel 3.12). The guest
> machine is an arm versatile-pb built with buildroot.
>
> Currently, on our test machine, the lowest icount we can achieve that
> is suitable for aligning the 2 clocks is 6. However, we observe that
> the IO tests (using fio) are slower than the cpu tests (using stress).
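Just to make sure I follow the idea: stripped of the QEMU specifics, the
algorithm described above boils down to something like the standalone
sketch below. It uses plain POSIX clock_gettime/nanosleep instead of the
QEMU clock API and icount state, and host_ns/throttle_guest are made-up
names, so treat it purely as an illustration of the intent.

    #include <stdint.h>
    #include <time.h>

    /* Host monotonic time in nanoseconds. */
    static int64_t host_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    /* If the guest (virtual) clock has run ahead of the host clock,
     * sleep the difference away; if the guest is behind, do nothing
     * and let it catch up on its own. */
    static void throttle_guest(int64_t guest_ns, int64_t host_base_ns)
    {
        int64_t ahead = guest_ns - (host_ns() - host_base_ns);
        if (ahead > 0) {
            struct timespec delay = { .tv_sec  = ahead / 1000000000LL,
                                      .tv_nsec = ahead % 1000000000LL };
            nanosleep(&delay, NULL);  /* EINTR handling omitted here */
        }
    }

Here guest_ns is whatever the emulator thinks the virtual time is, and
host_base_ns is the host time captured when emulation started; the patch
does the equivalent bookkeeping with SyncClocks.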
> Signed-off-by: Sebastian Tanase
> Tested-by: Camille Bégué
> ---
>  cpu-exec.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 112 insertions(+)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 38e5f02..ac741b7 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -22,6 +22,102 @@
>  #include "tcg.h"
>  #include "qemu/atomic.h"
>  #include "sysemu/qtest.h"
> +#include "qemu/timer.h"
> +
> +/* Structs and function pointers for delaying the host */
> +typedef struct SyncClocks SyncClocks;
> +typedef void (*init_delay_func)(SyncClocks *sc,
> +                                const CPUState *cpu);
> +typedef void (*perform_align_func)(SyncClocks *sc,
> +                                   const CPUState *cpu);
> +struct SyncClocks {
> +    int64_t diff_clk;
> +    int64_t original_instr_counter;
> +    init_delay_func init_delay;
> +    perform_align_func perform_align;
> +};
> +
> +#if !defined(CONFIG_USER_ONLY)
> +/* Allow the guest to have a max 3ms advance.
> + * The difference between the 2 clocks could therefore
> + * oscillate around 0.
> + */
> +#define VM_CLOCK_ADVANCE 3000000
> +
> +static int64_t delay_host(int64_t diff_clk)
> +{
> +    struct timespec sleep_delay, rem_delay;
> +    if (diff_clk > VM_CLOCK_ADVANCE) {
> +        sleep_delay.tv_sec = diff_clk / 1000000000LL;
> +        sleep_delay.tv_nsec = diff_clk % 1000000000LL;
> +        if (nanosleep(&sleep_delay, &rem_delay) < 0) {
> +            diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * 1000000000LL;
> +            diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec;
> +        } else {
> +            diff_clk = 0;
> +        }
> +    }
> +    return diff_clk;
> +}
> +
> +static int64_t instr_to_vtime(int64_t instr_counter, const CPUState *cpu)
> +{
> +    int64_t instr_exec_time;
> +    instr_exec_time = instr_counter -
> +                      (cpu->icount_extra +
> +                       cpu->icount_decr.u16.low);
> +    instr_exec_time = instr_exec_time << icount_time_shift;
> +
> +    return instr_exec_time;
> +}
> +
> +static void align_clocks(SyncClocks *sc, const CPUState *cpu)
> +{
> +    if (!icount_align_option) {
> +        return;
> +    }
> +    sc->diff_clk += instr_to_vtime(sc->original_instr_counter, cpu);
> +    sc->original_instr_counter = cpu->icount_extra + cpu->icount_decr.u16.low;
> +    sc->diff_clk = delay_host(sc->diff_clk);
> +}
> +
> +static void init_delay_params(SyncClocks *sc,
> +                              const CPUState *cpu)
> +{
> +    static int64_t clocks_offset = -1;
> +    int64_t realtime_clock_value, virtual_clock_value;
> +    if (!icount_align_option) {
> +        return;
> +    }
> +    /* On x86 target architecture, the PIT reset function (called
> +       by qemu_system_reset) will end up calling qemu_clock_warp
> +       and then icount_warp_rt changing vm_clock_warp_start from 0 (initial
> +       value) to -1. This in turn will make us skip the initial offset
> +       between the real and virtual clocks (initially virtual clock is 0).
> +       Therefore we impose that the first time we run the cpu
> +       the host and virtual clocks should be aligned; we don't alter any of
> +       the clocks, we just calculate the difference between them. */

I'm not sure if these gory details are really relevant.  The point, I
think, is basically that the bases of QEMU_CLOCK_REALTIME and
QEMU_CLOCK_VIRTUAL differ: QEMU_CLOCK_REALTIME is based at the Unix
epoch, while QEMU_CLOCK_VIRTUAL is based at the time QEMU starts.
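Put differently, a short comment plus something along these lines would
capture the whole story.  This is only a sketch of the intent (the
clock_base_offset name is made up, and I am not suggesting this as the
final shape of the code); it relies on the existing qemu_clock_get_ns()
API from "qemu/timer.h":

    /* QEMU_CLOCK_REALTIME is based at the Unix epoch, while
     * QEMU_CLOCK_VIRTUAL starts from 0 when QEMU starts, so record the
     * difference between the two bases once and fold it into every
     * later comparison. */
    static int64_t clock_base_offset(void)
    {
        static int64_t offset = -1;  /* -1 = not computed yet; the real
                                        offset is always positive */
        if (offset == -1) {
            offset = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) -
                     qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
        }
        return offset;
    }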
> +    realtime_clock_value = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +    virtual_clock_value = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
> +    if (clocks_offset == -1) {
> +        clocks_offset = realtime_clock_value - virtual_clock_value;
> +    }
> +    sc->diff_clk = virtual_clock_value - realtime_clock_value + clocks_offset;
> +    sc->original_instr_counter = cpu->icount_extra + cpu->icount_decr.u16.low;
> +}
> +#else
> +/* We don't use the align feature for User emulation
> +   thus we add empty functions which shall be ignored
> +   by the compiler */
> +static void align_clocks(SyncClocks *sc, const CPUState *cpu)
> +{
> +}
> +
> +static void init_delay_params(SyncClocks *sc,
> +                              const CPUState *cpu)
> +{
> +}
> +#endif /* CONFIG USER ONLY */
>
>  void cpu_loop_exit(CPUState *cpu)
>  {
> @@ -227,6 +323,11 @@ int cpu_exec(CPUArchState *env)
>      TranslationBlock *tb;
>      uint8_t *tc_ptr;
>      uintptr_t next_tb;
> +    /* Delay algorithm */
> +    static SyncClocks sc = {

This need not be static.

> +        .init_delay = init_delay_params,
> +        .perform_align = align_clocks
> +    };
>      /* This must be volatile so it is not trashed by longjmp() */
>      volatile bool have_tb_lock = false;
>
> @@ -283,6 +384,11 @@ int cpu_exec(CPUArchState *env)
>  #endif
>      cpu->exception_index = -1;
>
> +    /* Calculate difference between guest clock and host clock.
> +       This delay includes the delay of the last cycle, so
> +       what we have to do is sleep until it is 0. As for the
> +       advance/delay we gain here, we try to fix it next time. */
> +    sc.init_delay(&sc, cpu);
>      /* prepare setjmp context for exception handling */
>      for(;;) {
>          if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> @@ -672,6 +778,9 @@ int cpu_exec(CPUArchState *env)
>                          if (insns_left > 0) {
>                              /* Execute remaining instructions. */
>                              cpu_exec_nocache(env, insns_left, tb);
> +                            /* Try to align the host and virtual clocks
> +                               if the guest is in advance. */
> +                            sc.perform_align(&sc, cpu);
>                          }
>                          cpu->exception_index = EXCP_INTERRUPT;
>                          next_tb = 0;
> @@ -684,6 +793,9 @@ int cpu_exec(CPUArchState *env)
>                  }
>              }
>              cpu->current_tb = NULL;
> +            /* Try to align the host and virtual clocks
> +               if the guest is in advance */
> +            sc.perform_align(&sc, cpu);
>              /* reset soft MMU for next block (it can currently
>                 only be set by a memory fault) */
>          } /* for(;;) */
>
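One small remark on the cover-letter numbers rather than the code: with
-icount N, each completed guest instruction accounts for 2^N ns of
virtual time, which is what the icount_time_shift used by instr_to_vtime
expresses.  A throwaway helper just to spell out the arithmetic
(executed_to_ns is a made-up name, not part of the patch):

    /* Virtual nanoseconds attributed to a batch of executed
     * instructions under -icount N. */
    static int64_t executed_to_ns(int64_t executed_insns, int icount_shift)
    {
        return executed_insns << icount_shift;  /* N=6 -> 64 ns/insn */
    }

So at the reported minimum of icount 6, the guest retires at most about
10^9 / 64 ~ 15.6 million instructions per second of virtual time; that
might be worth stating explicitly in the commit message.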