qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie
@ 2012-10-04 21:31 Richard Henderson
  2012-10-07 16:34 ` Blue Swirl
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2012-10-04 21:31 UTC (permalink / raw)
  To: qemu-devel

The hard-coded addresses inside code_gen_alloc only make sense if
we're building an executable that will actually run at the address
we've put into the linker scripts.

When we're building with -fpie, the executable will run at some
random location chosen by the kernel.  We get better placement for
the code_gen_buffer if we allow the kernel to place the memory,
as it will tend to to place it near the executable, based on the
PROT_EXEC bit.

Since code_gen_prologue is always inside the executable, this effect
is easily seen at the end of most TB, with the exit_tb opcode:

Before:
0x40b82024:  mov    $0x7fa97bd5c296,%r10
0x40b8202e:  jmpq   *%r10

After:
0x7f1191ff1024:  jmpq   0x7f119edc0296

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec.c | 127 +++++++++++++++++++++++++++++++----------------------------------
 1 file changed, 60 insertions(+), 67 deletions(-)

diff --git a/exec.c b/exec.c
index bb6aa4a..0ddc07a 100644
--- a/exec.c
+++ b/exec.c
@@ -510,6 +510,14 @@ static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
                __attribute__((aligned (CODE_GEN_ALIGN)));
 #endif
 
+/* ??? Should configure for this not list operating systems here.  */
+#if defined(__linux__) \
+    || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
+    || defined(__DragonFly__) || defined(__OpenBSD__) \
+    || defined(__NetBSD__)
+# define USE_MMAP
+#endif
+
 static void code_gen_alloc(unsigned long tb_size)
 {
 #ifdef USE_STATIC_CODE_GEN_BUFFER
@@ -517,6 +525,45 @@ static void code_gen_alloc(unsigned long tb_size)
     code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
     map_exec(code_gen_buffer, code_gen_buffer_size);
 #else
+#ifdef USE_MMAP
+    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#endif
+    uintptr_t max_buf = -1, start = 0;
+
+    /* Constrain the size and position of the buffer based on the host cpu.  */
+#if defined(__x86_64__)
+# if !defined(__PIE__) && !defined(__PIC__) && defined(MAP_32BIT)
+    /* Force the memory down into low memory with the executable.
+       Leave the choice of exact location with the kernel.  */
+    flags |= MAP_32BIT;
+    /* Cannot expect to map more than 800MB in low memory.  */
+    max_buf = 800 * 1024 * 1024;
+# else
+    /* Maximum range of direct branches.  */
+    max_buf = 2ul * 1024 * 1024 * 1024;
+# endif
+#elif defined(__sparc__) && HOST_LONG_BITS == 64
+    /* Maximum range of direct branches between TB (via "call").  */
+    max_buf = 2ul * 1024 * 1024 * 1024;
+    start = 0x40000000ul;
+#elif defined(__arm__)
+    /* Keep the buffer no bigger than 16MB to branch between blocks */
+    max_buf = 16 * 1024 * 1024;
+#elif defined(__s390x__)
+    /* Map the buffer so that we can use direct calls and branches.  */
+    /* We have a +- 4GB range on the branches; leave some slop.  */
+    max_buf = 3ul * 1024 * 1024 * 1024;
+    start = 0x90000000ul;
+#endif
+#if defined(__PIE__) || defined(__PIC__)
+    /* Don't bother setting a preferred location if we're building
+       a position-independent executable.  We're more likely to get
+       an address near the main executable if we let the kernel
+       choose the address.  */
+    start = 0;
+#endif
+
+    /* Size the buffer.  */
     code_gen_buffer_size = tb_size;
     if (code_gen_buffer_size == 0) {
 #if defined(CONFIG_USER_ONLY)
@@ -526,81 +573,27 @@ static void code_gen_alloc(unsigned long tb_size)
         code_gen_buffer_size = (unsigned long)(ram_size / 4);
 #endif
     }
-    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE)
+    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE) {
         code_gen_buffer_size = MIN_CODE_GEN_BUFFER_SIZE;
-    /* The code gen buffer location may have constraints depending on
-       the host cpu and OS */
-#if defined(__linux__) 
-    {
-        int flags;
-        void *start = NULL;
-
-        flags = MAP_PRIVATE | MAP_ANONYMOUS;
-#if defined(__x86_64__)
-        flags |= MAP_32BIT;
-        /* Cannot map more than that */
-        if (code_gen_buffer_size > (800 * 1024 * 1024))
-            code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc__) && HOST_LONG_BITS == 64
-        // Map the buffer below 2G, so we can use direct calls and branches
-        start = (void *) 0x40000000UL;
-        if (code_gen_buffer_size > (512 * 1024 * 1024))
-            code_gen_buffer_size = (512 * 1024 * 1024);
-#elif defined(__arm__)
-        /* Keep the buffer no bigger than 16MB to branch between blocks */
-        if (code_gen_buffer_size > 16 * 1024 * 1024)
-            code_gen_buffer_size = 16 * 1024 * 1024;
-#elif defined(__s390x__)
-        /* Map the buffer so that we can use direct calls and branches.  */
-        /* We have a +- 4GB range on the branches; leave some slop.  */
-        if (code_gen_buffer_size > (3ul * 1024 * 1024 * 1024)) {
-            code_gen_buffer_size = 3ul * 1024 * 1024 * 1024;
-        }
-        start = (void *)0x90000000UL;
-#endif
-        code_gen_buffer = mmap(start, code_gen_buffer_size,
-                               PROT_WRITE | PROT_READ | PROT_EXEC,
-                               flags, -1, 0);
-        if (code_gen_buffer == MAP_FAILED) {
-            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
-            exit(1);
-        }
     }
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
-    || defined(__DragonFly__) || defined(__OpenBSD__) \
-    || defined(__NetBSD__)
-    {
-        int flags;
-        void *addr = NULL;
-        flags = MAP_PRIVATE | MAP_ANONYMOUS;
-#if defined(__x86_64__)
-        /* FreeBSD doesn't have MAP_32BIT, use MAP_FIXED and assume
-         * 0x40000000 is free */
-        flags |= MAP_FIXED;
-        addr = (void *)0x40000000;
-        /* Cannot map more than that */
-        if (code_gen_buffer_size > (800 * 1024 * 1024))
-            code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc__) && HOST_LONG_BITS == 64
-        // Map the buffer below 2G, so we can use direct calls and branches
-        addr = (void *) 0x40000000UL;
-        if (code_gen_buffer_size > (512 * 1024 * 1024)) {
-            code_gen_buffer_size = (512 * 1024 * 1024);
-        }
-#endif
-        code_gen_buffer = mmap(addr, code_gen_buffer_size,
-                               PROT_WRITE | PROT_READ | PROT_EXEC, 
-                               flags, -1, 0);
-        if (code_gen_buffer == MAP_FAILED) {
-            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
-            exit(1);
-        }
+    if (code_gen_buffer_size > max_buf) {
+        code_gen_buffer_size = max_buf;
+    }
+
+#ifdef USE_MMAP
+    code_gen_buffer = mmap((void *)start, code_gen_buffer_size,
+                           PROT_WRITE | PROT_READ | PROT_EXEC,
+                           flags, -1, 0);
+    if (code_gen_buffer == MAP_FAILED) {
+        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
+        exit(1);
     }
 #else
     code_gen_buffer = g_malloc(code_gen_buffer_size);
     map_exec(code_gen_buffer, code_gen_buffer_size);
 #endif
 #endif /* !USE_STATIC_CODE_GEN_BUFFER */
+
     map_exec(code_gen_prologue, sizeof(code_gen_prologue));
     code_gen_buffer_max_size = code_gen_buffer_size -
         (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie
  2012-10-04 21:31 [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie Richard Henderson
@ 2012-10-07 16:34 ` Blue Swirl
  2012-10-07 19:20   ` Richard Henderson
  0 siblings, 1 reply; 12+ messages in thread
From: Blue Swirl @ 2012-10-07 16:34 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Thu, Oct 4, 2012 at 9:31 PM, Richard Henderson <rth@twiddle.net> wrote:
> The hard-coded addresses inside code_gen_alloc only make sense if
> we're building an executable that will actually run at the address
> we've put into the linker scripts.
>
> When we're building with -fpie, the executable will run at some
> random location chosen by the kernel.  We get better placement for
> the code_gen_buffer if we allow the kernel to place the memory,
> as it will tend to to place it near the executable, based on the
> PROT_EXEC bit.
>
> Since code_gen_prologue is always inside the executable, this effect
> is easily seen at the end of most TB, with the exit_tb opcode:
>
> Before:
> 0x40b82024:  mov    $0x7fa97bd5c296,%r10
> 0x40b8202e:  jmpq   *%r10
>
> After:
> 0x7f1191ff1024:  jmpq   0x7f119edc0296
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  exec.c | 127 +++++++++++++++++++++++++++++++----------------------------------
>  1 file changed, 60 insertions(+), 67 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index bb6aa4a..0ddc07a 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -510,6 +510,14 @@ static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
>                 __attribute__((aligned (CODE_GEN_ALIGN)));
>  #endif
>
> +/* ??? Should configure for this not list operating systems here.  */
> +#if defined(__linux__) \
> +    || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
> +    || defined(__DragonFly__) || defined(__OpenBSD__) \
> +    || defined(__NetBSD__)
> +# define USE_MMAP
> +#endif
> +
>  static void code_gen_alloc(unsigned long tb_size)
>  {
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> @@ -517,6 +525,45 @@ static void code_gen_alloc(unsigned long tb_size)
>      code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
>      map_exec(code_gen_buffer, code_gen_buffer_size);
>  #else
> +#ifdef USE_MMAP
> +    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +#endif
> +    uintptr_t max_buf = -1, start = 0;
> +
> +    /* Constrain the size and position of the buffer based on the host cpu.  */
> +#if defined(__x86_64__)
> +# if !defined(__PIE__) && !defined(__PIC__) && defined(MAP_32BIT)
> +    /* Force the memory down into low memory with the executable.
> +       Leave the choice of exact location with the kernel.  */
> +    flags |= MAP_32BIT;
> +    /* Cannot expect to map more than 800MB in low memory.  */
> +    max_buf = 800 * 1024 * 1024;
> +# else
> +    /* Maximum range of direct branches.  */
> +    max_buf = 2ul * 1024 * 1024 * 1024;
> +# endif
> +#elif defined(__sparc__) && HOST_LONG_BITS == 64
> +    /* Maximum range of direct branches between TB (via "call").  */
> +    max_buf = 2ul * 1024 * 1024 * 1024;
> +    start = 0x40000000ul;
> +#elif defined(__arm__)
> +    /* Keep the buffer no bigger than 16MB to branch between blocks */
> +    max_buf = 16 * 1024 * 1024;
> +#elif defined(__s390x__)
> +    /* Map the buffer so that we can use direct calls and branches.  */
> +    /* We have a +- 4GB range on the branches; leave some slop.  */
> +    max_buf = 3ul * 1024 * 1024 * 1024;
> +    start = 0x90000000ul;
> +#endif
> +#if defined(__PIE__) || defined(__PIC__)
> +    /* Don't bother setting a preferred location if we're building
> +       a position-independent executable.  We're more likely to get
> +       an address near the main executable if we let the kernel
> +       choose the address.  */
> +    start = 0;
> +#endif
> +
> +    /* Size the buffer.  */
>      code_gen_buffer_size = tb_size;
>      if (code_gen_buffer_size == 0) {
>  #if defined(CONFIG_USER_ONLY)
> @@ -526,81 +573,27 @@ static void code_gen_alloc(unsigned long tb_size)
>          code_gen_buffer_size = (unsigned long)(ram_size / 4);
>  #endif
>      }
> -    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE)
> +    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE) {
>          code_gen_buffer_size = MIN_CODE_GEN_BUFFER_SIZE;
> -    /* The code gen buffer location may have constraints depending on
> -       the host cpu and OS */
> -#if defined(__linux__)
> -    {
> -        int flags;
> -        void *start = NULL;
> -
> -        flags = MAP_PRIVATE | MAP_ANONYMOUS;
> -#if defined(__x86_64__)
> -        flags |= MAP_32BIT;
> -        /* Cannot map more than that */
> -        if (code_gen_buffer_size > (800 * 1024 * 1024))
> -            code_gen_buffer_size = (800 * 1024 * 1024);
> -#elif defined(__sparc__) && HOST_LONG_BITS == 64
> -        // Map the buffer below 2G, so we can use direct calls and branches
> -        start = (void *) 0x40000000UL;
> -        if (code_gen_buffer_size > (512 * 1024 * 1024))
> -            code_gen_buffer_size = (512 * 1024 * 1024);
> -#elif defined(__arm__)
> -        /* Keep the buffer no bigger than 16MB to branch between blocks */
> -        if (code_gen_buffer_size > 16 * 1024 * 1024)
> -            code_gen_buffer_size = 16 * 1024 * 1024;
> -#elif defined(__s390x__)
> -        /* Map the buffer so that we can use direct calls and branches.  */
> -        /* We have a +- 4GB range on the branches; leave some slop.  */
> -        if (code_gen_buffer_size > (3ul * 1024 * 1024 * 1024)) {
> -            code_gen_buffer_size = 3ul * 1024 * 1024 * 1024;
> -        }
> -        start = (void *)0x90000000UL;
> -#endif
> -        code_gen_buffer = mmap(start, code_gen_buffer_size,
> -                               PROT_WRITE | PROT_READ | PROT_EXEC,
> -                               flags, -1, 0);
> -        if (code_gen_buffer == MAP_FAILED) {
> -            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> -            exit(1);
> -        }
>      }
> -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
> -    || defined(__DragonFly__) || defined(__OpenBSD__) \
> -    || defined(__NetBSD__)
> -    {
> -        int flags;
> -        void *addr = NULL;
> -        flags = MAP_PRIVATE | MAP_ANONYMOUS;
> -#if defined(__x86_64__)
> -        /* FreeBSD doesn't have MAP_32BIT, use MAP_FIXED and assume
> -         * 0x40000000 is free */
> -        flags |= MAP_FIXED;
> -        addr = (void *)0x40000000;
> -        /* Cannot map more than that */
> -        if (code_gen_buffer_size > (800 * 1024 * 1024))
> -            code_gen_buffer_size = (800 * 1024 * 1024);
> -#elif defined(__sparc__) && HOST_LONG_BITS == 64
> -        // Map the buffer below 2G, so we can use direct calls and branches
> -        addr = (void *) 0x40000000UL;
> -        if (code_gen_buffer_size > (512 * 1024 * 1024)) {
> -            code_gen_buffer_size = (512 * 1024 * 1024);
> -        }
> -#endif
> -        code_gen_buffer = mmap(addr, code_gen_buffer_size,
> -                               PROT_WRITE | PROT_READ | PROT_EXEC,
> -                               flags, -1, 0);
> -        if (code_gen_buffer == MAP_FAILED) {
> -            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> -            exit(1);
> -        }
> +    if (code_gen_buffer_size > max_buf) {
> +        code_gen_buffer_size = max_buf;
> +    }
> +
> +#ifdef USE_MMAP
> +    code_gen_buffer = mmap((void *)start, code_gen_buffer_size,
> +                           PROT_WRITE | PROT_READ | PROT_EXEC,
> +                           flags, -1, 0);
> +    if (code_gen_buffer == MAP_FAILED) {
> +        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> +        exit(1);
>      }
>  #else
>      code_gen_buffer = g_malloc(code_gen_buffer_size);
>      map_exec(code_gen_buffer, code_gen_buffer_size);

In this branch (e.g. mingw32), 'start' is unused:
/src/qemu/exec.c: In function 'code_gen_alloc':
/src/qemu/exec.c:531: warning: unused variable 'start'

>  #endif
>  #endif /* !USE_STATIC_CODE_GEN_BUFFER */
> +
>      map_exec(code_gen_prologue, sizeof(code_gen_prologue));
>      code_gen_buffer_max_size = code_gen_buffer_size -
>          (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
> --
> 1.7.11.4
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie
  2012-10-07 16:34 ` Blue Swirl
@ 2012-10-07 19:20   ` Richard Henderson
  2012-10-07 19:40     ` Blue Swirl
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2012-10-07 19:20 UTC (permalink / raw)
  To: Blue Swirl; +Cc: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 945 bytes --]

On 10/07/2012 09:34 AM, Blue Swirl wrote:
>> > +#ifdef USE_MMAP
>> > +    code_gen_buffer = mmap((void *)start, code_gen_buffer_size,
>> > +                           PROT_WRITE | PROT_READ | PROT_EXEC,
>> > +                           flags, -1, 0);
>> > +    if (code_gen_buffer == MAP_FAILED) {
>> > +        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
>> > +        exit(1);
>> >      }
>> >  #else
>> >      code_gen_buffer = g_malloc(code_gen_buffer_size);
>> >      map_exec(code_gen_buffer, code_gen_buffer_size);
> In this branch (e.g. mingw32), 'start' is unused:
> /src/qemu/exec.c: In function 'code_gen_alloc':
> /src/qemu/exec.c:531: warning: unused variable 'start'

Well, I've rearranged the code to handle this, and it does avoid the warning.
But I'm not sure I like the two separate blocks.  Especially for the x86_64
MAP32 case.  Perhaps we're better off with an __attribute__((unused)) there?


r~





[-- Attachment #2: z --]
[-- Type: text/plain, Size: 2993 bytes --]

diff --git a/exec.c b/exec.c
index 8f3bc74..704426c 100644
--- a/exec.c
+++ b/exec.c
@@ -527,15 +527,14 @@ static void code_gen_alloc(unsigned long tb_size)
 #else
 #ifdef USE_MMAP
     int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+    uintptr_t start = 0;
 #endif
-    uintptr_t max_buf = -1, start = 0;
+    size_t max_buf = ~(size_t)0;
 
-    /* Constrain the size and position of the buffer based on the host cpu.  */
+    /* Constrain the size of the buffer based on the host cpu.  */
 #if defined(__x86_64__)
-# if !defined(__PIE__) && !defined(__PIC__) && defined(MAP_32BIT)
-    /* Force the memory down into low memory with the executable.
-       Leave the choice of exact location with the kernel.  */
-    flags |= MAP_32BIT;
+# if !defined(__PIE__) && !defined(__PIC__) \
+     && defined(USE_MMAP) && defined(MAP_32BIT)
     /* Cannot expect to map more than 800MB in low memory.  */
     max_buf = 800 * 1024 * 1024;
 # else
@@ -545,22 +544,12 @@ static void code_gen_alloc(unsigned long tb_size)
 #elif defined(__sparc__) && HOST_LONG_BITS == 64
     /* Maximum range of direct branches between TB (via "call").  */
     max_buf = 2ul * 1024 * 1024 * 1024;
-    start = 0x40000000ul;
 #elif defined(__arm__)
     /* Keep the buffer no bigger than 16MB to branch between blocks */
     max_buf = 16 * 1024 * 1024;
 #elif defined(__s390x__)
-    /* Map the buffer so that we can use direct calls and branches.  */
     /* We have a +- 4GB range on the branches; leave some slop.  */
     max_buf = 3ul * 1024 * 1024 * 1024;
-    start = 0x90000000ul;
-#endif
-#if defined(__PIE__) || defined(__PIC__)
-    /* Don't bother setting a preferred location if we're building
-       a position-independent executable.  We're more likely to get
-       an address near the main executable if we let the kernel
-       choose the address.  */
-    start = 0;
 #endif
 
     /* Size the buffer.  */
@@ -581,6 +570,23 @@ static void code_gen_alloc(unsigned long tb_size)
     }
 
 #ifdef USE_MMAP
+    /* Constrain the position of the buffer based on the host cpu.
+       Note that these addresses are chosen in concert with the
+       addresses assigned in the relevant linker script file.  */
+# if defined(__PIE__) || defined(__PIC__)
+    /* Don't bother setting a preferred location if we're building
+       a position-independent executable.  We're more likely to get
+       an address near the main executable if we let the kernel
+       choose the address.  */
+# elif defined(__x86_64__) && defined(MAP_32BIT)
+    /* Force the memory down into low memory with the executable.
+       Leave the choice of exact location with the kernel.  */
+    flags |= MAP_32BIT;
+# elif defined(__sparc__) && HOST_LONG_BITS == 64
+    start = 0x40000000ul;
+# elif defined(__s390x__)
+    start = 0x90000000ul;
+# endif
     code_gen_buffer = mmap((void *)start, code_gen_buffer_size,
                            PROT_WRITE | PROT_READ | PROT_EXEC,
                            flags, -1, 0);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie
  2012-10-07 19:20   ` Richard Henderson
@ 2012-10-07 19:40     ` Blue Swirl
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
  0 siblings, 1 reply; 12+ messages in thread
From: Blue Swirl @ 2012-10-07 19:40 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Sun, Oct 7, 2012 at 7:20 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 10/07/2012 09:34 AM, Blue Swirl wrote:
>>> > +#ifdef USE_MMAP
>>> > +    code_gen_buffer = mmap((void *)start, code_gen_buffer_size,
>>> > +                           PROT_WRITE | PROT_READ | PROT_EXEC,
>>> > +                           flags, -1, 0);
>>> > +    if (code_gen_buffer == MAP_FAILED) {
>>> > +        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
>>> > +        exit(1);
>>> >      }
>>> >  #else
>>> >      code_gen_buffer = g_malloc(code_gen_buffer_size);
>>> >      map_exec(code_gen_buffer, code_gen_buffer_size);
>> In this branch (e.g. mingw32), 'start' is unused:
>> /src/qemu/exec.c: In function 'code_gen_alloc':
>> /src/qemu/exec.c:531: warning: unused variable 'start'
>
> Well, I've rearranged the code to handle this, and it does avoid the warning.
> But I'm not sure I like the two separate blocks.  Especially for the x86_64
> MAP32 case.  Perhaps we're better off with an __attribute__((unused)) there?

How about splitting the function into three: common part, mmap case
and non-mmap case? It could improve readability.

>
>
> r~
>
>
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie
  2012-10-07 19:40     ` Blue Swirl
@ 2012-10-12 21:20       ` Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer Richard Henderson
                           ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

This revision of the patch set takes Blue's suggestion to split up
code_gen_alloc into several pieces.  It does seem to clean things
up a bit.

The first patch is cleanup, doing the split.  The third patch does
in one line what I was trying to accomplish with the first revision
of this patch.  The second and fourth patches are new.

The patch set is available from

  git://github.com/rth7680/qemu.git exec


r~


Richard Henderson (4):
  exec: Split up and tidy code_gen_buffer
  exec: Don't make DEFAULT_CODE_GEN_BUFFER_SIZE too large
  exec: Do not use absolute address hints for code_gen_buffer with
    -fpie
  exec: Allocate code_gen_prologue from code_gen_buffer

 exec.c    | 232 +++++++++++++++++++++++++++++++++-----------------------------
 tcg/tcg.h |   2 +-
 2 files changed, 123 insertions(+), 111 deletions(-)

-- 
1.7.11.7

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
@ 2012-10-12 21:20         ` Richard Henderson
  2012-10-13 13:33           ` Blue Swirl
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 2/4] exec: Don't make DEFAULT_CODE_GEN_BUFFER_SIZE too large Richard Henderson
                           ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

It now consists of:

A macro definition of MAX_CODE_GEN_BUFFER_SIZE with host-specific values,

A function size_code_gen_buffer that applies most of the reasoning for
choosing a buffer size,

Three variations of a function alloc_code_gen_buffer that contain all
of the logic for allocating executable memory via a given allocation
mechanism.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec.c | 193 ++++++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 102 insertions(+), 91 deletions(-)

diff --git a/exec.c b/exec.c
index 7899042..db735dd 100644
--- a/exec.c
+++ b/exec.c
@@ -103,9 +103,9 @@ spinlock_t tb_lock = SPIN_LOCK_UNLOCKED;
 
 uint8_t code_gen_prologue[1024] code_gen_section;
 static uint8_t *code_gen_buffer;
-static unsigned long code_gen_buffer_size;
+static size_t code_gen_buffer_size;
 /* threshold to flush the translated code buffer */
-static unsigned long code_gen_buffer_max_size;
+static size_t code_gen_buffer_max_size;
 static uint8_t *code_gen_ptr;
 
 #if !defined(CONFIG_USER_ONLY)
@@ -497,110 +497,121 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
 #define mmap_unlock() do { } while(0)
 #endif
 
-#define DEFAULT_CODE_GEN_BUFFER_SIZE (32 * 1024 * 1024)
-
 #if defined(CONFIG_USER_ONLY)
 /* Currently it is not recommended to allocate big chunks of data in
-   user mode. It will change when a dedicated libc will be used */
+   user mode. It will change when a dedicated libc will be used.  */
+/* ??? 64-bit hosts ought to have no problem mmaping data outside the
+   region in which the guest needs to run.  Revisit this.  */
 #define USE_STATIC_CODE_GEN_BUFFER
 #endif
 
-#ifdef USE_STATIC_CODE_GEN_BUFFER
-static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
-               __attribute__((aligned (CODE_GEN_ALIGN)));
+/* ??? Should configure for this, not list operating systems here.  */
+#if (defined(__linux__) \
+    || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
+    || defined(__DragonFly__) || defined(__OpenBSD__) \
+    || defined(__NetBSD__))
+# define USE_MMAP
 #endif
 
-static void code_gen_alloc(unsigned long tb_size)
+/* Maximum size of the code gen buffer we'd like to use.  Unless otherwise
+   indicated, this is constrained by the range of direct branches on the
+   host cpu, as used by the TCG implementation of goto_tb.  */
+#if defined(__x86_64__)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+#elif defined(__sparc__)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+#elif defined(__arm__)
+# define MAX_CODE_GEN_BUFFER_SIZE  (16u * 1024 * 1024)
+#elif defined(__s390x__)
+  /* We have a +- 4GB range on the branches; leave some slop.  */
+# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
+#else
+# define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
+#endif
+
+#define DEFAULT_CODE_GEN_BUFFER_SIZE (32u * 1024 * 1024)
+
+static inline size_t size_code_gen_buffer(size_t tb_size)
 {
+    /* Size the buffer.  */
+    if (tb_size == 0) {
 #ifdef USE_STATIC_CODE_GEN_BUFFER
-    code_gen_buffer = static_code_gen_buffer;
-    code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
-    map_exec(code_gen_buffer, code_gen_buffer_size);
-#else
-    code_gen_buffer_size = tb_size;
-    if (code_gen_buffer_size == 0) {
-#if defined(CONFIG_USER_ONLY)
-        code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
+        tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
 #else
-        /* XXX: needs adjustments */
-        code_gen_buffer_size = (unsigned long)(ram_size / 4);
+        /* ??? Needs adjustments.  */
+        /* ??? If we relax the requirement that CONFIG_USER_ONLY use the
+           static buffer, we could size this on RESERVED_VA, on the text
+           segment size of the executable, or continue to use the default.  */
+        tb_size = (unsigned long)(ram_size / 4);
 #endif
     }
-    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE)
-        code_gen_buffer_size = MIN_CODE_GEN_BUFFER_SIZE;
-    /* The code gen buffer location may have constraints depending on
-       the host cpu and OS */
-#if defined(__linux__) 
-    {
-        int flags;
-        void *start = NULL;
-
-        flags = MAP_PRIVATE | MAP_ANONYMOUS;
-#if defined(__x86_64__)
-        flags |= MAP_32BIT;
-        /* Cannot map more than that */
-        if (code_gen_buffer_size > (800 * 1024 * 1024))
-            code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc__) && HOST_LONG_BITS == 64
-        // Map the buffer below 2G, so we can use direct calls and branches
-        start = (void *) 0x40000000UL;
-        if (code_gen_buffer_size > (512 * 1024 * 1024))
-            code_gen_buffer_size = (512 * 1024 * 1024);
-#elif defined(__arm__)
-        /* Keep the buffer no bigger than 16MB to branch between blocks */
-        if (code_gen_buffer_size > 16 * 1024 * 1024)
-            code_gen_buffer_size = 16 * 1024 * 1024;
-#elif defined(__s390x__)
-        /* Map the buffer so that we can use direct calls and branches.  */
-        /* We have a +- 4GB range on the branches; leave some slop.  */
-        if (code_gen_buffer_size > (3ul * 1024 * 1024 * 1024)) {
-            code_gen_buffer_size = 3ul * 1024 * 1024 * 1024;
-        }
-        start = (void *)0x90000000UL;
-#endif
-        code_gen_buffer = mmap(start, code_gen_buffer_size,
-                               PROT_WRITE | PROT_READ | PROT_EXEC,
-                               flags, -1, 0);
-        if (code_gen_buffer == MAP_FAILED) {
-            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
-            exit(1);
-        }
+    if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
+        tb_size = MIN_CODE_GEN_BUFFER_SIZE;
     }
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
-    || defined(__DragonFly__) || defined(__OpenBSD__) \
-    || defined(__NetBSD__)
-    {
-        int flags;
-        void *addr = NULL;
-        flags = MAP_PRIVATE | MAP_ANONYMOUS;
-#if defined(__x86_64__)
-        /* FreeBSD doesn't have MAP_32BIT, use MAP_FIXED and assume
-         * 0x40000000 is free */
-        flags |= MAP_FIXED;
-        addr = (void *)0x40000000;
-        /* Cannot map more than that */
-        if (code_gen_buffer_size > (800 * 1024 * 1024))
-            code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc__) && HOST_LONG_BITS == 64
-        // Map the buffer below 2G, so we can use direct calls and branches
-        addr = (void *) 0x40000000UL;
-        if (code_gen_buffer_size > (512 * 1024 * 1024)) {
-            code_gen_buffer_size = (512 * 1024 * 1024);
-        }
-#endif
-        code_gen_buffer = mmap(addr, code_gen_buffer_size,
-                               PROT_WRITE | PROT_READ | PROT_EXEC, 
-                               flags, -1, 0);
-        if (code_gen_buffer == MAP_FAILED) {
-            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
-            exit(1);
-        }
+    if (tb_size > MAX_CODE_GEN_BUFFER_SIZE) {
+        tb_size = MAX_CODE_GEN_BUFFER_SIZE;
     }
+    code_gen_buffer_size = tb_size;
+    return tb_size;
+}
+
+#ifdef USE_STATIC_CODE_GEN_BUFFER
+static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
+    __attribute__((aligned(CODE_GEN_ALIGN)));
+
+static inline void *alloc_code_gen_buffer(void)
+{
+    map_exec(static_code_gen_buffer, code_gen_buffer_size);
+    return static_code_gen_buffer;
+}
+#elif defined(USE_MMAP)
+static inline void *alloc_code_gen_buffer(void)
+{
+    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+    uintptr_t start = 0;
+    void *buf;
+
+    /* Constrain the position of the buffer based on the host cpu.
+       Note that these addresses are chosen in concert with the
+       addresses assigned in the relevant linker script file.  */
+# if defined(__x86_64__) && defined(MAP_32BIT)
+    /* Force the memory down into low memory with the executable.
+       Leave the choice of exact location with the kernel.  */
+    flags |= MAP_32BIT;
+    /* Cannot expect to map more than 800MB in low memory.  */
+    if (code_gen_buffer_size > 800u * 1024 * 1024) {
+        code_gen_buffer_size = 800u * 1024 * 1024;
+    }
+# elif defined(__sparc__)
+    start = 0x40000000ul;
+# elif defined(__s390x__)
+    start = 0x90000000ul;
+# endif
+
+    buf = mmap((void *)start, code_gen_buffer_size,
+               PROT_WRITE | PROT_READ | PROT_EXEC, flags, -1, 0);
+    return buf == MAP_FAILED ? NULL : buf;
+}
 #else
-    code_gen_buffer = g_malloc(code_gen_buffer_size);
-    map_exec(code_gen_buffer, code_gen_buffer_size);
-#endif
-#endif /* !USE_STATIC_CODE_GEN_BUFFER */
+static inline void *alloc_code_gen_buffer(void)
+{
+    void *buf = g_malloc(code_gen_buffer_size);
+    if (buf) {
+        map_exec(buf, code_gen_buffer_size);
+    }
+    return buf;
+}
+#endif /* USE_STATIC_CODE_GEN_BUFFER, USE_MMAP */
+
+static inline void code_gen_alloc(size_t tb_size)
+{
+    code_gen_buffer_size = size_code_gen_buffer(tb_size);
+    code_gen_buffer = alloc_code_gen_buffer();
+    if (code_gen_buffer == NULL) {
+        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
+        exit(1);
+    }
+
     map_exec(code_gen_prologue, sizeof(code_gen_prologue));
     code_gen_buffer_max_size = code_gen_buffer_size -
         (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 2/4] exec: Don't make DEFAULT_CODE_GEN_BUFFER_SIZE too large
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer Richard Henderson
@ 2012-10-12 21:20         ` Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 3/4] exec: Do not use absolute address hints for code_gen_buffer with -fpie Richard Henderson
                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

For ARM we cap the buffer size to 16MB.  Do not allocate 32MB in that case.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/exec.c b/exec.c
index db735dd..386cc08 100644
--- a/exec.c
+++ b/exec.c
@@ -529,7 +529,11 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
 # define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 #endif
 
-#define DEFAULT_CODE_GEN_BUFFER_SIZE (32u * 1024 * 1024)
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32u * 1024 * 1024)
+
+#define DEFAULT_CODE_GEN_BUFFER_SIZE \
+  (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
+   ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
 
 static inline size_t size_code_gen_buffer(size_t tb_size)
 {
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 3/4] exec: Do not use absolute address hints for code_gen_buffer with -fpie
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 2/4] exec: Don't make DEFAULT_CODE_GEN_BUFFER_SIZE too large Richard Henderson
@ 2012-10-12 21:20         ` Richard Henderson
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 4/4] exec: Allocate code_gen_prologue from code_gen_buffer Richard Henderson
  2012-10-12 21:41         ` [Qemu-devel] [PATCH 5/4] exec: Make MIN_CODE_GEN_BUFFER_SIZE private to exec.c Richard Henderson
  4 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

The hard-coded addresses inside alloc_code_gen_buffer only make sense
if we're building an executable that will actually run at the address
we've put into the linker scripts.

When we're building with -fpie, the executable will run at some
random location chosen by the kernel.  We get better placement for
the code_gen_buffer if we allow the kernel to place the memory,
as it will tend to to place it near the executable, based on the
PROT_EXEC bit.

Since code_gen_prologue is always inside the executable, this effect
is easily seen at the end of most TB, with the exit_tb opcode, and
with any calls to helper functions.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/exec.c b/exec.c
index 386cc08..e5f8c30 100644
--- a/exec.c
+++ b/exec.c
@@ -578,7 +578,12 @@ static inline void *alloc_code_gen_buffer(void)
     /* Constrain the position of the buffer based on the host cpu.
        Note that these addresses are chosen in concert with the
        addresses assigned in the relevant linker script file.  */
-# if defined(__x86_64__) && defined(MAP_32BIT)
+# if defined(__PIE__) || defined(__PIC__)
+    /* Don't bother setting a preferred location if we're building
+       a position-independent executable.  We're more likely to get
+       an address near the main executable if we let the kernel
+       choose the address.  */
+# elif defined(__x86_64__) && defined(MAP_32BIT)
     /* Force the memory down into low memory with the executable.
        Leave the choice of exact location with the kernel.  */
     flags |= MAP_32BIT;
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 4/4] exec: Allocate code_gen_prologue from code_gen_buffer
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
                           ` (2 preceding siblings ...)
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 3/4] exec: Do not use absolute address hints for code_gen_buffer with -fpie Richard Henderson
@ 2012-10-12 21:20         ` Richard Henderson
  2012-10-12 21:41         ` [Qemu-devel] [PATCH 5/4] exec: Make MIN_CODE_GEN_BUFFER_SIZE private to exec.c Richard Henderson
  4 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

We had a hack for arm and sparc, allocating code_gen_prologue to a
special section.  Which, honestly does no good under certain cases.
We've already got limits on code_gen_buffer_size to ensure that all
TBs can use direct branches between themselves; reuse this limit to
ensure the prologue is also reachable.

As a bonus, we get to avoid marking a page of the main executable's
data segment as executable.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec.c    | 30 +++++++++++-------------------
 tcg/tcg.h |  2 +-
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/exec.c b/exec.c
index e5f8c30..958aebe 100644
--- a/exec.c
+++ b/exec.c
@@ -86,22 +86,7 @@ static int nb_tbs;
 /* any access to the tbs or the page table must use this lock */
 spinlock_t tb_lock = SPIN_LOCK_UNLOCKED;
 
-#if defined(__arm__) || defined(__sparc__)
-/* The prologue must be reachable with a direct jump. ARM and Sparc64
- have limited branch ranges (possibly also PPC) so place it in a
- section close to code segment. */
-#define code_gen_section                                \
-    __attribute__((__section__(".gen_code")))           \
-    __attribute__((aligned (32)))
-#elif defined(_WIN32) && !defined(_WIN64)
-#define code_gen_section                                \
-    __attribute__((aligned (16)))
-#else
-#define code_gen_section                                \
-    __attribute__((aligned (32)))
-#endif
-
-uint8_t code_gen_prologue[1024] code_gen_section;
+uint8_t *code_gen_prologue;
 static uint8_t *code_gen_buffer;
 static size_t code_gen_buffer_size;
 /* threshold to flush the translated code buffer */
@@ -221,7 +206,7 @@ static int tb_flush_count;
 static int tb_phys_invalidate_count;
 
 #ifdef _WIN32
-static void map_exec(void *addr, long size)
+static inline void map_exec(void *addr, long size)
 {
     DWORD old_protect;
     VirtualProtect(addr, size,
@@ -229,7 +214,7 @@ static void map_exec(void *addr, long size)
     
 }
 #else
-static void map_exec(void *addr, long size)
+static inline void map_exec(void *addr, long size)
 {
     unsigned long start, end, page_size;
     
@@ -621,7 +606,14 @@ static inline void code_gen_alloc(size_t tb_size)
         exit(1);
     }
 
-    map_exec(code_gen_prologue, sizeof(code_gen_prologue));
+    /* Steal room for the prologue at the end of the buffer.  This ensures
+       (via the MAX_CODE_GEN_BUFFER_SIZE limits above) that direct branches
+       from TB's to the prologue are going to be in range.  It also means
+       that we don't need to mark (additional) portions of the data segment
+       as executable.  */
+    code_gen_prologue = code_gen_buffer + code_gen_buffer_size - 1024;
+    code_gen_buffer_size -= 1024;
+
     code_gen_buffer_max_size = code_gen_buffer_size -
         (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
     code_gen_max_blocks = code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7bafe0e..45e94f5 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -616,7 +616,7 @@ TCGv_i64 tcg_const_i64(int64_t val);
 TCGv_i32 tcg_const_local_i32(int32_t val);
 TCGv_i64 tcg_const_local_i64(int64_t val);
 
-extern uint8_t code_gen_prologue[];
+extern uint8_t *code_gen_prologue;
 
 /* TCG targets may use a different definition of tcg_qemu_tb_exec. */
 #if !defined(tcg_qemu_tb_exec)
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 5/4] exec: Make MIN_CODE_GEN_BUFFER_SIZE private to exec.c
  2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
                           ` (3 preceding siblings ...)
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 4/4] exec: Allocate code_gen_prologue from code_gen_buffer Richard Henderson
@ 2012-10-12 21:41         ` Richard Henderson
  4 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-12 21:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl

It is used nowhere else, and the corresponding MAX_CODE_GEN_BUFFER_SIZE
also lives there.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 exec-all.h | 2 --
 exec.c     | 4 ++++
 2 files changed, 4 insertions(+), 2 deletions(-)

Dang it, I knew there was something else I was intending to include
in the patch set that I sent a moment ago...


r~


diff --git a/exec-all.h b/exec-all.h
index 6516da0..7f29820 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -121,8 +121,6 @@ static inline void tlb_flush(CPUArchState *env, int flush_global)
 #define CODE_GEN_PHYS_HASH_BITS     15
 #define CODE_GEN_PHYS_HASH_SIZE     (1 << CODE_GEN_PHYS_HASH_BITS)
 
-#define MIN_CODE_GEN_BUFFER_SIZE     (1024 * 1024)
-
 /* estimated block size for TB allocation */
 /* XXX: use a per code average code fragment size and modulate it
    according to the host CPU */
diff --git a/exec.c b/exec.c
index 958aebe..096cc56 100644
--- a/exec.c
+++ b/exec.c
@@ -498,6 +498,10 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
 # define USE_MMAP
 #endif
 
+/* Minimum size of the code gen buffer.  This number is randomly chosen,
+   but not so small that we can't have a fair number of TB's live.  */
+#define MIN_CODE_GEN_BUFFER_SIZE     (1024u * 1024)
+
 /* Maximum size of the code gen buffer we'd like to use.  Unless otherwise
    indicated, this is constrained by the range of direct branches on the
    host cpu, as used by the TCG implementation of goto_tb.  */
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer
  2012-10-12 21:20         ` [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer Richard Henderson
@ 2012-10-13 13:33           ` Blue Swirl
  2012-10-15 20:16             ` Richard Henderson
  0 siblings, 1 reply; 12+ messages in thread
From: Blue Swirl @ 2012-10-13 13:33 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Oct 12, 2012 at 9:20 PM, Richard Henderson <rth@twiddle.net> wrote:
> It now consists of:
>
> A macro definition of MAX_CODE_GEN_BUFFER_SIZE with host-specific values,
>
> A function size_code_gen_buffer that applies most of the reasoning for
> choosing a buffer size,
>
> Three variations of a function alloc_code_gen_buffer that contain all
> of the logic for allocating executable memory via a given allocation
> mechanism.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  exec.c | 193 ++++++++++++++++++++++++++++++++++-------------------------------
>  1 file changed, 102 insertions(+), 91 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index 7899042..db735dd 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -103,9 +103,9 @@ spinlock_t tb_lock = SPIN_LOCK_UNLOCKED;
>
>  uint8_t code_gen_prologue[1024] code_gen_section;
>  static uint8_t *code_gen_buffer;
> -static unsigned long code_gen_buffer_size;
> +static size_t code_gen_buffer_size;
>  /* threshold to flush the translated code buffer */
> -static unsigned long code_gen_buffer_max_size;
> +static size_t code_gen_buffer_max_size;

This breaks build on i386:
/src/qemu/exec.c: In function 'dump_exec_info':
/src/qemu/exec.c:4208: error: format '%ld' expects type 'long int',
but argument 4 has type 'size_t'

>  static uint8_t *code_gen_ptr;
>
>  #if !defined(CONFIG_USER_ONLY)
> @@ -497,110 +497,121 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
>  #define mmap_unlock() do { } while(0)
>  #endif
>
> -#define DEFAULT_CODE_GEN_BUFFER_SIZE (32 * 1024 * 1024)
> -
>  #if defined(CONFIG_USER_ONLY)
>  /* Currently it is not recommended to allocate big chunks of data in
> -   user mode. It will change when a dedicated libc will be used */
> +   user mode. It will change when a dedicated libc will be used.  */
> +/* ??? 64-bit hosts ought to have no problem mmaping data outside the
> +   region in which the guest needs to run.  Revisit this.  */
>  #define USE_STATIC_CODE_GEN_BUFFER
>  #endif
>
> -#ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> -               __attribute__((aligned (CODE_GEN_ALIGN)));
> +/* ??? Should configure for this, not list operating systems here.  */
> +#if (defined(__linux__) \
> +    || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
> +    || defined(__DragonFly__) || defined(__OpenBSD__) \
> +    || defined(__NetBSD__))
> +# define USE_MMAP
>  #endif
>
> -static void code_gen_alloc(unsigned long tb_size)
> +/* Maximum size of the code gen buffer we'd like to use.  Unless otherwise
> +   indicated, this is constrained by the range of direct branches on the
> +   host cpu, as used by the TCG implementation of goto_tb.  */
> +#if defined(__x86_64__)
> +# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
> +#elif defined(__sparc__)
> +# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
> +#elif defined(__arm__)
> +# define MAX_CODE_GEN_BUFFER_SIZE  (16u * 1024 * 1024)
> +#elif defined(__s390x__)
> +  /* We have a +- 4GB range on the branches; leave some slop.  */
> +# define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
> +#else
> +# define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
> +#endif
> +
> +#define DEFAULT_CODE_GEN_BUFFER_SIZE (32u * 1024 * 1024)
> +
> +static inline size_t size_code_gen_buffer(size_t tb_size)
>  {
> +    /* Size the buffer.  */
> +    if (tb_size == 0) {
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -    code_gen_buffer = static_code_gen_buffer;
> -    code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
> -    map_exec(code_gen_buffer, code_gen_buffer_size);
> -#else
> -    code_gen_buffer_size = tb_size;
> -    if (code_gen_buffer_size == 0) {
> -#if defined(CONFIG_USER_ONLY)
> -        code_gen_buffer_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
> +        tb_size = DEFAULT_CODE_GEN_BUFFER_SIZE;
>  #else
> -        /* XXX: needs adjustments */
> -        code_gen_buffer_size = (unsigned long)(ram_size / 4);
> +        /* ??? Needs adjustments.  */
> +        /* ??? If we relax the requirement that CONFIG_USER_ONLY use the
> +           static buffer, we could size this on RESERVED_VA, on the text
> +           segment size of the executable, or continue to use the default.  */
> +        tb_size = (unsigned long)(ram_size / 4);
>  #endif
>      }
> -    if (code_gen_buffer_size < MIN_CODE_GEN_BUFFER_SIZE)
> -        code_gen_buffer_size = MIN_CODE_GEN_BUFFER_SIZE;
> -    /* The code gen buffer location may have constraints depending on
> -       the host cpu and OS */
> -#if defined(__linux__)
> -    {
> -        int flags;
> -        void *start = NULL;
> -
> -        flags = MAP_PRIVATE | MAP_ANONYMOUS;
> -#if defined(__x86_64__)
> -        flags |= MAP_32BIT;
> -        /* Cannot map more than that */
> -        if (code_gen_buffer_size > (800 * 1024 * 1024))
> -            code_gen_buffer_size = (800 * 1024 * 1024);
> -#elif defined(__sparc__) && HOST_LONG_BITS == 64
> -        // Map the buffer below 2G, so we can use direct calls and branches
> -        start = (void *) 0x40000000UL;
> -        if (code_gen_buffer_size > (512 * 1024 * 1024))
> -            code_gen_buffer_size = (512 * 1024 * 1024);
> -#elif defined(__arm__)
> -        /* Keep the buffer no bigger than 16MB to branch between blocks */
> -        if (code_gen_buffer_size > 16 * 1024 * 1024)
> -            code_gen_buffer_size = 16 * 1024 * 1024;
> -#elif defined(__s390x__)
> -        /* Map the buffer so that we can use direct calls and branches.  */
> -        /* We have a +- 4GB range on the branches; leave some slop.  */
> -        if (code_gen_buffer_size > (3ul * 1024 * 1024 * 1024)) {
> -            code_gen_buffer_size = 3ul * 1024 * 1024 * 1024;
> -        }
> -        start = (void *)0x90000000UL;
> -#endif
> -        code_gen_buffer = mmap(start, code_gen_buffer_size,
> -                               PROT_WRITE | PROT_READ | PROT_EXEC,
> -                               flags, -1, 0);
> -        if (code_gen_buffer == MAP_FAILED) {
> -            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> -            exit(1);
> -        }
> +    if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
> +        tb_size = MIN_CODE_GEN_BUFFER_SIZE;
>      }
> -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) \
> -    || defined(__DragonFly__) || defined(__OpenBSD__) \
> -    || defined(__NetBSD__)
> -    {
> -        int flags;
> -        void *addr = NULL;
> -        flags = MAP_PRIVATE | MAP_ANONYMOUS;
> -#if defined(__x86_64__)
> -        /* FreeBSD doesn't have MAP_32BIT, use MAP_FIXED and assume
> -         * 0x40000000 is free */
> -        flags |= MAP_FIXED;
> -        addr = (void *)0x40000000;
> -        /* Cannot map more than that */
> -        if (code_gen_buffer_size > (800 * 1024 * 1024))
> -            code_gen_buffer_size = (800 * 1024 * 1024);
> -#elif defined(__sparc__) && HOST_LONG_BITS == 64
> -        // Map the buffer below 2G, so we can use direct calls and branches
> -        addr = (void *) 0x40000000UL;
> -        if (code_gen_buffer_size > (512 * 1024 * 1024)) {
> -            code_gen_buffer_size = (512 * 1024 * 1024);
> -        }
> -#endif
> -        code_gen_buffer = mmap(addr, code_gen_buffer_size,
> -                               PROT_WRITE | PROT_READ | PROT_EXEC,
> -                               flags, -1, 0);
> -        if (code_gen_buffer == MAP_FAILED) {
> -            fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> -            exit(1);
> -        }
> +    if (tb_size > MAX_CODE_GEN_BUFFER_SIZE) {
> +        tb_size = MAX_CODE_GEN_BUFFER_SIZE;
>      }
> +    code_gen_buffer_size = tb_size;
> +    return tb_size;
> +}
> +
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    map_exec(static_code_gen_buffer, code_gen_buffer_size);
> +    return static_code_gen_buffer;
> +}
> +#elif defined(USE_MMAP)
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +    uintptr_t start = 0;
> +    void *buf;
> +
> +    /* Constrain the position of the buffer based on the host cpu.
> +       Note that these addresses are chosen in concert with the
> +       addresses assigned in the relevant linker script file.  */
> +# if defined(__x86_64__) && defined(MAP_32BIT)
> +    /* Force the memory down into low memory with the executable.
> +       Leave the choice of exact location with the kernel.  */
> +    flags |= MAP_32BIT;
> +    /* Cannot expect to map more than 800MB in low memory.  */
> +    if (code_gen_buffer_size > 800u * 1024 * 1024) {
> +        code_gen_buffer_size = 800u * 1024 * 1024;
> +    }
> +# elif defined(__sparc__)
> +    start = 0x40000000ul;
> +# elif defined(__s390x__)
> +    start = 0x90000000ul;
> +# endif
> +
> +    buf = mmap((void *)start, code_gen_buffer_size,
> +               PROT_WRITE | PROT_READ | PROT_EXEC, flags, -1, 0);
> +    return buf == MAP_FAILED ? NULL : buf;
> +}
>  #else
> -    code_gen_buffer = g_malloc(code_gen_buffer_size);
> -    map_exec(code_gen_buffer, code_gen_buffer_size);
> -#endif
> -#endif /* !USE_STATIC_CODE_GEN_BUFFER */
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    void *buf = g_malloc(code_gen_buffer_size);
> +    if (buf) {
> +        map_exec(buf, code_gen_buffer_size);
> +    }
> +    return buf;
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER, USE_MMAP */
> +
> +static inline void code_gen_alloc(size_t tb_size)
> +{
> +    code_gen_buffer_size = size_code_gen_buffer(tb_size);
> +    code_gen_buffer = alloc_code_gen_buffer();
> +    if (code_gen_buffer == NULL) {
> +        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
> +        exit(1);
> +    }
> +
>      map_exec(code_gen_prologue, sizeof(code_gen_prologue));
>      code_gen_buffer_max_size = code_gen_buffer_size -
>          (TCG_MAX_OP_SIZE * OPC_BUF_SIZE);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer
  2012-10-13 13:33           ` Blue Swirl
@ 2012-10-15 20:16             ` Richard Henderson
  0 siblings, 0 replies; 12+ messages in thread
From: Richard Henderson @ 2012-10-15 20:16 UTC (permalink / raw)
  To: Blue Swirl; +Cc: qemu-devel

On 2012-10-13 23:33, Blue Swirl wrote:
> /src/qemu/exec.c:4208: error: format '%ld' expects type 'long int',
> but argument 4 has type 'size_t'

Dang it.  And here I thought I was helping get the type right for win64.
That printf format should be changed to %zd...


r~

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2012-10-15 20:17 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-04 21:31 [Qemu-devel] [PATCH] exec: Don't request an address for code_gen_buffer if -fpie Richard Henderson
2012-10-07 16:34 ` Blue Swirl
2012-10-07 19:20   ` Richard Henderson
2012-10-07 19:40     ` Blue Swirl
2012-10-12 21:20       ` [Qemu-devel] [PATCH v3 0/4] Better allocation of code_gen_buffer with -fpie Richard Henderson
2012-10-12 21:20         ` [Qemu-devel] [PATCH 1/4] exec: Split up and tidy code_gen_buffer Richard Henderson
2012-10-13 13:33           ` Blue Swirl
2012-10-15 20:16             ` Richard Henderson
2012-10-12 21:20         ` [Qemu-devel] [PATCH 2/4] exec: Don't make DEFAULT_CODE_GEN_BUFFER_SIZE too large Richard Henderson
2012-10-12 21:20         ` [Qemu-devel] [PATCH 3/4] exec: Do not use absolute address hints for code_gen_buffer with -fpie Richard Henderson
2012-10-12 21:20         ` [Qemu-devel] [PATCH 4/4] exec: Allocate code_gen_prologue from code_gen_buffer Richard Henderson
2012-10-12 21:41         ` [Qemu-devel] [PATCH 5/4] exec: Make MIN_CODE_GEN_BUFFER_SIZE private to exec.c Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).