All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/boot: Move/copy sections more efficiently
@ 2015-09-24  8:14 Andrew Cooper
  2015-09-24  9:52 ` Jan Beulich
  0 siblings, 1 reply; 3+ messages in thread
From: Andrew Cooper @ 2015-09-24  8:14 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Jan Beulich

Both the trampoline copy and BSS initialisation can be performed more
efficiently by using 4-byte variants of the string operations.

The ALIGN(STACK_SIZE) actually belongs with .bss.stack_aligned, but
__init_end still needs page alignment because of the init sections being
freed and returned to the domheap after boot.

A note concerning Intel ERMSB, which indicates that byte MOVS are
efficient:  ERMSB and non-aliased aligned MOVSD scale with identical
complexity albeit ERMSB doesn't have a small setup overhead (which falls
into the noise, given the length of the REP).  On non-ERMSB systems
however, MOVSD scales 4 times better than MOVSB.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Jan Beulich <JBeulich@suse.com>

---
v2: Better patch description.  No functional change.
---
 xen/arch/x86/boot/head.S |    9 +++++----
 xen/arch/x86/xen.lds.S   |    5 ++++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index f63b349..2b38048 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -128,7 +128,8 @@ __start:
         mov     $sym_phys(__bss_end),%ecx
         sub     %edi,%ecx
         xor     %eax,%eax
-        rep     stosb
+        shr     $2,%ecx
+        rep     stosl
 
         /* Interrogate CPU extended features via CPUID. */
         mov     $0x80000000,%eax
@@ -197,8 +198,8 @@ __start:
 
         /* Copy bootstrap trampoline to low memory, below 1MB. */
         mov     $sym_phys(trampoline_start),%esi
-        mov     $trampoline_end - trampoline_start,%ecx
-        rep     movsb
+        mov     $((trampoline_end - trampoline_start) / 4),%ecx
+        rep     movsl
 
         /* Jump into the relocated trampoline. */
         lret
@@ -210,6 +211,6 @@ reloc:
 
 ENTRY(trampoline_start)
 #include "trampoline.S"
-GLOBAL(trampoline_end)
+ENTRY(trampoline_end)
 
 #include "x86_64.S"
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 6553cff..c1180b2 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -158,11 +158,13 @@ SECTIONS
        __xsm_initcall_start = .;
        *(.xsm_initcall.init)
        __xsm_initcall_end = .;
+
+       . = ALIGN(PAGE_SIZE);
   } :text
-  . = ALIGN(STACK_SIZE);
   __init_end = .;
 
   .bss : {                     /* BSS */
+       . = ALIGN(STACK_SIZE);
        __bss_start = .;
        *(.bss.stack_aligned)
        . = ALIGN(PAGE_SIZE);
@@ -175,6 +177,7 @@ SECTIONS
        *(.bss.percpu.read_mostly)
        . = ALIGN(SMP_CACHE_BYTES);
        __per_cpu_data_end = .;
+       . = ALIGN(8);
        __bss_end = .;
   } :text
   _end = . ;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-09-29 10:26 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-24  8:14 [PATCH] x86/boot: Move/copy sections more efficiently Andrew Cooper
2015-09-24  9:52 ` Jan Beulich
2015-09-29 10:26   ` Andrew Cooper

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.