From mboxrd@z Thu Jan 1 00:00:00 1970 From: Feng Wu Subject: [PATCH 2/3] x86: Port the basic alternative mechanism from Linux to Xen Date: Mon, 26 May 2014 15:27:52 +0800 Message-ID: <1401089273-16425-3-git-send-email-feng.wu@intel.com> References: <1401089273-16425-1-git-send-email-feng.wu@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1401089273-16425-1-git-send-email-feng.wu@intel.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: xen-devel@lists.xen.org Cc: keir.xen@gmail.com, Feng Wu , JBeulich@suse.com List-Id: xen-devel@lists.xenproject.org This patch ports the basic alternative mechanism from Linux to Xen. With this mechanism, we can patch code based on the CPU features. Signed-off-by: Feng Wu --- xen/arch/x86/Makefile | 1 + xen/arch/x86/alternative.c | 223 ++++++++++++++++++++++++++++++++++++++ xen/arch/x86/setup.c | 5 + xen/arch/x86/traps.c | 15 +++ xen/arch/x86/xen.lds.S | 22 ++++ xen/include/asm-x86/alternative.h | 77 +++++++++++++ xen/include/asm-x86/traps.h | 2 + 7 files changed, 345 insertions(+) create mode 100644 xen/arch/x86/alternative.c create mode 100644 xen/include/asm-x86/alternative.h diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index d502bdf..3734884 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -58,6 +58,7 @@ obj-y += crash.o obj-y += tboot.o obj-y += hpet.o obj-y += xstate.o +obj-y += alternative.o obj-$(crash_debug) += gdbstub.o diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c new file mode 100644 index 0000000..af8864e --- /dev/null +++ b/xen/arch/x86/alternative.c @@ -0,0 +1,223 @@ +/****************************************************************************** + * alternative.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_PATCH_LEN (255-1) + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + +#ifdef K8_NOP1 +static const unsigned char k8nops[] = { + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8 +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 +}; +#endif + +#ifdef P6_NOP1 +static const unsigned char p6nops[] = { + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8 +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+1] = { + NULL, + p6nops, + p6nops + 1, + p6nops + 1 + 2, + p6nops + 1 + 2 + 3, + p6nops + 1 + 2 + 3 + 4, + p6nops + 1 + 2 + 3 + 4 + 5, + p6nops + 1 + 2 + 3 + 4 + 5 + 6, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 +}; +#endif + +const unsigned char * const *ideal_nops = p6_nops; + +void __init arch_init_ideal_nops(void) +{ + switch (boot_cpu_data.x86_vendor) + { + case X86_VENDOR_INTEL: + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if ( boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30 ) + ideal_nops = k8_nops; + else + ideal_nops = p6_nops; + break; + default: + ideal_nops = k8_nops; + } +} + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init add_nops(void *insns, unsigned int len) +{ + while ( len > 0 ) + { + unsigned int noplen = len; + if ( noplen > ASM_NOP_MAX ) + noplen = ASM_NOP_MAX; + memcpy(insns, ideal_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + +/* + * text_poke_early - Update instructions on a live kernel at boot time + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these + * instructions. And on the local CPU you need to be protected again NMI or MCE + * handlers seeing an inconsistent instruction while you patch. + */ +static void *__init text_poke_early(void *addr, const void *opcode, size_t len) +{ + int tmp; + unsigned long flags; + local_irq_save(flags); + memcpy(addr, opcode, len); + /* + * CPUID is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. + */ + asm volatile("cpuid" + : "=a" (tmp) + : "0" (1) + : "ebx", "ecx", "edx", "memory"); + + local_irq_restore(flags); + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + return addr; +} + +/* + * Replace instructions with better alternatives for this CPU type. + * This runs before SMP is initialized to avoid SMP problems with + * self modifying code. This implies that asymmetric systems where + * APs have less capabilities than the boot processor are not handled. + * Tough. Make sure you disable such features by hand. + */ + +static void __init apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + + printk("%s: alt table %p -> %p\n", __func__, start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ + for ( a = start; a < end; a++ ) + { + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + BUG_ON(a->replacementlen > a->instrlen); + BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS * 32); + if ( !boot_cpu_has(a->cpuid) ) + continue; + + memcpy(insnbuf, replacement, a->replacementlen); + + /* 0xe8 is a relative jump; fix the offset. */ + if ( *insnbuf == 0xe8 && a->replacementlen == 5 ) + *(s32 *)(insnbuf + 1) += replacement - instr; + + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + text_poke_early(instr, insnbuf, a->instrlen); + } +} + +void __init alternative_instructions(void) +{ + /* + * The patching is not fully atomic, so try to avoid local interruptions + * that might execute the to be patched code. + * Other CPUs are not running. + */ + stop_nmi(); + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ + + apply_alternatives(__alt_instructions, __alt_instructions_end); + restart_nmi(); +} diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 508649d..7635868 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include /* opt_nosmp: If true, secondary processors are ignored. */ static bool_t __initdata opt_nosmp; @@ -1288,6 +1290,9 @@ void __init noreturn __start_xen(unsigned long mbi_p) if ( cpu_has_fsgsbase ) set_in_cr4(X86_CR4_FSGSBASE); + arch_init_ideal_nops(); + alternative_instructions(); + local_irq_enable(); pt_pci_init(); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 1722912..4108c8b 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -111,6 +111,8 @@ integer_param("debug_stack_lines", debug_stack_lines); static bool_t __devinitdata opt_ler; boolean_param("ler", opt_ler); +static int ignore_nmis; + #define stack_words_per_line 4 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) @@ -3303,6 +3305,9 @@ void do_nmi(struct cpu_user_regs *regs) ++nmi_count(cpu); + if ( ignore_nmis ) + return; + if ( nmi_callback(regs, cpu) ) return; @@ -3322,6 +3327,16 @@ void do_nmi(struct cpu_user_regs *regs) } } +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} + void set_nmi_callback(nmi_callback_t callback) { nmi_callback = callback; diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S index 17db361..64bdb18 100644 --- a/xen/arch/x86/xen.lds.S +++ b/xen/arch/x86/xen.lds.S @@ -147,6 +147,28 @@ SECTIONS . = ALIGN(STACK_SIZE); __init_end = .; + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + */ + . = ALIGN(8); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + */ + .altinstr_replacement : { + *(.altinstr_replacement) + } + . = ALIGN(STACK_SIZE); + .bss : { /* BSS */ __bss_start = .; *(.bss.stack_aligned) diff --git a/xen/include/asm-x86/alternative.h b/xen/include/asm-x86/alternative.h new file mode 100644 index 0000000..18c0e85 --- /dev/null +++ b/xen/include/asm-x86/alternative.h @@ -0,0 +1,77 @@ +#ifndef __X86_ALTERNATIVE_H__ +#define __X86_ALTERNATIVE_H__ + +#ifdef __ASSEMBLY__ +.macro altinstruction_entry orig alt feature orig_len alt_len + .long \orig - . + .long \alt - . + .word \feature + .byte \orig_len + .byte \alt_len +.endm +#else +#include + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction, <= instrlen */ +}; + +extern void alternative_instructions(void); + +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, feature) \ + OLDINSTR(oldinstr) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".popsection\n" \ + ".pushsection .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".popsection" + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * length of oldinstr must be longer or equal the length of newinstr + * It can be padded with nops as needed. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, newinstr, feature) \ + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_ALTERNATIVE_H__ */ + diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h index 556b133..9b1dc2c 100644 --- a/xen/include/asm-x86/traps.h +++ b/xen/include/asm-x86/traps.h @@ -54,4 +54,6 @@ uint32_t guest_io_read(unsigned int port, unsigned int bytes, void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, struct vcpu *, struct cpu_user_regs *); +extern void stop_nmi(void); +extern void restart_nmi(void); #endif /* ASM_TRAP_H */ -- 1.8.3.1