From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: Re: [PATCH v2 4/5] x86: Port the basic alternative mechanism from Linux to Xen Date: Thu, 29 May 2014 09:55:58 +0100 Message-ID: <5386F61E.7030408@citrix.com> References: <1401341669-5237-1-git-send-email-feng.wu@intel.com> <1401341669-5237-5-git-send-email-feng.wu@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii"; Format="flowed" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1401341669-5237-5-git-send-email-feng.wu@intel.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Feng Wu , xen-devel@lists.xen.org Cc: tim@xen.org, keir.xen@gmail.com, stefano.stabellini@citrix.com, ian.campbell@citrix.com, JBeulich@suse.com List-Id: xen-devel@lists.xenproject.org On 29/05/2014 06:34, Feng Wu wrote: > This patch ports the basic alternative mechanism from Linux to Xen. > With this mechanism, we can patch code based on the CPU features. > > Signed-off-by: Feng Wu > --- > xen/arch/x86/Makefile | 1 + > xen/arch/x86/alternative.c | 213 ++++++++++++++++++++++++++++++++++++++ > xen/arch/x86/setup.c | 3 + > xen/arch/x86/xen.lds.S | 15 +++ > xen/include/asm-x86/alternative.h | 78 ++++++++++++++ > 5 files changed, 310 insertions(+) > create mode 100644 xen/arch/x86/alternative.c > create mode 100644 xen/include/asm-x86/alternative.h > > diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile > index d502bdf..3734884 100644 > --- a/xen/arch/x86/Makefile > +++ b/xen/arch/x86/Makefile > @@ -58,6 +58,7 @@ obj-y += crash.o > obj-y += tboot.o > obj-y += hpet.o > obj-y += xstate.o > +obj-y += alternative.o > > obj-$(crash_debug) += gdbstub.o > > diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c > new file mode 100644 > index 0000000..3dbc811 > --- /dev/null > +++ b/xen/arch/x86/alternative.c > @@ -0,0 +1,213 @@ > +/****************************************************************************** > + * alternative.c > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#define MAX_PATCH_LEN (255-1) > + > +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; > + > +#ifdef K8_NOP1 > +static const unsigned char k8nops[] __initconst = { > + K8_NOP1, > + K8_NOP2, > + K8_NOP3, > + K8_NOP4, > + K8_NOP5, > + K8_NOP6, > + K8_NOP7, > + K8_NOP8 > +}; > +static const unsigned char * const k8_nops[ASM_NOP_MAX+1] __initconst = { > + NULL, > + k8nops, > + k8nops + 1, > + k8nops + 1 + 2, > + k8nops + 1 + 2 + 3, > + k8nops + 1 + 2 + 3 + 4, > + k8nops + 1 + 2 + 3 + 4 + 5, > + k8nops + 1 + 2 + 3 + 4 + 5 + 6, > + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 > +}; > +#endif > + > +#ifdef P6_NOP1 > +static const unsigned char p6nops[] __initconst = { > + P6_NOP1, > + P6_NOP2, > + P6_NOP3, > + P6_NOP4, > + P6_NOP5, > + P6_NOP6, > + P6_NOP7, > + P6_NOP8 > +}; > +static const unsigned char * const p6_nops[ASM_NOP_MAX+1] __initconst = { > + NULL, > + p6nops, > + p6nops + 1, > + p6nops + 1 + 2, > + p6nops + 1 + 2 + 3, > + p6nops + 1 + 2 + 3 + 4, > + p6nops + 1 + 2 + 3 + 4 + 5, > + p6nops + 1 + 2 + 3 + 4 + 5 + 6, > + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 > +}; > +#endif > + > +static const unsigned char * const *ideal_nops __initdata = k8_nops; > + > +static int __init mask_nmi_callback(struct cpu_user_regs *regs, int cpu) > +{ > + return 1; > +} > + > +static void __init arch_init_ideal_nops(void) > +{ > + /* > + * Due to a decoder implementation quirk, some > + * specific Intel CPUs actually perform better with > + * the "k8_nops" than with the SDM-recommended NOPs. > + */ > + if ( (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && > + !(boot_cpu_data.x86 == 6 && > + boot_cpu_data.x86_model >= 0x0f && > + boot_cpu_data.x86_model != 0x1c && > + boot_cpu_data.x86_model != 0x26 && > + boot_cpu_data.x86_model != 0x27 && > + boot_cpu_data.x86_model < 0x30) ) > + ideal_nops = p6_nops; > +} > + > +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ > +static void __init add_nops(void *insns, unsigned int len) > +{ > + while ( len > 0 ) > + { > + unsigned int noplen = len; > + if ( noplen > ASM_NOP_MAX ) > + noplen = ASM_NOP_MAX; > + memcpy(insns, ideal_nops[noplen], noplen); > + insns += noplen; > + len -= noplen; > + } > +} > + > +/* > + * text_poke_early - Update instructions on a live kernel at boot time > + * @addr: address to modify > + * @opcode: source of the copy > + * @len: length to copy > + * > + * When you use this code to patch more than one byte of an instruction > + * you need to make sure that other CPUs cannot execute this code in parallel. > + * Also no thread must be currently preempted in the middle of these > + * instructions. And on the local CPU you need to be protected again NMI or MCE > + * handlers seeing an inconsistent instruction while you patch. > + */ > +static void *__init text_poke_early(void *addr, const void *opcode, size_t len) > +{ > + unsigned long flags; > + > + local_irq_save(flags); > + memcpy(addr, opcode, len); > + sync_core(); > + local_irq_restore(flags); > + > + return addr; > +} > + > +/* > + * Replace instructions with better alternatives for this CPU type. > + * This runs before SMP is initialized to avoid SMP problems with > + * self modifying code. This implies that asymmetric systems where > + * APs have less capabilities than the boot processor are not handled. > + * Tough. Make sure you disable such features by hand. > + */ > + Excess newline > +static void __init apply_alternatives(struct alt_instr *start, struct alt_instr *end) > +{ > + struct alt_instr *a; > + u8 *instr, *replacement; > + u8 insnbuf[MAX_PATCH_LEN]; > + > + printk(KERN_INFO "alt table %p -> %p\n", start, end); > + > + /* > + * The scan order should be from start to end. A later scanned > + * alternative code can overwrite a previous scanned alternative code. > + * Some kernel functions (e.g. memcpy, memset, etc) use this order to > + * patch code. > + * > + * So be careful if you want to change the scan order to any other > + * order. > + */ > + for ( a = start; a < end; a++ ) > + { > + instr = (u8 *)&a->instr_offset + a->instr_offset; > + replacement = (u8 *)&a->repl_offset + a->repl_offset; > + BUG_ON(a->replacementlen > a->instrlen); > + BUG_ON(a->instrlen > sizeof(insnbuf)); > + BUG_ON(a->cpuid >= NCAPINTS * 32); > + if ( !boot_cpu_has(a->cpuid) ) > + continue; > + > + memcpy(insnbuf, replacement, a->replacementlen); > + > + /* 0xe8/0xe9 is a relative jump; fix the offset. */ > + if ( (*insnbuf & 0xfe) == 0xe8 && a->replacementlen == 5 ) > + *(s32 *)(insnbuf + 1) += replacement - instr; > + > + add_nops(insnbuf + a->replacementlen, > + a->instrlen - a->replacementlen); > + text_poke_early(instr, insnbuf, a->instrlen); > + } > +} > + > +void __init alternative_instructions(void) > +{ > + nmi_callback_t saved_nmi_callback; > + > + arch_init_ideal_nops(); > + > + /* > + * The patching is not fully atomic, so try to avoid local interruptions > + * that might execute the to be patched code. > + * Other CPUs are not running. > + */ > + saved_nmi_callback = set_nmi_callback(mask_nmi_callback); Newline here > + /* > + * Don't stop machine check exceptions while patching. > + * MCEs only happen when something got corrupted and in this > + * case we must do something about the corruption. > + * Ignoring it is worse than a unlikely patching race. > + * Also machine checks tend to be broadcast and if one CPU > + * goes into machine check the others follow quickly, so we don't > + * expect a machine check to cause undue problems during to code > + * patching. > + */ > + but not here. > + apply_alternatives(__alt_instructions, __alt_instructions_end); Possibly also here. > + set_nmi_callback(saved_nmi_callback); > +} > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c > index 508649d..d16453a 100644 > --- a/xen/arch/x86/setup.c > +++ b/xen/arch/x86/setup.c > @@ -48,6 +48,7 @@ > #include > #include > #include > +#include > > /* opt_nosmp: If true, secondary processors are ignored. */ > static bool_t __initdata opt_nosmp; > @@ -1288,6 +1289,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) > if ( cpu_has_fsgsbase ) > set_in_cr4(X86_CR4_FSGSBASE); > > + alternative_instructions(); > + Given this ordering, it might be cleaner to have an ASSERT(!local_irq_enabled()) in the top of alternative_instructions(), and forgo the local_irq_save/restore() in text_poke_early(). If you can move this higher up before enabling MCEs in CR4, it might be slightly more resilient. ~Andrew > local_irq_enable(); > > pt_pci_init(); > diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S > index 17db361..d4b1f1a 100644 > --- a/xen/arch/x86/xen.lds.S > +++ b/xen/arch/x86/xen.lds.S > @@ -105,6 +105,12 @@ SECTIONS > .init.text : { > _sinittext = .; > *(.init.text) > + /* > + * Here are the replacement instructions. The linker sticks them > + * as binary blobs. The .altinstructions has enough data to get > + * the address and the length of them to patch the kernel safely. > + */ > + *(.altinstr_replacement) > _einittext = .; > } :text > .init.data : { > @@ -120,6 +126,15 @@ SECTIONS > __trampoline_seg_start = .; > *(.trampoline_seg) > __trampoline_seg_stop = .; > + /* > + * struct alt_inst entries. From the header (alternative.h): > + * "Alternative instructions for different CPU types or capabilities" > + * Think locking instructions on spinlocks. > + */ > + . = ALIGN(8); > + __alt_instructions = .; > + *(.altinstructions) > + __alt_instructions_end = .; > > . = ALIGN(8); > __ctors_start = .; > diff --git a/xen/include/asm-x86/alternative.h b/xen/include/asm-x86/alternative.h > new file mode 100644 > index 0000000..55a6604 > --- /dev/null > +++ b/xen/include/asm-x86/alternative.h > @@ -0,0 +1,78 @@ > +#ifndef __X86_ALTERNATIVE_H__ > +#define __X86_ALTERNATIVE_H__ > + > +#include > + > +#ifdef __ASSEMBLY__ > +.macro altinstruction_entry orig alt feature orig_len alt_len > + .long \orig - . > + .long \alt - . > + .word \feature > + .byte \orig_len > + .byte \alt_len > +.endm > +#else > +#include > + > +struct alt_instr { > + s32 instr_offset; /* original instruction */ > + s32 repl_offset; /* offset to replacement instruction */ > + u16 cpuid; /* cpuid bit set for replacement */ > + u8 instrlen; /* length of original instruction */ > + u8 replacementlen; /* length of new instruction, <= instrlen */ > +}; > + > +extern void alternative_instructions(void); > + > +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" > + > +#define b_replacement(number) "663"#number > +#define e_replacement(number) "664"#number > + > +#define alt_slen "662b-661b" > +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" > + > +#define ALTINSTR_ENTRY(feature, number) \ > + " .long 661b - .\n" /* label */ \ > + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ > + " .word " STR(feature) "\n" /* feature bit */ \ > + " .byte " alt_slen "\n" /* source len */ \ > + " .byte " alt_rlen(number) "\n" /* replacement len */ > + > +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ > + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" > + > +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ > + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" > + > +/* alternative assembly primitive: */ > +#define ALTERNATIVE(oldinstr, newinstr, feature) \ > + OLDINSTR(oldinstr) \ > + ".pushsection .altinstructions,\"a\"\n" \ > + ALTINSTR_ENTRY(feature, 1) \ > + ".popsection\n" \ > + ".pushsection .discard,\"aw\",@progbits\n" \ > + DISCARD_ENTRY(1) \ > + ".popsection\n" \ > + ".pushsection .altinstr_replacement, \"ax\"\n" \ > + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ > + ".popsection" > + > +/* > + * Alternative instructions for different CPU types or capabilities. > + * > + * This allows to use optimized instructions even on generic binary > + * kernels. > + * > + * length of oldinstr must be longer or equal the length of newinstr > + * It can be padded with nops as needed. > + * > + * For non barrier like inlines please define new variants > + * without volatile and memory clobber. > + */ > +#define alternative(oldinstr, newinstr, feature) \ > + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") > + > +#endif /* __ASSEMBLY__ */ > + > +#endif /* __X86_ALTERNATIVE_H__ */