From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Jan Beulich <JBeulich@suse.com>,
xen-devel <xen-devel@lists.xenproject.org>
Cc: Keir Fraser <keir@xen.org>
Subject: Re: [PATCH 1/4] x86: move syscall trampolines off the stack
Date: Tue, 19 May 2015 17:59:19 +0100 [thread overview]
Message-ID: <555B6BE7.508@citrix.com> (raw)
In-Reply-To: <5559FB2E020000780007B196@mail.emea.novell.com>
On 18/05/15 13:46, Jan Beulich wrote:
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -25,6 +25,7 @@
> #include <xen/kernel.h>
> #include <xen/mm.h>
> #include <xen/domain.h>
> +#include <xen/domain_page.h>
> #include <xen/sched.h>
> #include <xen/sched-if.h>
> #include <xen/irq.h>
> @@ -603,6 +604,42 @@ static int do_boot_cpu(int apicid, int c
> return rc;
> }
>
> +unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
> +{
> + unsigned long stub_va;
> + struct page_info *pg;
> +
> + if ( *mfn )
> + pg = mfn_to_page(*mfn);
> + else
> + {
> + nodeid_t node = cpu_to_node(cpu);
> + unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
> +
> + pg = alloc_domheap_page(NULL, memflags);
> + if ( !pg )
> + return 0;
> +
> + unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
Seems like memset_page(pg, int val) might be a nice piece of cleanup.
> + }
> +
> + stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
> + if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
> + PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
> + {
> + if ( !*mfn )
> + free_domheap_page(pg);
> + stub_va = 0;
> + }
> + else
> + {
> + stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
"(cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE" comes up very frequently
through this patch.
I think the logic would be easier to read given:
#define STUB_OFFSET(cpu) (((cpu) & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
> + *mfn = page_to_mfn(pg);
> + }
> +
> + return stub_va;
> +}
> +
> void cpu_exit_clear(unsigned int cpu)
> {
> cpu_uninit(cpu);
> @@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
> free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
> free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>
> + if ( per_cpu(stubs.addr, cpu) )
> + {
> + unsigned long mfn = per_cpu(stubs.mfn, cpu);
> + unsigned char *stub_page = map_domain_page(mfn);
> + unsigned int i;
> +
> + memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
> + 0xcc, STUB_BUF_SIZE);
> + for ( i = 0; i < STUBS_PER_PAGE; ++i )
> + if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
> + break;
There is a race condition between allocate and free, as
write_stub_trampoline() is written by the cpu itself. Just finding 0xcc
here doesn't mean there isn't a different cpu in the process of coming
up and intending to use the stub page.
I suspect the race can be fixed by having the alloc side clobber the
first instruction, perhaps to ud2 ?
(Also, style at the end of the if)
> + unmap_domain_page(stub_page);
> + destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
> + (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
> + if ( i == STUBS_PER_PAGE )
> + free_domheap_page(mfn_to_page(mfn));
> + }
> +
> order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
> free_xenheap_pages(per_cpu(gdt_table, cpu), order);
>
> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -219,7 +219,19 @@ ENTRY(compat_post_handle_exception)
> movb $0,TRAPBOUNCE_flags(%rdx)
> jmp compat_test_all_events
>
I would be tempted to leave a comment here referring to 'lstar_enter'
for stack/register details.
> -ENTRY(compat_syscall)
> +ENTRY(cstar_enter)
> + sti
> + movq 8(%rsp),%rax /* Restore %rax. */
> + movq $FLAT_KERNEL_SS,8(%rsp)
> + pushq %r11
> + pushq $FLAT_USER_CS32
> + pushq %rcx
> + pushq $0
> + SAVE_VOLATILE TRAP_syscall
> + GET_CURRENT(%rbx)
> + movq VCPU_domain(%rbx),%rcx
> + cmpb $0,DOMAIN_is_32bit_pv(%rcx)
> + je switch_to_kernel
> cmpb $0,VCPU_syscall32_disables_events(%rbx)
> movzwl VCPU_syscall32_sel(%rbx),%esi
> movq VCPU_syscall32_addr(%rbx),%rax
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -13,9 +13,8 @@
> #include <public/xen.h>
> #include <irq_vectors.h>
>
> - ALIGN
> /* %rbx: struct vcpu */
> -switch_to_kernel:
> +ENTRY(switch_to_kernel)
> leaq VCPU_trap_bounce(%rbx),%rdx
> /* TB_eip = (32-bit syscall && syscall32_addr) ?
> * syscall32_addr : syscall_addr */
> @@ -114,22 +113,21 @@ restore_all_xen:
> * Vector directly to the registered arch.syscall_addr.
> *
> * Initial work is done by per-CPU stack trampolines. At this point %rsp
The trampolines are no longer on the stack, so I think this line needs
editing as well.
> - * has been initialised to point at the correct Xen stack, and %rsp, %rflags
> - * and %cs have been saved. All other registers are still to be saved onto
> - * the stack, starting with %rip, and an appropriate %ss must be saved into
> - * the space left by the trampoline.
> + * has been initialised to point at the correct Xen stack, %rsp has been
> + * saved, and %rax needs to be restored from the %ss save slot. All other
> + * registers are still to be saved onto the stack, starting with RFLAGS, and
> + * an appropriate %ss must be saved into the space left by the trampoline.
> */
> -ENTRY(syscall_enter)
> +ENTRY(lstar_enter)
> sti
> - movl $FLAT_KERNEL_SS,24(%rsp)
> + movq 8(%rsp),%rax /* Restore %rax. */
> + movq $FLAT_KERNEL_SS,8(%rsp)
> + pushq %r11
> + pushq $FLAT_KERNEL_CS64
> pushq %rcx
> pushq $0
> - movq 24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
> SAVE_VOLATILE TRAP_syscall
> GET_CURRENT(%rbx)
> - movq VCPU_domain(%rbx),%rcx
> - testb $1,DOMAIN_is_32bit_pv(%rcx)
> - jnz compat_syscall
> testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
> jz switch_to_kernel
>
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
>
> +DEFINE_PER_CPU(struct stubs, stubs);
> +void lstar_enter(void);
> +void cstar_enter(void);
> +
> void __devinit subarch_percpu_traps_init(void)
> {
> - char *stack_bottom, *stack;
> -
> - stack_bottom = (char *)get_stack_bottom();
> - stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
> + unsigned long stack_bottom = get_stack_bottom();
> + unsigned long stub_va = this_cpu(stubs.addr);
> + unsigned char *stub_page;
> + unsigned int offset;
>
> /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
> BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
>
> - /* Trampoline for SYSCALL entry from long mode. */
> - stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
> - wrmsrl(MSR_LSTAR, (unsigned long)stack);
> - stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
> + stub_page = map_domain_page(this_cpu(stubs.mfn));
> +
> + /* Trampoline for SYSCALL entry from 64-bit mode. */
> + wrmsrl(MSR_LSTAR, stub_va);
> + offset = write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
I would personally opt for "stub_va & ~PAGE_MASK"
> + stub_va, stack_bottom,
> + (unsigned long)lstar_enter);
> + stub_va += offset;
>
> if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
> boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
> {
> /* SYSENTER entry. */
> - wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
> + wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
> wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
> wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
> }
>
> /* Trampoline for SYSCALL entry from compatibility mode. */
> - stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
> - wrmsrl(MSR_CSTAR, (unsigned long)stack);
> - stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
> + wrmsrl(MSR_CSTAR, stub_va);
> + offset += write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
> + stub_va, stack_bottom,
> + (unsigned long)cstar_enter);
> +
> + /* Don't consume more than half of the stub space here. */
> + ASSERT(offset <= STUB_BUF_SIZE / 2);
> +
> + unmap_domain_page(stub_page);
>
> /* Common SYSCALL parameters. */
> wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
As for the rest of the patch, I have checked the asm changes carefully
and can't see any issues.
~Andrew
next prev parent reply other threads:[~2015-05-19 17:07 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
2015-05-18 18:39 ` Andrew Cooper
2015-05-19 6:41 ` Jan Beulich
2015-05-19 9:24 ` Andrew Cooper
2015-05-19 16:59 ` Andrew Cooper [this message]
2015-05-20 9:16 ` Jan Beulich
2015-05-20 13:37 ` Jan Beulich
2015-05-20 13:58 ` Andrew Cooper
2015-05-20 15:54 ` Jan Beulich
2015-05-18 12:46 ` [PATCH 2/4] x86emul: move stubs " Jan Beulich
2015-05-19 17:33 ` Andrew Cooper
2015-05-20 9:25 ` Jan Beulich
2015-05-18 12:47 ` [PATCH 3/4] x86: move I/O emulation " Jan Beulich
2015-05-19 17:48 ` Andrew Cooper
2015-05-20 13:57 ` Jan Beulich
2015-05-18 12:47 ` [PATCH 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
2015-05-19 18:53 ` Andrew Cooper
2015-05-20 9:32 ` Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=555B6BE7.508@citrix.com \
--to=andrew.cooper3@citrix.com \
--cc=JBeulich@suse.com \
--cc=keir@xen.org \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.