From: Michal Hocko <mhocko@kernel.org>
To: Andy Lutomirski <luto@kernel.org>
Cc: x86@kernel.org, Borislav Petkov <bp@alien8.de>,
linux-kernel@vger.kernel.org, Brian Gerst <brgerst@gmail.com>,
Oleg Nesterov <oleg@redhat.com>
Subject: Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support
Date: Mon, 15 Aug 2016 13:55:02 +0200 [thread overview]
Message-ID: <20160815115502.GC3360@dhcp22.suse.cz> (raw)
In-Reply-To: <14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org>
On Thu 11-08-16 02:35:21, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.
>
> grsecurity has had a similar feature (called
> GRKERNSEC_KSTACKOVERFLOW) for a long time.
>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
Looks good to me.
FWIW
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> arch/Kconfig | 34 +++++++++++++
> arch/ia64/include/asm/thread_info.h | 2 +-
> include/linux/sched.h | 15 ++++++
> kernel/fork.c | 96 +++++++++++++++++++++++++++++--------
> 4 files changed, 126 insertions(+), 21 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index bd8056b5b246..2a7f5b62e716 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -698,4 +698,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
> config CPU_NO_EFFICIENT_FFS
> def_bool n
>
> +config HAVE_ARCH_VMAP_STACK
> + def_bool n
> + help
> + An arch should select this symbol if it can support kernel stacks
> + in vmalloc space. This means:
> +
> + - vmalloc space must be large enough to hold many kernel stacks.
> + This may rule out many 32-bit architectures.
> +
> + - Stacks in vmalloc space need to work reliably. For example, if
> + vmap page tables are created on demand, either this mechanism
> + needs to work while the stack points to a virtual address with
> + unpopulated page tables or arch code (switch_to and switch_mm,
> + most likely) needs to ensure that the stack's page table entries
> + are populated before running on a possibly unpopulated stack.
> +
> + - If the stack overflows into a guard page, something reasonable
> + should happen. The definition of "reasonable" is flexible, but
> + instantly rebooting without logging anything would be unfriendly.
> +
> +config VMAP_STACK
> + default y
> + bool "Use a virtually-mapped stack"
> + depends on HAVE_ARCH_VMAP_STACK && !KASAN
> + ---help---
> + Enable this if you want the use virtually-mapped kernel stacks
> + with guard pages. This causes kernel stack overflows to be
> + caught immediately rather than causing difficult-to-diagnose
> + corruption.
> +
> + This is presently incompatible with KASAN because KASAN expects
> + the stack to map directly to the KASAN shadow map using a formula
> + that is incorrect if the stack is in vmalloc space.
> +
> source "kernel/gcov/Kconfig"
> diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
> index 29bd59790d6c..c7026429816b 100644
> --- a/arch/ia64/include/asm/thread_info.h
> +++ b/arch/ia64/include/asm/thread_info.h
> @@ -56,7 +56,7 @@ struct thread_info {
> #define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0)
> #define task_thread_info(tsk) ((struct thread_info *) 0)
> #endif
> -#define free_thread_stack(ti) /* nothing */
> +#define free_thread_stack(tsk) /* nothing */
> #define task_stack_page(tsk) ((void *)(tsk))
>
> #define __HAVE_THREAD_FUNCTIONS
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 62c68e513e39..20f9f47bcfd0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1923,6 +1923,9 @@ struct task_struct {
> #ifdef CONFIG_MMU
> struct task_struct *oom_reaper_list;
> #endif
> +#ifdef CONFIG_VMAP_STACK
> + struct vm_struct *stack_vm_area;
> +#endif
> /* CPU-specific state of this task */
> struct thread_struct thread;
> /*
> @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly;
> # define arch_task_struct_size (sizeof(struct task_struct))
> #endif
>
> +#ifdef CONFIG_VMAP_STACK
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> + return t->stack_vm_area;
> +}
> +#else
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> + return NULL;
> +}
> +#endif
> +
> /* Future-safe accessor for struct task_struct's cpus_allowed. */
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 52e725d4a866..05f7ef796fb4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack)
> * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
> * kmemcache based allocator.
> */
> -# if THREAD_SIZE >= PAGE_SIZE
> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> - int node)
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> {
> +#ifdef CONFIG_VMAP_STACK
> + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> + VMALLOC_START, VMALLOC_END,
> + THREADINFO_GFP | __GFP_HIGHMEM,
> + PAGE_KERNEL,
> + 0, node,
> + __builtin_return_address(0));
> +
> + /*
> + * We can't call find_vm_area() in interrupt context, and
> + * free_thread_stack can be called in interrupt context, so cache
> + * the vm_struct.
> + */
> + if (stack)
> + tsk->stack_vm_area = find_vm_area(stack);
> + return stack;
> +#else
> struct page *page = alloc_pages_node(node, THREADINFO_GFP,
> THREAD_SIZE_ORDER);
>
> return page ? page_address(page) : NULL;
> +#endif
> }
>
> -static inline void free_thread_stack(unsigned long *stack)
> +static inline void free_thread_stack(struct task_struct *tsk)
> {
> - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
> + if (task_stack_vm_area(tsk))
> + vfree(tsk->stack);
> + else
> + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
> }
> # else
> static struct kmem_cache *thread_stack_cache;
> @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
> }
>
> -static void free_thread_stack(unsigned long *stack)
> +static void free_thread_stack(struct task_struct *tsk)
> {
> - kmem_cache_free(thread_stack_cache, stack);
> + kmem_cache_free(thread_stack_cache, tsk->stack);
> }
>
> void thread_stack_cache_init(void)
> @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep;
> /* SLAB cache for mm_struct structures (tsk->mm) */
> static struct kmem_cache *mm_cachep;
>
> -static void account_kernel_stack(unsigned long *stack, int account)
> +static void account_kernel_stack(struct task_struct *tsk, int account)
> {
> - /* All stack pages are in the same zone and belong to the same memcg. */
> - struct page *first_page = virt_to_page(stack);
> + void *stack = task_stack_page(tsk);
> + struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
> +
> + if (vm) {
> + int i;
> +
> + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
> +
> + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> + mod_zone_page_state(page_zone(vm->pages[i]),
> + NR_KERNEL_STACK_KB,
> + PAGE_SIZE / 1024 * account);
> + }
>
> - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> - THREAD_SIZE / 1024 * account);
> + /* All stack pages belong to the same memcg. */
> + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> + account * (THREAD_SIZE / 1024));
> + } else {
> + /*
> + * All stack pages are in the same zone and belong to the
> + * same memcg.
> + */
> + struct page *first_page = virt_to_page(stack);
> +
> + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> + THREAD_SIZE / 1024 * account);
>
> - memcg_kmem_update_page_stat(
> - first_page, MEMCG_KERNEL_STACK_KB,
> - account * (THREAD_SIZE / 1024));
> + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
> + account * (THREAD_SIZE / 1024));
> + }
> }
>
> void free_task(struct task_struct *tsk)
> {
> - account_kernel_stack(tsk->stack, -1);
> + account_kernel_stack(tsk, -1);
> arch_release_thread_stack(tsk->stack);
> - free_thread_stack(tsk->stack);
> + free_thread_stack(tsk);
> rt_mutex_debug_task_free(tsk);
> ftrace_graph_exit_task(tsk);
> put_seccomp_filter(tsk);
> @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> {
> struct task_struct *tsk;
> unsigned long *stack;
> + struct vm_struct *stack_vm_area;
> int err;
>
> if (node == NUMA_NO_NODE)
> @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> if (!stack)
> goto free_tsk;
>
> + stack_vm_area = task_stack_vm_area(tsk);
> +
> err = arch_dup_task_struct(tsk, orig);
> +
> + /*
> + * arch_dup_task_struct clobbers the stack-related fields. Make
> + * sure they're properly initialized before using any stack-related
> + * functions again.
> + */
> + tsk->stack = stack;
> +#ifdef CONFIG_VMAP_STACK
> + tsk->stack_vm_area = stack_vm_area;
> +#endif
> +
> if (err)
> goto free_stack;
>
> - tsk->stack = stack;
> #ifdef CONFIG_SECCOMP
> /*
> * We must handle setting up seccomp filters once we're under
> @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> tsk->task_frag.page = NULL;
> tsk->wake_q.next = NULL;
>
> - account_kernel_stack(stack, 1);
> + account_kernel_stack(tsk, 1);
>
> kcov_task_init(tsk);
>
> return tsk;
>
> free_stack:
> - free_thread_stack(stack);
> + free_thread_stack(tsk);
> free_tsk:
> free_task_struct(tsk);
> return NULL;
> --
> 2.7.4
--
Michal Hocko
SUSE Labs
next prev parent reply other threads:[~2016-08-15 11:55 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-08-11 9:35 [PATCH v6 0/3] virtually mapped stacks Andy Lutomirski
2016-08-11 9:35 ` [PATCH v6 1/3] fork: Add generic vmalloced stack support Andy Lutomirski
2016-08-15 11:55 ` Michal Hocko [this message]
2016-08-24 10:03 ` Ingo Molnar
2016-08-24 16:11 ` Dmitry Vyukov
2016-08-24 13:02 ` [tip:x86/asm] " tip-bot for Andy Lutomirski
2016-08-24 16:51 ` [PATCH v6 1/3] " Josh Cartwright
2016-08-30 22:01 ` Andy Lutomirski
2016-08-30 22:26 ` Josh Cartwright
2016-08-11 9:35 ` [PATCH v6 2/3] dma-api: Teach the "DMA-from-stack" check about vmapped stacks Andy Lutomirski
2016-08-24 13:02 ` [tip:x86/asm] " tip-bot for Andy Lutomirski
2016-08-11 9:35 ` [PATCH v6 3/3] x86/mm/64: Enable " Andy Lutomirski
2016-08-24 13:03 ` [tip:x86/asm] x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y) tip-bot for Andy Lutomirski
[not found] ` <tip-e37e43a497d5a8b7c0cc1736d56986f432c394c9-Ckxz5ZWcFp/9qxiX1TGQuw@public.gmane.org>
2016-10-21 12:32 ` Matt Fleming
2016-10-21 12:32 ` Matt Fleming
[not found] ` <20161021123243.GC27807-mF/unelCI9GS6iBeEJttW/XRex20P6io@public.gmane.org>
2016-10-22 0:18 ` Andy Lutomirski
2016-10-22 0:18 ` Andy Lutomirski
[not found] ` <CALCETrUqhRg_G1fia=oFKrsxpF_YobEJLs1KpA3UF2Pnt6-u+w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-10-24 13:09 ` Matt Fleming
2016-10-24 13:09 ` Matt Fleming
2016-10-30 16:21 ` Andy Lutomirski
2016-11-07 12:32 ` Matt Fleming
2016-08-20 18:06 ` [PATCH v6 0/3] virtually mapped stacks Andy Lutomirski
2016-08-24 10:12 ` Ingo Molnar
2016-08-24 15:27 ` Andy Lutomirski
2016-08-24 17:16 ` Ingo Molnar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160815115502.GC3360@dhcp22.suse.cz \
--to=mhocko@kernel.org \
--cc=bp@alien8.de \
--cc=brgerst@gmail.com \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=oleg@redhat.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.