[PATCH 4.4 03/37] kaiser: merged update

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Hugh Dickins <hughd@google.com>,
	Jiri Kosina <jkosina@suse.cz>
Subject: [PATCH 4.4 03/37] kaiser: merged update
Date: Wed,  3 Jan 2018 21:11:09 +0100	[thread overview]
Message-ID: <20180103195057.043002552@linuxfoundation.org> (raw)
In-Reply-To: <20180103195056.837404126@linuxfoundation.org>

4.4-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Dave Hansen <dave.hansen@linux.intel.com>


Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S            |  106 ++++++++++-
 arch/x86/include/asm/kaiser.h        |   43 ++--
 arch/x86/include/asm/pgtable.h       |   18 +
 arch/x86/include/asm/pgtable_64.h    |   48 ++++-
 arch/x86/include/asm/pgtable_types.h |    6 
 arch/x86/kernel/espfix_64.c          |   13 -
 arch/x86/kernel/head_64.S            |   19 +-
 arch/x86/kernel/ldt.c                |   27 ++
 arch/x86/kernel/tracepoint.c         |    2 
 arch/x86/mm/kaiser.c                 |  318 +++++++++++++++++++++++++----------
 arch/x86/mm/pageattr.c               |   63 +++++-
 arch/x86/mm/pgtable.c                |   40 +---
 include/linux/kaiser.h               |   26 ++
 kernel/fork.c                        |    9 
 security/Kconfig                     |    5 
 15 files changed, 553 insertions(+), 190 deletions(-)
 create mode 100644 include/linux/kaiser.h

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	/*
@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel.  This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs.  Normal interrupts are OK because
+	 * they are off here.
+	 */
 	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	/*
+	 * error_entry() always returns with a kernel gsbase and
+	 * CR3.  We must also have a kernel CR3/gsbase before
+	 * calling TRACE_IRQS_*.  Just unconditionally switch to
+	 * the kernel CR3 here.
+	 */
+	SWITCH_KERNEL_CR3
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
-	SWITCH_KERNEL_CR3
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/*
@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
 	 * Switch to kernel gsbase:
 	 */
 	SWAPGS
-	SWITCH_KERNEL_CR3
+
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * as if we faulted immediately after IRET and clear EBX so that
@@ -1222,7 +1249,10 @@ ENTRY(nmi)
 	 */
 
 	SWAPGS_UNSAFE_STACK
-	SWITCH_KERNEL_CR3_NO_STACK
+	/*
+	 * percpu variables are mapped with user CR3, so no need
+	 * to switch CR3 here.
+	 */
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1256,14 +1286,33 @@ ENTRY(nmi)
 
 	movq	%rsp, %rdi
 	movq	$-1, %rsi
+#ifdef CONFIG_KAISER
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	movq	%cr3, %rax
+	pushq	%rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+	andq	$(~0x1000), %rax
+#endif
+	movq	%rax, %cr3
+#endif
 	call	do_nmi
+	/*
+	 * Unconditionally restore CR3.  I know we return to
+	 * kernel code that needs user CR3, but do we ever return
+	 * to "user mode" where we need the kernel CR3?
+	 */
+#ifdef CONFIG_KAISER
+	popq	%rax
+	mov	%rax, %cr3
+#endif
 
 	/*
 	 * Return back to user mode.  We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts.  Fortunately,
-	 * do_nmi doesn't modify pt_regs.
+	 * work, because we don't want to enable interrupts.  Do not
+	 * switch to user CR3: we might be going back to kernel code
+	 * that had a user CR3 set.
 	 */
-	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 
@@ -1459,23 +1508,54 @@ end_repeat_nmi:
 	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
+	 * Use the same approach as paranoid_entry to handle SWAPGS, but
+	 * without CR3 handling since we do that differently in NMIs.  No
+	 * need to use paranoid_exit as we should not be calling schedule
+	 * in NMI context.  Even with normal interrupts enabled. An NMI
+	 * should not be setting NEED_RESCHED or anything that normal
+	 * interrupts and exceptions might do.
 	 */
-	call	paranoid_entry
+	cld
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
+	SWAPGS
+	xorl	%ebx, %ebx
+1:
+#ifdef CONFIG_KAISER
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	movq	%cr3, %rax
+	pushq	%rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+	andq	$(~0x1000), %rax
+#endif
+	movq	%rax, %cr3
+#endif
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq	%rsp, %rdi
+	addq	$8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
 	movq	$-1, %rsi
 	call	do_nmi
+	/*
+	 * Unconditionally restore CR3.  We might be returning to
+	 * kernel code that needs user CR3, like just just before
+	 * a sysret.
+	 */
+#ifdef CONFIG_KAISER
+	popq	%rax
+	mov	%rax, %cr3
+#endif
 
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
-	SWITCH_USER_CR3_NO_STACK
+	/* We fixed up CR3 above, so no need to switch it here */
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_EXTRA_REGS
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
 andq $(~0x1000), \reg
+#endif
 movq \reg, %cr3
 .endm
 
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
 orq $(0x1000), \reg
+#endif
 movq \reg, %cr3
 .endm
 
@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_b
 .endm
 
 #endif /* CONFIG_KAISER */
+
 #else /* __ASSEMBLY__ */
 
 
 #ifdef CONFIG_KAISER
-// Upon kernel/user mode switch, it may happen that
-// the address space has to be switched before the registers have been stored.
-// To change the address space, another register is needed.
-// A register therefore has to be stored/restored.
-//
-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored.  To change the address space, another register is
+ * needed.  A register therefore has to be stored/restored.
+*/
 
-#endif /* CONFIG_KAISER */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
 /**
- *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
  *  @addr: the start address of the range
  *  @size: the size of the range
  *  @flags: The mapping flags of the pages
  *
- *  the mapping is done on a global scope, so no bigger synchronization has to be done.
- *  the pages have to be manually unmapped again when they are not needed any longer.
+ *  The mapping is done on a global scope, so no bigger
+ *  synchronization has to be done.  the pages have to be
+ *  manually unmapped again when they are not needed any longer.
  */
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 
 
 /**
- *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
  *  @addr: the start address of the range
  *  @size: the size of the range
  */
 extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
 
 /**
- *  shadowmem_initialize_mapping - Initalize the shadow mapping
+ *  kaiser_initialize_mapping - Initalize the shadow mapping
  *
- *  most parts of the shadow mapping can be mapped upon boot time.
- *  only the thread stacks have to be mapped on runtime.
- *  the mapped regions are not unmapped at all.
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and nevertunmapped.
  */
 extern void kaiser_init(void);
 
-#endif
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
 
 
 
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *p
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	pgdval_t ignore_flags = _PAGE_USER;
+	/*
+	 * We set NX on KAISER pgds that map userspace memory so
+	 * that userspace can not meaningfully use the kernel
+	 * page table by accident; it will fault on the first
+	 * instruction it tries to run.  See native_set_pgd().
+	 */
+	if (IS_ENABLED(CONFIG_KAISER))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t
 {
        memcpy(dst, src, count * sizeof(pgd_t));
 #ifdef CONFIG_KAISER
-	// clone the shadow pgd part as well
-	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+	/* Clone the shadow pgd part as well */
+	memcpy(native_get_shadow_pgd(dst),
+	       native_get_shadow_pgd(src),
+	       count * sizeof(pgd_t));
 #endif
 }
 
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_
 }
 
 #ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
 	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
 }
 
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
 	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
 }
+#else
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
+	BUILD_BUG_ON(1);
+	return NULL;
+}
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
+	return pgdp;
+}
 #endif /* CONFIG_KAISER */
 
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 #ifdef CONFIG_KAISER
-	// We know that a pgd is page aligned.
-	// Therefore the lower indices have to be mapped to user space.
-	// These pages are mapped to the shadow mapping.
-	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+	pteval_t extra_kern_pgd_flags = 0;
+	/* Do we need to also populate the shadow pgd? */
+	if (is_userspace_pgd(pgdp)) {
 		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+		/*
+		 * Even if the entry is *mapping* userspace, ensure
+		 * that userspace can not use it.  This way, if we
+		 * get out to userspace running on the kernel CR3,
+		 * userspace will crash instead of running.
+		 */
+		extra_kern_pgd_flags = _PAGE_NX;
 	}
-
-	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+	pgdp->pgd = pgd.pgd;
+	pgdp->pgd |= extra_kern_pgd_flags;
 #else /* CONFIG_KAISER */
 	*pgdp = pgd;
 #endif
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -42,7 +42,7 @@
 #ifdef CONFIG_KAISER
 #define _PAGE_GLOBAL	(_AT(pteval_t, 0))
 #else
-#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -93,11 +93,7 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#ifdef CONFIG_KAISER
-#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
-#else
 #define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#endif
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
 	/* Install the espfix pud into the kernel page directory */
 	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-#ifdef CONFIG_KAISER
-	// add the esp stack pud to the shadow mapping here.
-	// This can be done directly, because the fixup stack has its own pud
-	set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
-#endif
+	/*
+	 * Just copy the top-level PGD that is mapping the espfix
+	 * area to ensure it is mapped into the shadow user page
+	 * tables.
+	 */
+	if (IS_ENABLED(CONFIG_KAISER))
+		set_pgd(native_get_shadow_pgd(pgd_p),
+			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
 
 	/* Randomize the locations */
 	init_espfix_random();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -442,11 +442,24 @@ early_idt_ripmsg:
 GLOBAL(name)
 
 #ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
 #define NEXT_PGD_PAGE(name) \
 	.balign 2 * PAGE_SIZE; \
 GLOBAL(name)
 #else
 #define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL	0
 #endif
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
@@ -461,6 +474,7 @@ GLOBAL(name)
 NEXT_PGD_PAGE(early_level4_pgt)
 	.fill	511,8,0
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)
 
 #ifndef CONFIG_XEN
 NEXT_PGD_PAGE(init_level4_pgt)
-	.fill	2*512,8,0
+	.fill	512,8,0
+	.fill	KAISER_USER_PGD_FILL,8,0
 #else
 NEXT_PGD_PAGE(init_level4_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
 	.org    init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #endif
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/kaiser.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
 	set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ldt->entries);
+	else
+		free_page((unsigned long)ldt->entries);
+	kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
 	struct ldt_struct *new_ldt;
 	int alloc_size;
+	int ret = 0;
 
 	if (size > LDT_ENTRIES)
 		return NULL;
@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_stru
 		return NULL;
 	}
 
+	// FIXME: make kaiser_add_mapping() return an error code
+	// when it fails
+	kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+			   __PAGE_KERNEL);
+	if (ret) {
+		__free_ldt_struct(new_ldt);
+		return NULL;
+	}
 	new_ldt->size = size;
 	return new_ldt;
 }
@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_s
 	if (likely(!ldt))
 		return;
 
+	kaiser_remove_mapping((unsigned long)ldt->entries,
+			      ldt->size * LDT_ENTRY_SIZE);
 	paravirt_free_ldt(ldt->entries, ldt->size);
-	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-		vfree(ldt->entries);
-	else
-		free_page((unsigned long)ldt->entries);
-	kfree(ldt);
+	__free_ldt_struct(ldt);
 }
 
 /*
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
 #include <linux/atomic.h>
 
 atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
 struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
 				(unsigned long) trace_idt_table };
 
 /* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
 gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
 
 static int trace_irq_vector_refcount;
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -1,160 +1,306 @@
-
-
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
-
 #include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/kaiser.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
 #ifdef CONFIG_KAISER
 
 __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
 
-/**
- * Get the real ppn from a address in kernel mapping.
- * @param address The virtual adrress
- * @return the physical address
+/*
+ * Returns -1 on error.
  */
-static inline unsigned long get_pa_from_mapping (unsigned long address)
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
-	pgd = pgd_offset_k(address);
-	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+	pgd = pgd_offset_k(vaddr);
+	/*
+	 * We made all the kernel PGDs present in kaiser_init().
+	 * We expect them to stay that way.
+	 */
+	BUG_ON(pgd_none(*pgd));
+	/*
+	 * PGDs are either 512GB or 128TB on all x86_64
+	 * configurations.  We don't handle these.
+	 */
+	BUG_ON(pgd_large(*pgd));
+
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
 
-	pud = pud_offset(pgd, address);
-	BUG_ON(pud_none(*pud));
+	if (pud_large(*pud))
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
 
-	if (pud_large(*pud)) {
-		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		WARN_ON_ONCE(1);
+		return -1;
 	}
 
-	pmd = pmd_offset(pud, address);
-	BUG_ON(pmd_none(*pmd));
+	if (pmd_large(*pmd))
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
 
-	if (pmd_large(*pmd)) {
-		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (pte_none(*pte)) {
+		WARN_ON_ONCE(1);
+		return -1;
 	}
 
-	pte = pte_offset_kernel(pmd, address);
-	BUG_ON(pte_none(*pte));
-
-	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
 }
 
-void _kaiser_copy (unsigned long start_addr, unsigned long size,
-					unsigned long flags)
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 {
-	pgd_t *pgd;
-	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long address;
-	unsigned long end_addr = start_addr + size;
-	unsigned long target_address;
+	pud_t *pud;
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 
-	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
-			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
-		target_address = get_pa_from_mapping(address);
+	might_sleep();
+	if (is_atomic) {
+		gfp &= ~GFP_KERNEL;
+		gfp |= __GFP_HIGH | __GFP_ATOMIC;
+	}
 
-		pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	if (pgd_none(*pgd)) {
+		WARN_ONCE(1, "All shadow pgds should have been populated");
+		return NULL;
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 
-		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
-		BUG_ON(pgd_large(*pgd));
+	pud = pud_offset(pgd, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pud_none(*pud))
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+		else
+			free_page(new_pmd_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
 
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud)) {
-			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
-		}
-		BUG_ON(pud_large(*pud));
+	pmd = pmd_offset(pud, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pmd_none(*pmd))
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+		else
+			free_page(new_pte_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
 
-		pmd = pmd_offset(pud, address);
-		if (pmd_none(*pmd)) {
-			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
-		}
-		BUG_ON(pmd_large(*pmd));
+	return pte_offset_kernel(pmd, address);
+}
 
-		pte = pte_offset_kernel(pmd, address);
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			unsigned long flags)
+{
+	int ret = 0;
+	pte_t *pte;
+	unsigned long start_addr = (unsigned long )__start_addr;
+	unsigned long address = start_addr & PAGE_MASK;
+	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+	unsigned long target_address;
+
+	for (;address < end_addr; address += PAGE_SIZE) {
+		target_address = get_pa_from_mapping(address);
+		if (target_address == -1) {
+			ret = -EIO;
+			break;
+		}
+		pte = kaiser_pagetable_walk(address, false);
 		if (pte_none(*pte)) {
 			set_pte(pte, __pte(flags | target_address));
 		} else {
-			BUG_ON(__pa(pte_page(*pte)) != target_address);
+			pte_t tmp;
+			set_pte(&tmp, __pte(flags | target_address));
+			WARN_ON_ONCE(!pte_same(*pte, tmp));
 		}
 	}
+	return ret;
 }
 
-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
-static inline void __init _kaiser_init(void)
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+	unsigned long size = end - start;
+
+	return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
 {
 	pgd_t *pgd;
 	int i = 0;
 
 	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
-		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+		pgd_t new_pgd;
+		pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+		if (!pud) {
+			WARN_ON(1);
+			break;
+		}
+		new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+		/*
+		 * Make sure not to stomp on some other pgd entry.
+		 */
+		if (!pgd_none(pgd[i])) {
+			WARN_ON(1);
+			continue;
+		}
+		set_pgd(pgd + i, new_pgd);
 	}
 }
 
+#define kaiser_add_user_map_early(start, size, flags) do {	\
+	int __ret = kaiser_add_user_map(start, size, flags);	\
+	WARN_ON(__ret);						\
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
+	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
+	WARN_ON(__ret);							\
+} while (0)
+
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-spinlock_t shadow_table_lock;
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
 void __init kaiser_init(void)
 {
 	int cpu;
-	spin_lock_init(&shadow_table_lock);
-
-	spin_lock(&shadow_table_lock);
 
-	_kaiser_init();
+	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
-		// map the per cpu user variables
-		_kaiser_copy(
-				(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
-				(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
-				__PAGE_KERNEL);
-	}
-
-	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
-	_kaiser_copy(
-			(unsigned long) __entry_text_start,
-			(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
-			__PAGE_KERNEL_RX);
-
-	// the fixed map address of the idt_table
-	_kaiser_copy(
-			(unsigned long) idt_descr.address,
-			sizeof(gate_desc) * NR_VECTORS,
-			__PAGE_KERNEL_RO);
+		void *percpu_vaddr = __per_cpu_user_mapped_start +
+				     per_cpu_offset(cpu);
+		unsigned long percpu_sz = __per_cpu_user_mapped_end -
+					  __per_cpu_user_mapped_start;
+		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+					  __PAGE_KERNEL);
+	}
 
-	spin_unlock(&shadow_table_lock);
+	/*
+	 * Map the entry/exit text section, which is needed at
+	 * switches from user to and from kernel.
+	 */
+	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+				       __PAGE_KERNEL_RX);
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+				       __irqentry_text_end,
+				       __PAGE_KERNEL_RX);
+#endif
+	kaiser_add_user_map_early((void *)idt_descr.address,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+	kaiser_add_user_map_early(&trace_idt_descr,
+				  sizeof(trace_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&trace_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
+#endif
+	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&debug_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
 }
 
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
 // add a mapping to the shadow-mapping, and synchronize the mappings
-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
-	spin_lock(&shadow_table_lock);
-	_kaiser_copy(addr, size, flags);
-	spin_unlock(&shadow_table_lock);
+	return kaiser_add_user_map((const void *)addr, size, flags);
 }
 
-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 {
-	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
-	spin_lock(&shadow_table_lock);
-	do {
-		unmap_pud_range(pgd, start, start + size);
-	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
-	spin_unlock(&shadow_table_lock);
+	unsigned long end = start + size;
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PGDIR_SIZE) {
+		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+		/*
+		 * unmap_p4d_range() handles > P4D_SIZE unmaps,
+		 * so no need to trim 'end'.
+		 */
+		unmap_pud_range_nofree(pgd, addr, end);
+	}
 }
 #endif /* CONFIG_KAISER */
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 #define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_d
 	return 0;
 }
 
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		if (!pte_none(pte[i]))
 			return false;
@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *
 	return true;
 }
 
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
 {
 	int i;
 
+	if (!(cpa->flags & CPA_FREE_PAGETABLES))
+		return false;
+
 	for (i = 0; i < PTRS_PER_PMD; i++)
 		if (!pmd_none(pmd[i]))
 			return false;
@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *
 	return true;
 }
 
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+			    unsigned long start,
+			    unsigned long end)
 {
 	pte_t *pte = pte_offset_kernel(pmd, start);
 
@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd,
 		pte++;
 	}
 
-	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+	if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
 		pmd_clear(pmd);
 		return true;
 	}
 	return false;
 }
 
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
 			      unsigned long start, unsigned long end)
 {
-	if (unmap_pte_range(pmd, start, end))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+	if (unmap_pte_range(cpa, pmd, start, end))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+			    unsigned long start, unsigned long end)
 {
 	pmd_t *pmd = pmd_offset(pud, start);
 
@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud,
 		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 		unsigned long pre_end = min_t(unsigned long, end, next_page);
 
-		__unmap_pmd_range(pud, pmd, start, pre_end);
+		__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
 
 		start = pre_end;
 		pmd++;
@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud,
 		if (pmd_large(*pmd))
 			pmd_clear(pmd);
 		else
-			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+			__unmap_pmd_range(cpa, pud, pmd,
+					  start, start + PMD_SIZE);
 
 		start += PMD_SIZE;
 		pmd++;
@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud,
 	 * 4K leftovers?
 	 */
 	if (start < end)
-		return __unmap_pmd_range(pud, pmd, start, end);
+		return __unmap_pmd_range(cpa, pud, pmd, start, end);
 
 	/*
 	 * Try again to free the PMD page if haven't succeeded above.
 	 */
 	if (!pud_none(*pud))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+		if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
 			pud_clear(pud);
 }
 
-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+			      unsigned long start,
+			      unsigned long end)
 {
 	pud_t *pud = pud_offset(pgd, start);
 
@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
 		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 		unsigned long pre_end	= min_t(unsigned long, end, next_page);
 
-		unmap_pmd_range(pud, start, pre_end);
+		unmap_pmd_range(cpa, pud, start, pre_end);
 
 		start = pre_end;
 		pud++;
@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
 		if (pud_large(*pud))
 			pud_clear(pud);
 		else
-			unmap_pmd_range(pud, start, start + PUD_SIZE);
+			unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
 
 		start += PUD_SIZE;
 		pud++;
@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
 	 * 2M leftovers?
 	 */
 	if (start < end)
-		unmap_pmd_range(pud, start, end);
+		unmap_pmd_range(cpa, pud, start, end);
 
 	/*
 	 * No need to try to free the PUD page because we'll free it in
@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigne
 	 */
 }
 
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = CPA_FREE_PAGETABLES,
+	};
+
+	__unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	struct cpa_data cpa = {
+		.flags = 0,
+	};
+
+	__unmap_pud_range(&cpa, pgd, start, end);
+}
+
 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd_entry = root + pgd_index(addr);
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
-static inline pgd_t *_pgd_alloc(void)
-{
-#ifdef CONFIG_KAISER
-	// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
-	// block. Therefore, we have to allocate at least 3 pages. However, the
-	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
-	// the beginning of the page of our 8kb-aligned memory block in order to
-	// correctly free it afterwars.
 
-	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
-
-	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
-	{
-		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
-		return (pgd_t *) pages;
-	}
-	else
-	{
-		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
-		return (pgd_t *) (pages + PAGE_SIZE);
-	}
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pmd, we aquire two pmds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
 #else
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#define PGD_ALLOCATION_ORDER 0
 #endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-#ifdef CONFIG_KAISER
-  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
-	free_pages(pages, get_order(4*PAGE_SIZE));
-#else
-	free_page((unsigned long)pgd);
-#endif
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
 
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,26 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct tas
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
@@ -357,9 +357,10 @@ static struct task_struct *dup_task_stru
 		goto free_ti;
 
 	tsk->stack = ti;
-#ifdef CONFIG_KAISER
-	kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
-#endif
+
+	err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+	if (err)
+		goto free_ti;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -32,12 +32,17 @@ config SECURITY
 	  If you are unsure how to answer this question, answer N.
 config KAISER
 	bool "Remove the kernel mapping in user mode"
+	default y
 	depends on X86_64
 	depends on !PARAVIRT
 	help
 	  This enforces a strict kernel and user space isolation in order to close
 	  hardware side channels on kernel address information.
 
+config KAISER_REAL_SWITCH
+	bool "KAISER: actually switch page tables"
+	default y
+
 config SECURITYFS
 	bool "Enable the securityfs filesystem"
 	help

next prev parent reply	other threads:[~2018-01-03 20:12 UTC|newest]

Thread overview: 156+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-03 20:11 [PATCH 4.4 00/37] 4.4.110-stable review Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 01/37] x86/boot: Add early cmdline parsing for options with arguments Greg Kroah-Hartman
2018-01-03 20:11   ` Greg Kroah-Hartman
2018-01-03 20:11   ` Greg Kroah-Hartman
2018-01-03 20:11   ` Greg Kroah-Hartman
2018-01-03 20:11 ` [kernel-hardening] [PATCH 4.4 02/37] KAISER: Kernel Address Isolation Greg Kroah-Hartman
2018-01-03 20:11   ` Greg Kroah-Hartman
2018-01-03 20:11 ` Greg Kroah-Hartman [this message]
2018-01-03 20:11 ` [PATCH 4.4 04/37] kaiser: do not set _PAGE_NX on pgd_none Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 05/37] kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 06/37] kaiser: fix build and FIXME in alloc_ldt_struct() Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 07/37] kaiser: KAISER depends on SMP Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 08/37] kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 09/37] kaiser: fix perf crashes Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 10/37] kaiser: ENOMEM if kaiser_pagetable_walk() NULL Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 11/37] kaiser: tidied up asm/kaiser.h somewhat Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 12/37] kaiser: tidied up kaiser_add/remove_mapping slightly Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 13/37] kaiser: kaiser_remove_mapping() move along the pgd Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 14/37] kaiser: cleanups while trying for gold link Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 15/37] kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 16/37] kaiser: delete KAISER_REAL_SWITCH option Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 17/37] kaiser: vmstat show NR_KAISERTABLE as nr_overhead Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 18/37] kaiser: enhanced by kernel and user PCIDs Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 19/37] kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 20/37] kaiser: PCID 0 for kernel and 128 for user Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 21/37] kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 22/37] kaiser: paranoid_entry pass cr3 need to paranoid_exit Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 23/37] kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 24/37] kaiser: fix unlikely error in alloc_ldt_struct() Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 25/37] kaiser: add "nokaiser" boot option, using ALTERNATIVE Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 26/37] x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 27/37] x86/kaiser: Check boottime cmdline params Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 28/37] kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 29/37] kaiser: drop is_atomic arg to kaiser_pagetable_walk() Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 30/37] kaiser: asm/tlbflush.h handle noPGE at lower level Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 31/37] kaiser: kaiser_flush_tlb_on_return_to_user() check PCID Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 32/37] x86/paravirt: Dont patch flush_tlb_single Greg Kroah-Hartman
2018-01-03 20:11   ` Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 33/37] x86/kaiser: Reenable PARAVIRT Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 34/37] kaiser: disabled on Xen PV Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 35/37] x86/kaiser: Move feature detection up Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 36/37] KPTI: Rename to PAGE_TABLE_ISOLATION Greg Kroah-Hartman
2018-01-03 20:11 ` [PATCH 4.4 37/37] KPTI: Report when enabled Greg Kroah-Hartman
2018-01-03 22:08 ` [PATCH 4.4 00/37] 4.4.110-stable review Nathan Chancellor
2018-01-04  8:10   ` Greg Kroah-Hartman
2018-01-04  6:50 ` Naresh Kamboju
2018-01-04  9:27 ` kernelci.org bot
2018-01-05  0:06   ` Kevin Hilman
2018-01-08 15:06     ` Guillaume Tucker
2018-01-04 16:38 ` Pavel Tatashin
2018-01-04 16:53   ` Greg Kroah-Hartman
2018-01-04 17:01     ` Guenter Roeck
2018-01-04 17:09       ` Greg Kroah-Hartman
2018-01-04 17:02     ` Pavel Tatashin
2018-01-04 17:03     ` Willy Tarreau
2018-01-04 17:11       ` Greg Kroah-Hartman
2018-01-04 17:13         ` Willy Tarreau
2018-01-04 17:14         ` Greg Kroah-Hartman
2018-01-04 17:16           ` Greg Kroah-Hartman
2018-01-04 17:56             ` Guenter Roeck
2018-01-05 15:00               ` Greg Kroah-Hartman
2018-01-05 18:12                 ` Guenter Roeck
2018-01-05 20:53                   ` Greg Kroah-Hartman
2018-01-04 20:11   ` Linus Torvalds
2018-01-04 17:03 ` Guenter Roeck
2018-01-04 19:38 ` Thomas Voegtle
2018-01-04 19:50   ` Greg Kroah-Hartman
2018-01-04 20:16     ` Thomas Voegtle
2018-01-04 20:29       ` Linus Torvalds
2018-01-04 20:43         ` Andy Lutomirski
2018-01-04 20:57           ` Hugh Dickins
2018-01-04 21:16             ` Andy Lutomirski
2018-01-04 21:23             ` Pavel Tatashin
2018-01-04 21:37               ` Hugh Dickins
2018-01-04 21:48                 ` Pavel Tatashin
2018-01-04 22:33                   ` Linus Torvalds
2018-01-05 14:59                   ` Greg Kroah-Hartman
2018-01-05 15:32                     ` Pavel Tatashin
2018-01-05 15:51                       ` Greg Kroah-Hartman
2018-01-05 15:57                         ` Willy Tarreau
2018-01-05 18:01                           ` Greg Kroah-Hartman
2018-01-05 16:26                         ` Pavel Tatashin
2018-01-05 16:57                       ` Andy Lutomirski
2018-01-05 17:14                         ` Pavel Tatashin
2018-01-05 17:43                           ` Andy Lutomirski
2018-01-05 17:48                             ` Pavel Tatashin
2018-01-05 17:52                               ` Greg Kroah-Hartman
2018-01-05 18:15                                 ` Andy Lutomirski
2018-01-05 18:21                                   ` Pavel Tatashin
2018-01-05 19:14                                     ` Pavel Tatashin
2018-01-05 19:18                                       ` Pavel Tatashin
2018-01-05 20:45                                         ` Greg Kroah-Hartman
2018-01-05 21:03                                           ` Pavel Tatashin
2018-01-05 23:15                                             ` Hugh Dickins
2018-01-06  1:16                                               ` Pavel Tatashin
2018-01-07 10:45                                             ` Greg Kroah-Hartman
2018-01-07 14:17                                               ` Pavel Tatashin
2018-01-07 15:06                                                 ` Pavel Tatashin
2018-01-08  7:46                                                   ` Greg Kroah-Hartman
2018-01-08 20:38                                                     ` Pavel Tatashin
2018-01-08 21:24                                                       ` Pavel Tatashin
2018-01-11 18:36                                                         ` Pavel Tatashin
2018-01-11 18:40                                                           ` Pavel Tatashin
2018-01-11 19:09                                                             ` Linus Torvalds
2018-01-11 20:37                                                               ` Thomas Gleixner
2018-01-11 20:46                                                                 ` Linus Torvalds
2018-01-11 21:32                                                                   ` Thomas Gleixner
2018-01-11 22:30                                                                     ` Thomas Gleixner
2018-01-11 22:42                                                                       ` Steven Sistare
2018-01-11 22:47                                                                         ` Thomas Gleixner
2018-01-12  1:15                                                                           ` Guenter Roeck
2018-01-11 22:59                                                                         ` Linus Torvalds
2018-01-11 23:03                                                                       ` Thomas Gleixner
2018-01-12  7:19                                                                         ` Greg Kroah-Hartman
2018-01-12  8:03                                                                           ` Thomas Gleixner
2018-01-11 21:35                                                                   ` Steven Sistare
2018-01-11 21:44                                                                     ` Thomas Gleixner
2018-01-11 21:49                                                                       ` Linus Torvalds
2018-01-11 20:10                                                           ` Greg Kroah-Hartman
2018-01-11 20:17                                                             ` Linus Torvalds
2018-01-11 20:18                                                             ` Pavel Tatashin
2018-01-05 20:48                                   ` Greg Kroah-Hartman
2018-01-05  5:33           ` Andy Lutomirski
2018-01-05 10:12             ` Kees Cook
2018-01-05 12:14               ` Greg Kroah-Hartman
2018-01-05 13:08               ` Greg Kroah-Hartman
2018-01-04 20:10   ` Guenter Roeck
2018-01-05 14:58   ` Greg Kroah-Hartman
2018-01-05 15:25     ` Thomas Voegtle
2018-01-05 15:48       ` Greg Kroah-Hartman
2018-01-04 22:00 ` Shuah Khan
2018-01-05  7:55   ` Greg Kroah-Hartman
2018-01-04 23:45 ` Guenter Roeck
2018-01-04 23:58   ` Linus Torvalds
2018-01-05  4:37   ` Mike Galbraith
2018-01-05  4:37     ` Mike Galbraith
2018-01-05 12:17     ` Greg Kroah-Hartman
2018-01-05 12:17       ` Greg Kroah-Hartman
2018-01-05 13:03       ` Mike Galbraith
2018-01-05 13:03         ` Mike Galbraith
2018-01-05 13:34         ` Greg Kroah-Hartman
2018-01-05 13:34           ` Greg Kroah-Hartman
2018-01-05 14:03           ` Mike Galbraith
2018-01-05 23:28             ` Hugh Dickins
2018-01-06  2:58               ` Mike Galbraith
2018-01-05 13:41   ` Greg Kroah-Hartman
2018-01-05 17:51     ` Guenter Roeck
2018-01-05 17:20 ` Alice Ferrazzi
2018-01-05 18:01   ` Greg Kroah-Hartman
2018-01-09 19:49     ` Serge E. Hallyn
2018-01-10  8:48       ` Greg Kroah-Hartman
2018-01-10 16:45         ` Serge E. Hallyn
2018-01-05 17:56 ` Guenter Roeck
2018-01-05 20:54   ` Greg Kroah-Hartman
2018-01-05 21:21     ` Guenter Roeck
2018-01-06  1:35     ` Guenter Roeck

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180103195057.043002552@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=hughd@google.com \
    --cc=jkosina@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.