Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v8 10/27] x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Before introducing _PAGE_DIRTY_SW for non-hardware, memory management
purposes in the next patch, rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
_PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty bits
more clear.  There are no functional changes in this patch.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/pgtable.h       |  6 +++---
 arch/x86/include/asm/pgtable_types.h | 17 +++++++++--------
 arch/x86/kernel/relocate_kernel_64.S |  2 +-
 arch/x86/kvm/vmx/vmx.c               |  2 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0bc530c4eb13..8e38d87fce6e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -332,7 +332,7 @@ static inline pte_t pte_mkexec(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+	return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
 }
 
 static inline pte_t pte_mkyoung(pte_t pte)
@@ -406,7 +406,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
 
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
-	return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+	return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
 }
 
 static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -460,7 +460,7 @@ static inline pud_t pud_wrprotect(pud_t pud)
 
 static inline pud_t pud_mkdirty(pud_t pud)
 {
-	return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+	return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
 }
 
 static inline pud_t pud_mkdevmap(pud_t pud)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b5e49e6bac63..e647e3c75578 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -15,7 +15,7 @@
 #define _PAGE_BIT_PWT		3	/* page write through */
 #define _PAGE_BIT_PCD		4	/* page cache disabled */
 #define _PAGE_BIT_ACCESSED	5	/* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY		6	/* was written to (raised by CPU) */
+#define _PAGE_BIT_DIRTY_HW	6	/* was written to (raised by CPU) */
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_PAT		7	/* on 4KB pages */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
@@ -45,7 +45,7 @@
 #define _PAGE_PWT	(_AT(pteval_t, 1) << _PAGE_BIT_PWT)
 #define _PAGE_PCD	(_AT(pteval_t, 1) << _PAGE_BIT_PCD)
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_DIRTY_HW	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_HW)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
@@ -73,7 +73,7 @@
 			 _PAGE_PKEY_BIT3)
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY_HW | _PAGE_ACCESSED)
 #else
 #define _PAGE_KNL_ERRATUM_MASK 0
 #endif
@@ -111,9 +111,9 @@
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
-				 _PAGE_ACCESSED | _PAGE_DIRTY)
+				 _PAGE_ACCESSED | _PAGE_DIRTY_HW)
 #define _KERNPG_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW |		\
-				 _PAGE_ACCESSED | _PAGE_DIRTY)
+				 _PAGE_ACCESSED | _PAGE_DIRTY_HW)
 
 /*
  * Set of bits not changed in pte_modify.  The pte's
@@ -122,7 +122,7 @@
  * pte_modify() does modify it.
  */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
-			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
+			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW |	\
 			 _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
@@ -167,7 +167,8 @@ enum page_cache_mode {
 					 _PAGE_ACCESSED)
 
 #define __PAGE_KERNEL_EXEC						\
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY_HW | _PAGE_ACCESSED | \
+	 _PAGE_GLOBAL)
 #define __PAGE_KERNEL		(__PAGE_KERNEL_EXEC | _PAGE_NX)
 
 #define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
@@ -186,7 +187,7 @@ enum page_cache_mode {
 #define _PAGE_ENC	(_AT(pteval_t, sme_me_mask))
 
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
-			 _PAGE_DIRTY | _PAGE_ENC)
+			 _PAGE_DIRTY_HW | _PAGE_ENC)
 #define _PAGE_TABLE	(_KERNPG_TABLE | _PAGE_USER)
 
 #define __PAGE_KERNEL_ENC	(__PAGE_KERNEL | _PAGE_ENC)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..60b75e8f4c14 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -15,7 +15,7 @@
  */
 
 #define PTR(x) (x << 3)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY_HW)
 
 /*
  * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 42ed3faa6af8..226875fbfa45 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3395,7 +3395,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	/* Set up identity-mapping pagetable for EPT in real mode */
 	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
 		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
-			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+			_PAGE_ACCESSED | _PAGE_DIRTY_HW | _PAGE_PSE);
 		r = kvm_write_guest_page(kvm, identity_map_pfn,
 				&tmp, i * sizeof(tmp), sizeof(tmp));
 		if (r < 0)
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 09/27] mm/mmap: Prevent Shadow Stack VMA merges
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

To prevent function call/return spills into the next shadow stack
area, do not merge shadow stack areas.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 mm/mmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/mmap.c b/mm/mmap.c
index 7e8c3e8ae75f..b1a921c0de63 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1149,6 +1149,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
+	/*
+	 * Do not merge shadow stack areas.
+	 */
+	if (vm_flags & VM_SHSTK)
+		return NULL;
+
 	if (prev)
 		next = prev->vm_next;
 	else
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 08/27] mm: Introduce VM_SHSTK for shadow stack memory
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

VM_SHSTK indicates a shadow stack memory area.
The shadow stack is implemented only for the 64-bit kernel.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 fs/proc/task_mmu.c | 3 +++
 include/linux/mm.h | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 731642e0f5a0..09521579cc8e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -684,6 +684,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+		[ilog2(VM_SHSTK)]	= "ss",
+#endif
 	};
 	size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..bc58585014c9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -298,11 +298,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -340,6 +342,12 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_MPX		VM_NONE
 #endif
 
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+# define VM_SHSTK	VM_HIGH_ARCH_5
+#else
+# define VM_SHSTK	VM_NONE
+#endif
+
 #ifndef VM_GROWSUP
 # define VM_GROWSUP	VM_NONE
 #endif
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 07/27] x86/cet/shstk: Add Kconfig option for user-mode shadow stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Introduce Kconfig option X86_INTEL_SHADOW_STACK_USER.

An application has shadow stack protection when all the following are
true:

  (1) The kernel has X86_INTEL_SHADOW_STACK_USER enabled,
  (2) The running processor supports the shadow stack,
  (3) The application is built with shadow stack enabled tools & libs
      and, and at runtime, all dependent shared libs can support
      shadow stack.

If this kernel config option is enabled, but (2) or (3) above is not
true, the application runs without the shadow stack protection.
Existing legacy applications will continue to work without the shadow
stack protection.

The user-mode shadow stack protection is only implemented for the
64-bit kernel.  Thirty-two bit applications are supported under the
compatibility mode.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/Kconfig  | 25 +++++++++++++++++++++++++
 arch/x86/Makefile |  7 +++++++
 2 files changed, 32 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 222855cc0158..eaf86ef13348 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1934,6 +1934,31 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 
 	  If unsure, say y.
 
+config X86_INTEL_CET
+	def_bool n
+
+config ARCH_HAS_SHSTK
+	def_bool n
+
+config X86_INTEL_SHADOW_STACK_USER
+	prompt "Intel Shadow Stack for user-mode"
+	def_bool n
+	depends on CPU_SUP_INTEL && X86_64
+	select ARCH_USES_HIGH_VMA_FLAGS
+	select X86_INTEL_CET
+	select ARCH_HAS_SHSTK
+	---help---
+	  Shadow stack provides hardware protection against program stack
+	  corruption.  Only when all the following are true will an application
+	  have the shadow stack protection: the kernel supports it (i.e. this
+	  feature is enabled), the application is compiled and linked with
+	  shadow stack enabled, and the processor supports this feature.
+	  When the kernel has this configuration enabled, existing non shadow
+	  stack applications will continue to work, but without shadow stack
+	  protection.
+
+	  If unsure, say y.
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 56e748a7679f..0b2e9df48907 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -148,6 +148,13 @@ ifdef CONFIG_X86_X32
 endif
 export CONFIG_X86_X32_ABI
 
+# Check assembler shadow stack suppot
+ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+  ifeq ($(call as-instr, saveprevssp, y),)
+      $(error CONFIG_X86_INTEL_SHADOW_STACK_USER not supported by the assembler)
+  endif
+endif
+
 #
 # If the function graph tracer is used with mcount instead of fentry,
 # '-maccumulate-outgoing-args' is needed to prevent a GCC bug
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 06/27] x86/cet: Add control protection exception handler
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

A control protection exception is triggered when a control flow transfer
attempt violated shadow stack or indirect branch tracking constraints.
For example, the return address for a RET instruction differs from the
safe copy on the shadow stack; or a JMP instruction arrives at a non-
ENDBR instruction.

The control protection exception handler works in a similar way as the
general protection fault handler.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/entry/entry_64.S          |  2 +-
 arch/x86/include/asm/traps.h       |  3 ++
 arch/x86/kernel/idt.c              |  4 +++
 arch/x86/kernel/signal_compat.c    |  2 +-
 arch/x86/kernel/traps.c            | 57 ++++++++++++++++++++++++++++++
 include/uapi/asm-generic/siginfo.h |  3 +-
 6 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index be9ca198c581..c45485dfb8a1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1035,7 +1035,7 @@ idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
 idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
 idtentry alignment_check		do_alignment_check		has_error_code=1
 idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
-
+idtentry control_protection		do_control_protection		has_error_code=1
 
 	/*
 	 * Reload gs selector with exception handling
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b25e633033c3..8691261faeb0 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -26,6 +26,7 @@ asmlinkage void invalid_TSS(void);
 asmlinkage void segment_not_present(void);
 asmlinkage void stack_segment(void);
 asmlinkage void general_protection(void);
+asmlinkage void control_protection(void);
 asmlinkage void page_fault(void);
 asmlinkage void async_page_fault(void);
 asmlinkage void spurious_interrupt_bug(void);
@@ -81,6 +82,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
+dotraplinkage void do_control_protection(struct pt_regs *regs, long error_code);
 dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
@@ -151,6 +153,7 @@ enum {
 	X86_TRAP_AC,		/* 17, Alignment Check */
 	X86_TRAP_MC,		/* 18, Machine Check */
 	X86_TRAP_XF,		/* 19, SIMD Floating-Point Exception */
+	X86_TRAP_CP = 21,	/* 21 Control Protection Fault */
 	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
 };
 
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 87ef69a72c52..8ed406f469e7 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -102,6 +102,10 @@ static const __initconst struct idt_data def_idts[] = {
 #elif defined(CONFIG_X86_32)
 	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_32),
 #endif
+
+#ifdef CONFIG_X86_64
+	INTG(X86_TRAP_CP,		control_protection),
+#endif
 };
 
 /*
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 9ccbf0576cd0..c572a3de1037 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void)
 	 */
 	BUILD_BUG_ON(NSIGILL  != 11);
 	BUILD_BUG_ON(NSIGFPE  != 15);
-	BUILD_BUG_ON(NSIGSEGV != 7);
+	BUILD_BUG_ON(NSIGSEGV != 8);
 	BUILD_BUG_ON(NSIGBUS  != 5);
 	BUILD_BUG_ON(NSIGTRAP != 5);
 	BUILD_BUG_ON(NSIGCHLD != 6);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bb0f8447112..b7e8b340e08e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -566,6 +566,63 @@ do_general_protection(struct pt_regs *regs, long error_code)
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
+static const char *control_protection_err[] = {
+	"unknown",
+	"near-ret",
+	"far-ret/iret",
+	"endbranch",
+	"rstorssp",
+	"setssbsy",
+};
+
+/*
+ * When a control protection exception occurs, send a signal
+ * to the responsible application.  Currently, control
+ * protection is only enabled for the user mode.  This
+ * exception should not come from the kernel mode.
+ */
+dotraplinkage void
+do_control_protection(struct pt_regs *regs, long error_code)
+{
+	struct task_struct *tsk;
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+	if (notify_die(DIE_TRAP, "control protection fault", regs,
+		       error_code, X86_TRAP_CP, SIGSEGV) == NOTIFY_STOP)
+		return;
+	cond_local_irq_enable(regs);
+
+	if (!user_mode(regs))
+		die("kernel control protection fault", regs, error_code);
+
+	if (!static_cpu_has(X86_FEATURE_SHSTK) &&
+	    !static_cpu_has(X86_FEATURE_IBT))
+		WARN_ONCE(1, "CET is disabled but got control protection fault\n");
+
+	tsk = current;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_nr = X86_TRAP_CP;
+
+	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+	    printk_ratelimit()) {
+		unsigned int max_err;
+
+		max_err = ARRAY_SIZE(control_protection_err) - 1;
+		if ((error_code < 0) || (error_code > max_err))
+			error_code = 0;
+		pr_info("%s[%d] control protection ip:%lx sp:%lx error:%lx(%s)",
+			tsk->comm, task_pid_nr(tsk),
+			regs->ip, regs->sp, error_code,
+			control_protection_err[error_code]);
+		print_vma_addr(KERN_CONT " in ", regs->ip);
+		pr_cont("\n");
+	}
+
+	force_sig_fault(SIGSEGV, SEGV_CPERR,
+			(void __user *)uprobe_get_trap_addr(regs));
+}
+NOKPROBE_SYMBOL(do_control_protection);
+
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index cb3d6c267181..693071dbe641 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -229,7 +229,8 @@ typedef struct siginfo {
 #define SEGV_ACCADI	5	/* ADI not enabled for mapped object */
 #define SEGV_ADIDERR	6	/* Disrupting MCD error */
 #define SEGV_ADIPERR	7	/* Precise MCD exception */
-#define NSIGSEGV	7
+#define SEGV_CPERR	8
+#define NSIGSEGV	8
 
 /*
  * SIGBUS si_codes
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 05/27] x86/fpu/xstate: Introduce CET MSR system states
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Intel Control-flow Enforcement Technology (CET) introduces the
following MSRs.

    MSR_IA32_U_CET (user-mode CET settings),
    MSR_IA32_PL3_SSP (user-mode shadow stack),
    MSR_IA32_PL0_SSP (kernel-mode shadow stack),
    MSR_IA32_PL1_SSP (Privilege Level 1 shadow stack),
    MSR_IA32_PL2_SSP (Privilege Level 2 shadow stack).

Introduce them into XSAVES system states.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/fpu/types.h            | 22 ++++++++++++++++++
 arch/x86/include/asm/fpu/xstate.h           |  4 +++-
 arch/x86/include/asm/msr-index.h            | 18 +++++++++++++++
 arch/x86/include/uapi/asm/processor-flags.h |  2 ++
 arch/x86/kernel/fpu/xstate.c                | 25 +++++++++++++++++++--
 5 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f098f6cab94b..d7ef4d9c7ad5 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -114,6 +114,9 @@ enum xfeature {
 	XFEATURE_Hi16_ZMM,
 	XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
 	XFEATURE_PKRU,
+	XFEATURE_RESERVED,
+	XFEATURE_CET_USER,
+	XFEATURE_CET_KERNEL,
 
 	XFEATURE_MAX,
 };
@@ -128,6 +131,8 @@ enum xfeature {
 #define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
 #define XFEATURE_MASK_PT		(1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
 #define XFEATURE_MASK_PKRU		(1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_CET_USER		(1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL	(1 << XFEATURE_CET_KERNEL)
 
 #define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK \
@@ -229,6 +234,23 @@ struct pkru_state {
 	u32				pad;
 } __packed;
 
+/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+	u64 user_cet;			/* user control-flow settings */
+	u64 user_ssp;			/* user shadow stack pointer */
+};
+
+/*
+ * State component 12 is Control-flow Enforcement kernel states
+ */
+struct cet_kernel_state {
+	u64 kernel_ssp;			/* kernel shadow stack */
+	u64 pl1_ssp;			/* privilege level 1 shadow stack */
+	u64 pl2_ssp;			/* privilege level 2 shadow stack */
+};
+
 struct xstate_header {
 	u64				xfeatures;
 	u64				xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 970bbd303cfb..ebf5979b21e7 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -30,7 +30,9 @@
 				  XFEATURE_MASK_Hi16_ZMM | \
 				  XFEATURE_MASK_PKRU | \
 				  XFEATURE_MASK_BNDREGS | \
-				  XFEATURE_MASK_BNDCSR)
+				  XFEATURE_MASK_BNDCSR | \
+				  XFEATURE_MASK_CET_USER | \
+				  XFEATURE_MASK_CET_KERNEL)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6b4fc2788078..e06c1e3fde2f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -848,4 +848,22 @@
 #define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET		0x6a0 /* user mode cet setting */
+#define MSR_IA32_S_CET		0x6a2 /* kernel mode cet setting */
+#define MSR_IA32_PL0_SSP	0x6a4 /* kernel shstk pointer */
+#define MSR_IA32_PL1_SSP	0x6a5 /* ring-1 shstk pointer */
+#define MSR_IA32_PL2_SSP	0x6a6 /* ring-2 shstk pointer */
+#define MSR_IA32_PL3_SSP	0x6a7 /* user shstk pointer */
+#define MSR_IA32_INT_SSP_TAB	0x6a8 /* exception shstk table */
+
+/* MSR_IA32_U_CET and MSR_IA32_S_CET bits */
+#define MSR_IA32_CET_SHSTK_EN		0x0000000000000001ULL
+#define MSR_IA32_CET_WRSS_EN		0x0000000000000002ULL
+#define MSR_IA32_CET_ENDBR_EN		0x0000000000000004ULL
+#define MSR_IA32_CET_LEG_IW_EN		0x0000000000000008ULL
+#define MSR_IA32_CET_NO_TRACK_EN	0x0000000000000010ULL
+#define MSR_IA32_CET_WAIT_ENDBR	0x00000000000000800UL
+#define MSR_IA32_CET_BITMAP_MASK	0xfffffffffffff000ULL
+
 #endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..a8df907e8017 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
 #define X86_CR4_SMAP		_BITUL(X86_CR4_SMAP_BIT)
 #define X86_CR4_PKE_BIT		22 /* enable Protection Keys support */
 #define X86_CR4_PKE		_BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT		23 /* enable Control-flow Enforcement */
+#define X86_CR4_CET		_BITUL(X86_CR4_CET_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9fbe73c546df..63374bb19066 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -38,6 +38,9 @@ static const char *xfeature_names[] =
 	"Processor Trace (unused)"	,
 	"Protection Keys User registers",
 	"unknown xstate feature"	,
+	"Control-flow User registers"	,
+	"Control-flow Kernel registers"	,
+	"unknown xstate feature"	,
 };
 
 static short xsave_cpuid_features[] __initdata = {
@@ -51,6 +54,9 @@ static short xsave_cpuid_features[] __initdata = {
 	X86_FEATURE_AVX512F,
 	X86_FEATURE_INTEL_PT,
 	X86_FEATURE_PKU,
+	-1,		   /* Unused */
+	X86_FEATURE_SHSTK, /* XFEATURE_CET_USER */
+	X86_FEATURE_SHSTK, /* XFEATURE_CET_KERNEL */
 };
 
 /*
@@ -313,6 +319,8 @@ static void __init print_xstate_features(void)
 	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
 	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
 	print_xstate_feature(XFEATURE_MASK_PKRU);
+	print_xstate_feature(XFEATURE_MASK_CET_USER);
+	print_xstate_feature(XFEATURE_MASK_CET_KERNEL);
 }
 
 /*
@@ -559,6 +567,8 @@ static void check_xstate_against_struct(int nr)
 	XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
 	XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
 	XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);
+	XCHECK_SZ(sz, nr, XFEATURE_CET_USER,   struct cet_user_state);
+	XCHECK_SZ(sz, nr, XFEATURE_CET_KERNEL, struct cet_kernel_state);
 
 	/*
 	 * Make *SURE* to add any feature numbers in below if
@@ -770,8 +780,19 @@ void __init fpu__init_system_xstate(void)
 	 * Clear XSAVE features that are disabled in the normal CPUID.
 	 */
 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
-		if (!boot_cpu_has(xsave_cpuid_features[i]))
-			xfeatures_mask_all &= ~BIT_ULL(i);
+		if (xsave_cpuid_features[i] == X86_FEATURE_SHSTK) {
+			/*
+			 * X86_FEATURE_SHSTK and X86_FEATURE_IBT share
+			 * same states, but can be enabled separately.
+			 */
+			if (!boot_cpu_has(X86_FEATURE_SHSTK) &&
+			    !boot_cpu_has(X86_FEATURE_IBT))
+				xfeatures_mask_all &= ~BIT_ULL(i);
+		} else {
+			if ((xsave_cpuid_features[i] == -1) ||
+			    !boot_cpu_has(xsave_cpuid_features[i]))
+				xfeatures_mask_all &= ~BIT_ULL(i);
+		}
 	}
 
 	xfeatures_mask_all &= SUPPORTED_XFEATURES_MASK;
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 04/27] x86/fpu/xstate: Introduce XSAVES system states
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Control-flow Enforcement (CET) MSR contents are XSAVES system states.
To support CET, introduce XSAVES system states first.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/fpu/internal.h | 23 +++++++-
 arch/x86/include/asm/fpu/xstate.h   |  4 +-
 arch/x86/kernel/fpu/core.c          | 26 +++++++--
 arch/x86/kernel/fpu/init.c          | 10 ----
 arch/x86/kernel/fpu/signal.c        |  4 +-
 arch/x86/kernel/fpu/xstate.c        | 90 +++++++++++++++++++----------
 arch/x86/kernel/process.c           |  2 +-
 arch/x86/kernel/signal.c            |  2 +-
 8 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 652be3853b40..2ca5c36a77d5 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
 extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
-extern void fpu__clear(struct fpu *fpu);
+extern void fpu__clear_user_states(struct fpu *fpu);
+extern void fpu__clear_all(struct fpu *fpu);
 extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
 extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
 
@@ -44,7 +45,6 @@ extern void fpu__init_cpu_xstate(void);
 extern void fpu__init_system(struct cpuinfo_x86 *c);
 extern void fpu__init_check_bugs(void);
 extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);
 
 /*
  * Debugging facility:
@@ -92,7 +92,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
 	 * XRSTORS requires these bits set in xcomp_bv, or it will
 	 * trigger #GP:
 	 */
-	xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_user;
+	xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
 }
 
 static inline void fpstate_init_fxstate(struct fxregs_state *fx)
@@ -615,6 +615,23 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 	__write_pkru(pkru_val);
 }
 
+/*
+ * On context switches, XSAVE states are not restored until returning
+ * to user-mode.  FPU registers need to be restored before any changes,
+ * and protected by fpregs_lock()/fpregs_unlock().
+ */
+static inline void modify_fpu_regs_begin(void)
+{
+	fpregs_lock();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		__fpregs_load_activate();
+}
+
+static inline void modify_fpu_regs_end(void)
+{
+	fpregs_unlock();
+}
+
 /*
  * MXCSR and XCR definitions:
  */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 9ded9532257d..970bbd303cfb 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -21,9 +21,6 @@
 #define XSAVE_YMM_SIZE	    256
 #define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
-/* Supervisor features */
-#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
-
 /* All currently supported features */
 #define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
 				  XFEATURE_MASK_SSE | \
@@ -42,6 +39,7 @@
 #endif
 
 extern u64 xfeatures_mask_user;
+extern u64 xfeatures_mask_all;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 12c70840980e..31d3cd70b5df 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -294,12 +294,16 @@ void fpu__drop(struct fpu *fpu)
  * Clear FPU registers by setting them up from
  * the init fpstate:
  */
-static inline void copy_init_fpstate_to_fpregs(void)
+static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
 {
 	fpregs_lock();
 
+	/*
+	 * Only XSAVES user states are copied.
+	 * System states are preserved.
+	 */
 	if (use_xsave())
-		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+		copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
 	else if (static_cpu_has(X86_FEATURE_FXSR))
 		copy_kernel_to_fxregs(&init_fpstate.fxsave);
 	else
@@ -318,7 +322,21 @@ static inline void copy_init_fpstate_to_fpregs(void)
  * Called by sys_execve(), by the signal handler code and by various
  * error paths.
  */
-void fpu__clear(struct fpu *fpu)
+void fpu__clear_user_states(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+	fpu__drop(fpu);
+
+	/*
+	 * Make sure fpstate is cleared and initialized.
+	 */
+	fpu__initialize(fpu);
+	if (static_cpu_has(X86_FEATURE_FPU))
+		copy_init_fpstate_to_fpregs(xfeatures_mask_user);
+}
+
+void fpu__clear_all(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
 
@@ -329,7 +347,7 @@ void fpu__clear(struct fpu *fpu)
 	 */
 	fpu__initialize(fpu);
 	if (static_cpu_has(X86_FEATURE_FPU))
-		copy_init_fpstate_to_fpregs();
+		copy_init_fpstate_to_fpregs(xfeatures_mask_all);
 }
 
 /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 73fed33e5bda..0a0ba584a533 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -217,16 +217,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
 	fpu_user_xstate_size = fpu_kernel_xstate_size;
 }
 
-/*
- * Find supported xfeatures based on cpu features and command-line input.
- * This must be called after fpu__init_parse_early_param() is called and
- * xfeatures_mask is enumerated.
- */
-u64 __init fpu__get_supported_xfeatures_mask(void)
-{
-	return SUPPORTED_XFEATURES_MASK;
-}
-
 /* Legacy code to initialize eager fpu mode. */
 static void __init fpu__init_system_ctx_switch(void)
 {
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8a63f07cf400..4ecf1764a971 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -285,7 +285,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
 
 	if (!buf) {
-		fpu__clear(fpu);
+		fpu__clear_user_states(fpu);
 		return 0;
 	}
 
@@ -407,7 +407,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 
 err_out:
 	if (ret)
-		fpu__clear(fpu);
+		fpu__clear_user_states(fpu);
 	return ret;
 }
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d560e8861a3c..9fbe73c546df 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -61,9 +61,19 @@ static short xsave_cpuid_features[] __initdata = {
  */
 u64 xfeatures_mask_user __read_mostly;
 
+/*
+ * Supported XSAVES system states.
+ */
+static u64 xfeatures_mask_system __read_mostly;
+
+/*
+ * Combined XSAVES system and user states.
+ */
+u64 xfeatures_mask_all __read_mostly;
+
 static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
 static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_all)*8];
 
 /*
  * The XSAVE area of kernel can be in standard or compacted format;
@@ -79,7 +89,7 @@ unsigned int fpu_user_xstate_size;
  */
 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 {
-	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
+	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
 
 	if (unlikely(feature_name)) {
 		long xfeature_idx, max_idx;
@@ -158,7 +168,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	 * None of the feature bits are in init state. So nothing else
 	 * to do for us, as the memory layout is up to date.
 	 */
-	if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
+	if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
 		return;
 
 	/*
@@ -213,28 +223,27 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
  */
 void fpu__init_cpu_xstate(void)
 {
-	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
+	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
 		return;
 	/*
 	 * XCR_XFEATURE_ENABLED_MASK sets the features that are managed
 	 * by XSAVE{C, OPT} and XRSTOR.  Only XSAVE user states can be
 	 * set here.
 	 */
-
-	xfeatures_mask_user &= ~XFEATURE_MASK_SUPERVISOR;
-
 	cr4_set_bits(X86_CR4_OSXSAVE);
 	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+
+	/*
+	 * MSR_IA32_XSS controls which system (not user) states are
+	 * to be managed by XSAVES.
+	 */
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		wrmsrl(MSR_IA32_XSS, xfeatures_mask_system);
 }
 
-/*
- * Note that in the future we will likely need a pair of
- * functions here: one for user xstates and the other for
- * system xstates.  For now, they are the same.
- */
 static int xfeature_enabled(enum xfeature xfeature)
 {
-	return !!(xfeatures_mask_user & BIT_ULL(xfeature));
+	return !!(xfeatures_mask_all & BIT_ULL(xfeature));
 }
 
 /*
@@ -340,7 +349,7 @@ static int xfeature_is_aligned(int xfeature_nr)
  */
 static void __init setup_xstate_comp(void)
 {
-	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
+	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_all)*8];
 	int i;
 
 	/*
@@ -413,7 +422,7 @@ static void __init setup_init_fpu_buf(void)
 	print_xstate_features();
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		init_fpstate.xsave.header.xcomp_bv = BIT_ULL(63) | xfeatures_mask_user;
+		init_fpstate.xsave.header.xcomp_bv = BIT_ULL(63) | xfeatures_mask_all;
 
 	/*
 	 * Init all the features state with header.xfeatures being 0x0
@@ -436,7 +445,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
 	 * format. Checking a system state's uncompacted offset is
 	 * an error.
 	 */
-	if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) {
+	if (~xfeatures_mask_user & BIT_ULL(xfeature_nr)) {
 		WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
 		return -1;
 	}
@@ -608,15 +617,12 @@ static void do_extra_xstate_size_checks(void)
 
 
 /*
- * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
+ * Get total size of enabled xstates in XCR0 | IA32_XSS.
  *
  * Note the SDM's wording here.  "sub-function 0" only enumerates
  * the size of the *user* states.  If we use it to size a buffer
  * that we use 'XSAVES' on, we could potentially overflow the
  * buffer because 'XSAVES' saves system states too.
- *
- * Note that we do not currently set any bits on IA32_XSS so
- * 'XCR0 | IA32_XSS == XCR0' for now.
  */
 static unsigned int __init get_xsaves_size(void)
 {
@@ -698,6 +704,7 @@ static int __init init_xstate_size(void)
  */
 static void fpu__init_disable_system_xstate(void)
 {
+	xfeatures_mask_all = 0;
 	xfeatures_mask_user = 0;
 	cr4_clear_bits(X86_CR4_OSXSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
@@ -733,10 +740,23 @@ void __init fpu__init_system_xstate(void)
 		return;
 	}
 
+	/*
+	 * Find user states supported by the processor.
+	 * Only these bits can be set in XCR0.
+	 */
 	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 	xfeatures_mask_user = eax + ((u64)edx << 32);
 
-	if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+	/*
+	 * Find system states supported by the processor.
+	 * Only these bits can be set in IA32_XSS MSR.
+	 */
+	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+	xfeatures_mask_system = ecx + ((u64)edx << 32);
+
+	xfeatures_mask_all = xfeatures_mask_user | xfeatures_mask_system;
+
+	if ((xfeatures_mask_all & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
 		/*
 		 * This indicates that something really unexpected happened
 		 * with the enumeration.  Disable XSAVE and try to continue
@@ -751,10 +771,12 @@ void __init fpu__init_system_xstate(void)
 	 */
 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
 		if (!boot_cpu_has(xsave_cpuid_features[i]))
-			xfeatures_mask_user &= ~BIT_ULL(i);
+			xfeatures_mask_all &= ~BIT_ULL(i);
 	}
 
-	xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
+	xfeatures_mask_all &= SUPPORTED_XFEATURES_MASK;
+	xfeatures_mask_user &= xfeatures_mask_all;
+	xfeatures_mask_system &= xfeatures_mask_all;
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
@@ -766,7 +788,7 @@ void __init fpu__init_system_xstate(void)
 	 * Update info used for ptrace frames; use standard-format size and no
 	 * system xstates:
 	 */
-	update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user & ~XFEATURE_MASK_SUPERVISOR);
+	update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user);
 
 	fpu__init_prepare_fx_sw_frame();
 	setup_init_fpu_buf();
@@ -774,7 +796,7 @@ void __init fpu__init_system_xstate(void)
 	print_xstate_offset_size();
 
 	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
-		xfeatures_mask_user,
+		xfeatures_mask_all,
 		fpu_kernel_xstate_size,
 		boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
 	return;
@@ -794,6 +816,12 @@ void fpu__resume_cpu(void)
 	 */
 	if (boot_cpu_has(X86_FEATURE_XSAVE))
 		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+
+	/*
+	 * Restore IA32_XSS
+	 */
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		wrmsrl(MSR_IA32_XSS, xfeatures_mask_system);
 }
 
 /*
@@ -839,9 +867,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 	/*
 	 * We should not ever be requesting features that we
 	 * have not enabled.  Remember that pcntxt_mask is
-	 * what we write to the XCR0 register.
+	 * what we write to the XCR0 | IA32_XSS registers.
 	 */
-	WARN_ONCE(!(xfeatures_mask_user & BIT_ULL(xfeature_nr)),
+	WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
 		  "get of unsupported state");
 	/*
 	 * This assumes the last 'xsave*' instruction to
@@ -989,7 +1017,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 	 */
 	memset(&header, 0, sizeof(header));
 	header.xfeatures = xsave->header.xfeatures;
-	header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+	header.xfeatures &= xfeatures_mask_user;
 
 	/*
 	 * Copy xregs_state->header:
@@ -1073,7 +1101,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 	 */
 	memset(&header, 0, sizeof(header));
 	header.xfeatures = xsave->header.xfeatures;
-	header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+	header.xfeatures &= xfeatures_mask_user;
 
 	/*
 	 * Copy xregs_state->header:
@@ -1166,7 +1194,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 	 * The state that came in from userspace was user-state only.
 	 * Mask all the user states out of 'xfeatures':
 	 */
-	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+	xsave->header.xfeatures &= xfeatures_mask_system;
 
 	/*
 	 * Add back in the features that came in from userspace:
@@ -1222,7 +1250,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 	 * The state that came in from userspace was user-state only.
 	 * Mask all the user states out of 'xfeatures':
 	 */
-	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+	xsave->header.xfeatures &= xfeatures_mask_system;
 
 	/*
 	 * Add back in the features that came in from userspace:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 75fea0d48c0e..d360bf4d696b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -139,7 +139,7 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	fpu__clear(&tsk->thread.fpu);
+	fpu__clear_all(&tsk->thread.fpu);
 }
 
 void disable_TSC(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 8eb7193e158d..ce9421ec285f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -763,7 +763,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		/*
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
-		fpu__clear(fpu);
+		fpu__clear_user_states(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 03/27] x86/fpu/xstate: Change names to separate XSAVES system and user states
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Control-flow Enforcement (CET) MSR contents are XSAVES system states.
To support CET, introduce XSAVES system states first.

XSAVES is a "supervisor" instruction and, comparing to XSAVE, saves
additional "supervisor" states that can be modified only from CPL 0.
However, these states are per-task and not kernel's own.  Rename
"supervisor" states to "system" states to clearly separate them from
"user" states.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/fpu/internal.h |  4 +-
 arch/x86/include/asm/fpu/xstate.h   | 20 +++----
 arch/x86/kernel/fpu/init.c          |  2 +-
 arch/x86/kernel/fpu/signal.c        | 10 ++--
 arch/x86/kernel/fpu/xstate.c        | 86 ++++++++++++++---------------
 5 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 4c95c365058a..652be3853b40 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -92,7 +92,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
 	 * XRSTORS requires these bits set in xcomp_bv, or it will
 	 * trigger #GP:
 	 */
-	xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
+	xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_user;
 }
 
 static inline void fpstate_init_fxstate(struct fxregs_state *fx)
@@ -225,7 +225,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 
 /*
  * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
+ * format and system states in addition to modified optimization in
  * XSAVEOPT.
  *
  * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index c6136d79f8c0..9ded9532257d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -25,15 +25,15 @@
 #define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
 
 /* All currently supported features */
-#define XCNTXT_MASK		(XFEATURE_MASK_FP | \
-				 XFEATURE_MASK_SSE | \
-				 XFEATURE_MASK_YMM | \
-				 XFEATURE_MASK_OPMASK | \
-				 XFEATURE_MASK_ZMM_Hi256 | \
-				 XFEATURE_MASK_Hi16_ZMM	 | \
-				 XFEATURE_MASK_PKRU | \
-				 XFEATURE_MASK_BNDREGS | \
-				 XFEATURE_MASK_BNDCSR)
+#define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
+				  XFEATURE_MASK_SSE | \
+				  XFEATURE_MASK_YMM | \
+				  XFEATURE_MASK_OPMASK | \
+				  XFEATURE_MASK_ZMM_Hi256 | \
+				  XFEATURE_MASK_Hi16_ZMM | \
+				  XFEATURE_MASK_PKRU | \
+				  XFEATURE_MASK_BNDREGS | \
+				  XFEATURE_MASK_BNDCSR)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
@@ -41,7 +41,7 @@
 #define REX_PREFIX
 #endif
 
-extern u64 xfeatures_mask;
+extern u64 xfeatures_mask_user;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6ce7e0a23268..73fed33e5bda 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -224,7 +224,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  */
 u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-	return XCNTXT_MASK;
+	return SUPPORTED_XFEATURES_MASK;
 }
 
 /* Legacy code to initialize eager fpu mode. */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 0071b794ed19..8a63f07cf400 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -254,11 +254,11 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
 {
 	if (use_xsave()) {
 		if (fx_only) {
-			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+			u64 init_bv = xfeatures_mask_user & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 			return copy_user_to_fxregs(buf);
 		} else {
-			u64 init_bv = xfeatures_mask & ~xbv;
+			u64 init_bv = xfeatures_mask_user & ~xbv;
 			if (unlikely(init_bv))
 				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 			return copy_user_to_xregs(buf, xbv);
@@ -357,7 +357,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 
 
 	if (use_xsave() && !fx_only) {
-		u64 init_bv = xfeatures_mask & ~xfeatures;
+		u64 init_bv = xfeatures_mask_user & ~xfeatures;
 
 		if (using_compacted_format()) {
 			ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
@@ -388,7 +388,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 
 		fpregs_lock();
 		if (use_xsave()) {
-			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+			u64 init_bv = xfeatures_mask_user & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 		}
 
@@ -462,7 +462,7 @@ void fpu__init_prepare_fx_sw_frame(void)
 
 	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
 	fx_sw_reserved.extended_size = size;
-	fx_sw_reserved.xfeatures = xfeatures_mask;
+	fx_sw_reserved.xfeatures = xfeatures_mask_user;
 	fx_sw_reserved.xstate_size = fpu_user_xstate_size;
 
 	if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e5cb67d67c03..d560e8861a3c 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -54,13 +54,16 @@ static short xsave_cpuid_features[] __initdata = {
 };
 
 /*
- * Mask of xstate features supported by the CPU and the kernel:
+ * XSAVES system states can only be modified from CPL 0 and saved by
+ * XSAVES.  The rest are user states.  The following is a mask of
+ * supported user state features derived from boot_cpu_has() and
+ * SUPPORTED_XFEATURES_MASK.
  */
-u64 xfeatures_mask __read_mostly;
+u64 xfeatures_mask_user __read_mostly;
 
 static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
 static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
 
 /*
  * The XSAVE area of kernel can be in standard or compacted format;
@@ -76,7 +79,7 @@ unsigned int fpu_user_xstate_size;
  */
 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 {
-	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
 
 	if (unlikely(feature_name)) {
 		long xfeature_idx, max_idx;
@@ -107,15 +110,12 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
 }
 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
 
-static int xfeature_is_supervisor(int xfeature_nr)
+static int xfeature_is_system(int xfeature_nr)
 {
 	/*
-	 * We currently do not support supervisor states, but if
-	 * we did, we could find out like this.
-	 *
 	 * SDM says: If state component 'i' is a user state component,
-	 * ECX[0] return 0; if state component i is a supervisor
-	 * state component, ECX[0] returns 1.
+	 * ECX[0] is 0; if state component i is a system state component,
+	 * ECX[0] is 1.
 	 */
 	u32 eax, ebx, ecx, edx;
 
@@ -125,7 +125,7 @@ static int xfeature_is_supervisor(int xfeature_nr)
 
 static int xfeature_is_user(int xfeature_nr)
 {
-	return !xfeature_is_supervisor(xfeature_nr);
+	return !xfeature_is_system(xfeature_nr);
 }
 
 /*
@@ -158,7 +158,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	 * None of the feature bits are in init state. So nothing else
 	 * to do for us, as the memory layout is up to date.
 	 */
-	if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+	if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
 		return;
 
 	/*
@@ -185,7 +185,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 	 * in a special way already:
 	 */
 	feature_bit = 0x2;
-	xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+	xfeatures = (xfeatures_mask_user & ~xfeatures) >> 2;
 
 	/*
 	 * Update all the remaining memory layouts according to their
@@ -213,20 +213,18 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
  */
 void fpu__init_cpu_xstate(void)
 {
-	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
+	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
 		return;
 	/*
-	 * Make it clear that XSAVES supervisor states are not yet
-	 * implemented should anyone expect it to work by changing
-	 * bits in XFEATURE_MASK_* macros and XCR0.
+	 * XCR_XFEATURE_ENABLED_MASK sets the features that are managed
+	 * by XSAVE{C, OPT} and XRSTOR.  Only XSAVE user states can be
+	 * set here.
 	 */
-	WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
-		"x86/fpu: XSAVES supervisor states are not yet implemented.\n");
 
-	xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+	xfeatures_mask_user &= ~XFEATURE_MASK_SUPERVISOR;
 
 	cr4_set_bits(X86_CR4_OSXSAVE);
-	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
 }
 
 /*
@@ -236,7 +234,7 @@ void fpu__init_cpu_xstate(void)
  */
 static int xfeature_enabled(enum xfeature xfeature)
 {
-	return !!(xfeatures_mask & (1UL << xfeature));
+	return !!(xfeatures_mask_user & BIT_ULL(xfeature));
 }
 
 /*
@@ -266,7 +264,7 @@ static void __init setup_xstate_features(void)
 		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
 
 		/*
-		 * If an xfeature is supervisor state, the offset
+		 * If an xfeature is a system state, the offset
 		 * in EBX is invalid. We leave it to -1.
 		 */
 		if (xfeature_is_user(i))
@@ -342,7 +340,7 @@ static int xfeature_is_aligned(int xfeature_nr)
  */
 static void __init setup_xstate_comp(void)
 {
-	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
 	int i;
 
 	/*
@@ -415,7 +413,7 @@ static void __init setup_init_fpu_buf(void)
 	print_xstate_features();
 
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+		init_fpstate.xsave.header.xcomp_bv = BIT_ULL(63) | xfeatures_mask_user;
 
 	/*
 	 * Init all the features state with header.xfeatures being 0x0
@@ -434,8 +432,8 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
 	u32 eax, ebx, ecx, edx;
 
 	/*
-	 * Only XSAVES supports supervisor states and it uses compacted
-	 * format. Checking a supervisor state's uncompacted offset is
+	 * Only XSAVES supports system states and it uses compacted
+	 * format. Checking a system state's uncompacted offset is
 	 * an error.
 	 */
 	if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) {
@@ -459,7 +457,7 @@ static int xfeature_size(int xfeature_nr)
 
 /*
  * 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
+ * 1. saving of system state
  * 2. using the compacted format
  *
  * Use this function when dealing with the compacted format so
@@ -474,8 +472,8 @@ int using_compacted_format(void)
 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 int validate_xstate_header(const struct xstate_header *hdr)
 {
-	/* No unknown or supervisor features may be set */
-	if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+	/* No unknown or system features may be set */
+	if (hdr->xfeatures & ~xfeatures_mask_user)
 		return -EINVAL;
 
 	/* Userspace must use the uncompacted format */
@@ -582,11 +580,11 @@ static void do_extra_xstate_size_checks(void)
 
 		check_xstate_against_struct(i);
 		/*
-		 * Supervisor state components can be managed only by
+		 * System state components can be managed only by
 		 * XSAVES, which is compacted-format only.
 		 */
 		if (!using_compacted_format())
-			XSTATE_WARN_ON(xfeature_is_supervisor(i));
+			XSTATE_WARN_ON(xfeature_is_system(i));
 
 		/* Align from the end of the previous feature */
 		if (xfeature_is_aligned(i))
@@ -610,7 +608,7 @@ static void do_extra_xstate_size_checks(void)
 
 
 /*
- * Get total size of enabled xstates in XCR0/xfeatures_mask.
+ * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
  *
  * Note the SDM's wording here.  "sub-function 0" only enumerates
  * the size of the *user* states.  If we use it to size a buffer
@@ -700,7 +698,7 @@ static int __init init_xstate_size(void)
  */
 static void fpu__init_disable_system_xstate(void)
 {
-	xfeatures_mask = 0;
+	xfeatures_mask_user = 0;
 	cr4_clear_bits(X86_CR4_OSXSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 }
@@ -736,15 +734,15 @@ void __init fpu__init_system_xstate(void)
 	}
 
 	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-	xfeatures_mask = eax + ((u64)edx << 32);
+	xfeatures_mask_user = eax + ((u64)edx << 32);
 
-	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+	if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
 		/*
 		 * This indicates that something really unexpected happened
 		 * with the enumeration.  Disable XSAVE and try to continue
 		 * booting without it.  This is too early to BUG().
 		 */
-		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask_user);
 		goto out_disable;
 	}
 
@@ -753,10 +751,10 @@ void __init fpu__init_system_xstate(void)
 	 */
 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
 		if (!boot_cpu_has(xsave_cpuid_features[i]))
-			xfeatures_mask &= ~BIT(i);
+			xfeatures_mask_user &= ~BIT_ULL(i);
 	}
 
-	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
+	xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
@@ -766,9 +764,9 @@ void __init fpu__init_system_xstate(void)
 
 	/*
 	 * Update info used for ptrace frames; use standard-format size and no
-	 * supervisor xstates:
+	 * system xstates:
 	 */
-	update_regset_xstate_info(fpu_user_xstate_size,	xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
+	update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user & ~XFEATURE_MASK_SUPERVISOR);
 
 	fpu__init_prepare_fx_sw_frame();
 	setup_init_fpu_buf();
@@ -776,7 +774,7 @@ void __init fpu__init_system_xstate(void)
 	print_xstate_offset_size();
 
 	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
-		xfeatures_mask,
+		xfeatures_mask_user,
 		fpu_kernel_xstate_size,
 		boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
 	return;
@@ -795,7 +793,7 @@ void fpu__resume_cpu(void)
 	 * Restore XCR0 on xsave capable CPUs:
 	 */
 	if (boot_cpu_has(X86_FEATURE_XSAVE))
-		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
 }
 
 /*
@@ -843,7 +841,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 	 * have not enabled.  Remember that pcntxt_mask is
 	 * what we write to the XCR0 register.
 	 */
-	WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
+	WARN_ONCE(!(xfeatures_mask_user & BIT_ULL(xfeature_nr)),
 		  "get of unsupported state");
 	/*
 	 * This assumes the last 'xsave*' instruction to
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 02/27] x86/cpufeatures: Add CET CPU feature flags for Control-flow Enforcement Technology (CET)
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Add CPU feature flags for Control-flow Enforcement Technology (CET).

CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect branch tracking

Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/cpu/cpuid-deps.c   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index e880f2408e29..122265ab46c1 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -334,6 +334,7 @@
 #define X86_FEATURE_OSPKE		(16*32+ 4) /* OS Protection Keys Enable */
 #define X86_FEATURE_WAITPKG		(16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
 #define X86_FEATURE_AVX512_VBMI2	(16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK		(16*32+ 7) /* Shadow Stack */
 #define X86_FEATURE_GFNI		(16*32+ 8) /* Galois Field New Instructions */
 #define X86_FEATURE_VAES		(16*32+ 9) /* Vector AES */
 #define X86_FEATURE_VPCLMULQDQ		(16*32+10) /* Carry-Less Multiplication Double Quadword */
@@ -358,6 +359,7 @@
 #define X86_FEATURE_MD_CLEAR		(18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT	(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
+#define X86_FEATURE_IBT			(18*32+20) /* Indirect Branch Tracking */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D		(18*32+28) /* Flush L1D cache */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b5353244749b..9bf35f081080 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -68,6 +68,8 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_CQM_MBM_TOTAL,	X86_FEATURE_CQM_LLC   },
 	{ X86_FEATURE_CQM_MBM_LOCAL,	X86_FEATURE_CQM_LLC   },
 	{ X86_FEATURE_AVX512_BF16,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_SHSTK,		X86_FEATURE_XSAVES    },
+	{ X86_FEATURE_IBT,		X86_FEATURE_XSAVES    },
 	{}
 };
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 01/27] Documentation/x86: Add CET description
From: Yu-cheng Yu @ 2019-08-13 20:51 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>

Explain how CET works and the no_cet_shstk/no_cet_ibt kernel
parameters.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 .../admin-guide/kernel-parameters.txt         |   6 +
 Documentation/x86/index.rst                   |   1 +
 Documentation/x86/intel_cet.rst               | 269 ++++++++++++++++++
 3 files changed, 276 insertions(+)
 create mode 100644 Documentation/x86/intel_cet.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 47d981a86e2f..31ba7b408407 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2935,6 +2935,12 @@
 			noexec=on: enable non-executable mappings (default)
 			noexec=off: disable non-executable mappings
 
+	no_cet_ibt	[X86-64] Disable indirect branch tracking for user-mode
+			applications
+
+	no_cet_shstk	[X86-64] Disable shadow stack support for user-mode
+			applications
+
 	nosmap		[X86,PPC]
 			Disable SMAP (Supervisor Mode Access Prevention)
 			even if it is supported by processor.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index af64c4bb4447..4be2d34ee631 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -19,6 +19,7 @@ x86-specific Documentation
    tlb
    mtrr
    pat
+   intel_cet
    intel_mpx
    intel-iommu
    intel_txt
diff --git a/Documentation/x86/intel_cet.rst b/Documentation/x86/intel_cet.rst
new file mode 100644
index 000000000000..a6a34a92987e
--- /dev/null
+++ b/Documentation/x86/intel_cet.rst
@@ -0,0 +1,269 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Control-flow Enforcement Technology (CET)
+=========================================
+
+[1] Overview
+============
+
+Control-flow Enforcement Technology (CET) provides protection against
+return/jump-oriented programming (ROP) attacks.  It can be setup to
+protect both the kernel and applications.  In the first phase,
+only the user-mode protection is implemented in 64-bit mode; 32-bit
+applications are supported in compatibility mode.
+
+CET introduces shadow stack (SHSTK) and indirect branch tracking
+(IBT).  SHSTK is a secondary stack allocated from memory and cannot
+be directly modified by applications.  When executing a CALL, the
+processor pushes a copy of the return address to SHSTK.  Upon
+function return, the processor pops the SHSTK copy and compares it
+to the one from the program stack.  If the two copies differ, the
+processor raises a control-protection exception.  IBT verifies all
+indirect CALL/JMP targets are intended as marked by the compiler
+with 'ENDBR' opcodes (see CET instructions below).
+
+There are two kernel configuration options:
+
+    INTEL_X86_SHADOW_STACK_USER, and
+    INTEL_X86_BRANCH_TRACKING_USER.
+
+To build a CET-enabled kernel, Binutils v2.31 and GCC v8.1 or later
+are required.  To build a CET-enabled application, GLIBC v2.28 or
+later is also required.
+
+There are two command-line options for disabling CET features:
+
+    no_cet_shstk - disables SHSTK, and
+    no_cet_ibt - disables IBT.
+
+At run time, /proc/cpuinfo shows the availability of SHSTK and IBT.
+
+[2] CET assembly instructions
+=============================
+
+RDSSP %r
+    Read the SHSTK pointer into %r.
+
+INCSSP %r
+    Unwind (increment) the SHSTK pointer (0 ~ 255) steps as indicated
+    in the operand register.  The GLIBC longjmp uses INCSSP to unwind
+    the SHSTK until that matches the program stack.  When it is
+    necessary to unwind beyond 255 steps, longjmp divides and repeats
+    the process.
+
+RSTORSSP (%r)
+    Switch to the SHSTK indicated in the 'restore token' pointed by
+    the operand register and replace the 'restore token' with a new
+    token to be saved (with SAVEPREVSSP) for the outgoing SHSTK.
+
+::
+
+                               Before RSTORSSP
+
+             Incoming SHSTK                   Current/Outgoing SHSTK
+
+        |----------------------|             |----------------------|
+ addr=x |                      |       ssp-> |                      |
+        |----------------------|             |----------------------|
+ (%r)-> | rstor_token=(x|Lg)   |    addr=y-8 |                      |
+        |----------------------|             |----------------------|
+
+                               After RSTORSSP
+
+        |----------------------|             |----------------------|
+        |                      |             |                      |
+        |----------------------|             |----------------------|
+  ssp-> | rstor_token=(y|Bz|Lg)|    addr=y-8 |                      |
+        |----------------------|             |----------------------|
+
+    note:
+        1. Only valid addresses and restore tokens can be on the
+           user-mode SHSTK.
+        2. A token is always of type u64 and must align to u64.
+        3. The incoming SHSTK pointer in a rstor_token must point to
+           immediately above the token.
+        4. 'Lg' is bit[0] of a rstor_token indicating a 64-bit SHSTK.
+        5. 'Bz' is bit[1] of a rstor_token indicating the token is to
+           be used only for the next SAVEPREVSSP and invalid for the
+           RSTORSSP.
+
+SAVEPREVSSP
+    Store the SHSTK 'restore token' pointed by
+        (current_SHSTK_pointer + 8).
+
+::
+
+                             After SAVEPREVSSP
+
+        |----------------------|             |----------------------|
+  ssp-> |                      |             |                      |
+        |----------------------|             |----------------------|
+        | rstor_token=(y|Bz|Lg)|    addr=y-8 | rstor_token(y|Lg)    |
+        |----------------------|             |----------------------|
+
+WRUSS %r0, (%r1)
+    Write the value in %r0 to the SHSTK address pointed by (%r1).
+    This is a kernel-mode only instruction.
+
+ENDBR
+    The compiler inserts an ENDBR at all valid branch targets.  Any
+    CALL/JMP to a target without an ENDBR triggers a control
+    protection fault.
+
+[3] Application Enabling
+========================
+
+An application's CET capability is marked in its ELF header and can
+be verified from the following command output, in the
+NT_GNU_PROPERTY_TYPE_0 field:
+
+    readelf -n <application>
+
+If an application supports CET and is statically linked, it will run
+with CET protection.  If the application needs any shared libraries,
+the loader checks all dependencies and enables CET only when all
+requirements are met.
+
+[4] Legacy Libraries
+====================
+
+GLIBC provides a few tunables for backward compatibility.
+
+GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT
+    Turn off SHSTK/IBT for the current shell.
+
+GLIBC_TUNABLES=glibc.tune.x86_shstk=<on, permissive>
+    This controls how dlopen() handles SHSTK legacy libraries:
+        on: continue with SHSTK enabled;
+        permissive: continue with SHSTK off.
+
+[5] CET system calls
+====================
+
+The following arch_prctl() system calls are added for CET:
+
+arch_prctl(ARCH_X86_CET_STATUS, unsigned long *addr)
+    Return CET feature status.
+
+    The parameter 'addr' is a pointer to a user buffer.
+    On returning to the caller, the kernel fills the following
+    information:
+
+    *addr = SHSTK/IBT status
+    *(addr + 1) = SHSTK base address
+    *(addr + 2) = SHSTK size
+
+arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
+    Disable SHSTK and/or IBT specified in 'features'.  Return -EPERM
+    if CET is locked.
+
+arch_prctl(ARCH_X86_CET_LOCK)
+    Lock in CET feature.
+
+arch_prctl(ARCH_X86_CET_ALLOC_SHSTK, unsigned long *addr)
+    Allocate a new SHSTK and put a restore token at top.
+
+    The parameter 'addr' is a pointer to a user buffer and indicates
+    the desired SHSTK size to allocate.  On returning to the caller,
+    the kernel fills *addr with the base address of the new SHSTK.
+
+arch_prctl(ARCH_X86_CET_MARK_LEGACY_CODE, unsigned long *addr)
+    Mark an address range as IBT legacy code.
+
+    The parameter 'addr' is a pointer to a user buffer that has the
+    following information:
+
+    *addr = starting linear address of the legacy code
+    *(addr + 1) = size of the legacy code
+    *(addr + 2): set (1); clear (0)
+
+Note:
+  There is no CET enabling arch_prctl function.  By design, CET is
+  enabled automatically if the binary and the system can support it.
+
+  The parameters passed are always unsigned 64-bit.  When an ia32
+  application passing pointers, it should only use the lower 32 bits.
+
+[6] The implementation of the SHSTK
+===================================
+
+SHSTK size
+----------
+
+A task's SHSTK is allocated from memory to a fixed size of
+RLIMIT_STACK.  A compat-mode thread's SHSTK size is 1/4 of
+RLIMIT_STACK.  The smaller 32-bit thread SHSTK allows more threads to
+share a 32-bit address space.
+
+Signal
+------
+
+The main program and its signal handlers use the same SHSTK.  Because
+the SHSTK stores only return addresses, a large SHSTK will cover the
+condition that both the program stack and the sigaltstack run out.
+
+The kernel creates a restore token at the SHSTK restoring address and
+verifies that token when restoring from the signal handler.
+
+Fork
+----
+
+The SHSTK's vma has VM_SHSTK flag set; its PTEs are required to be
+read-only and dirty.  When a SHSTK PTE is not present, RO, and dirty,
+a SHSTK access triggers a page fault with an additional SHSTK bit set
+in the page fault error code.
+
+When a task forks a child, its SHSTK PTEs are copied and both the
+parent's and the child's SHSTK PTEs are cleared of the dirty bit.
+Upon the next SHSTK access, the resulting SHSTK page fault is handled
+by page copy/re-use.
+
+When a pthread child is created, the kernel allocates a new SHSTK for
+the new thread.
+
+Setjmp/Longjmp
+--------------
+
+Longjmp unwinds SHSTK until it matches the program stack.
+
+Ucontext
+--------
+
+In GLIBC, getcontext/setcontext is implemented in similar way as
+setjmp/longjmp.
+
+When makecontext creates a new ucontext, a new SHSTK is allocated for
+that context with ARCH_X86_CET_ALLOC_SHSTK the syscall.  The kernel
+creates a restore token at the top of the new SHSTK and the user-mode
+code switches to the new SHSTK with the RSTORSSP instruction.
+
+[7] The management of read-only & dirty PTEs for SHSTK
+======================================================
+
+A RO and dirty PTE exists in the following cases:
+
+(a) A page is modified and then shared with a fork()'ed child;
+(b) A R/O page that has been COW'ed;
+(c) A SHSTK page.
+
+The processor only checks the dirty bit for (c).  To prevent the use
+of non-SHSTK memory as SHSTK, we use a spare bit of the 64-bit PTE as
+DIRTY_SW for (a) and (b) above.  This results to the following PTE
+settings:
+
+Modified PTE:             (R/W + DIRTY_HW)
+Modified and shared PTE:  (R/O + DIRTY_SW)
+R/O PTE, COW'ed:          (R/O + DIRTY_SW)
+SHSTK PTE:                (R/O + DIRTY_HW)
+SHSTK PTE, COW'ed:        (R/O + DIRTY_HW)
+SHSTK PTE, shared:        (R/O + DIRTY_SW)
+
+Note that DIRTY_SW is only used in R/O PTEs but not R/W PTEs.
+
+[8] The implementation of IBT
+=============================
+
+The kernel provides IBT support in mmap() of the legacy code bit map.
+However, the management of the bitmap is done in the GLIBC or the
+application.
-- 
2.17.1

^ permalink raw reply related

* [PATCH v8 00/27] Control-flow Enforcement: Shadow Stack
From: Yu-cheng Yu @ 2019-08-13 20:51 UTC (permalink / raw)
  To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
	Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
	Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
  Cc: Yu-cheng Yu

Intel has published Control-flow Enforcement (CET) in the Architecture
Instruction Set Extensions Programming Reference:

  https://software.intel.com/en-us/download/intel-architecture-instruction-set-
  extensions-programming-reference

The previous version (v7) of CET Shadow Stack patches is here:

  https://lkml.org/lkml/2019/6/6/1003

Summary of changes from v7:

  Rewrite ELF GNU property parsing (Patch #22).  Look at PT_GNU_PROPERTY now.
  Rebase to v5.3-rc4.
  Small fixes in response to comments.

Yu-cheng Yu (27):
  Documentation/x86: Add CET description
  x86/cpufeatures: Add CET CPU feature flags for Control-flow
    Enforcement Technology (CET)
  x86/fpu/xstate: Change names to separate XSAVES system and user states
  x86/fpu/xstate: Introduce XSAVES system states
  x86/fpu/xstate: Introduce CET MSR system states
  x86/cet: Add control protection exception handler
  x86/cet/shstk: Add Kconfig option for user-mode shadow stack
  mm: Introduce VM_SHSTK for shadow stack memory
  mm/mmap: Prevent Shadow Stack VMA merges
  x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
  x86/mm: Introduce _PAGE_DIRTY_SW
  drm/i915/gvt: Update _PAGE_DIRTY to _PAGE_DIRTY_BITS
  x86/mm: Modify ptep_set_wrprotect and pmdp_set_wrprotect for
    _PAGE_DIRTY_SW
  x86/mm: Shadow stack page fault error checking
  mm: Handle shadow stack page fault
  mm: Handle THP/HugeTLB shadow stack page fault
  mm: Update can_follow_write_pte/pmd for shadow stack
  mm: Introduce do_mmap_locked()
  x86/cet/shstk: User-mode shadow stack support
  x86/cet/shstk: Introduce WRUSS instruction
  x86/cet/shstk: Handle signals for shadow stack
  binfmt_elf: Extract .note.gnu.property from an ELF file
  x86/cet/shstk: ELF header parsing of Shadow Stack
  x86/cet/shstk: Handle thread shadow stack
  mm/mmap: Add Shadow stack pages to memory accounting
  x86/cet/shstk: Add arch_prctl functions for Shadow Stack
  x86/cet/shstk: Add Shadow Stack instructions to opcode map

 .../admin-guide/kernel-parameters.txt         |   6 +
 Documentation/x86/index.rst                   |   1 +
 Documentation/x86/intel_cet.rst               | 269 ++++++++++++++
 arch/x86/Kconfig                              |  27 ++
 arch/x86/Makefile                             |   7 +
 arch/x86/entry/entry_64.S                     |   2 +-
 arch/x86/ia32/ia32_signal.c                   |   8 +
 arch/x86/include/asm/cet.h                    |  48 +++
 arch/x86/include/asm/cpufeatures.h            |   2 +
 arch/x86/include/asm/disabled-features.h      |   8 +-
 arch/x86/include/asm/elf.h                    |  13 +
 arch/x86/include/asm/fpu/internal.h           |  27 +-
 arch/x86/include/asm/fpu/signal.h             |   2 +
 arch/x86/include/asm/fpu/types.h              |  22 ++
 arch/x86/include/asm/fpu/xstate.h             |  26 +-
 arch/x86/include/asm/mmu_context.h            |   3 +
 arch/x86/include/asm/msr-index.h              |  18 +
 arch/x86/include/asm/pgtable.h                | 191 ++++++++--
 arch/x86/include/asm/pgtable_types.h          |  38 +-
 arch/x86/include/asm/processor.h              |   5 +
 arch/x86/include/asm/special_insns.h          |  32 ++
 arch/x86/include/asm/traps.h                  |   5 +
 arch/x86/include/uapi/asm/prctl.h             |   5 +
 arch/x86/include/uapi/asm/processor-flags.h   |   2 +
 arch/x86/include/uapi/asm/sigcontext.h        |  15 +
 arch/x86/kernel/Makefile                      |   2 +
 arch/x86/kernel/cet.c                         | 327 ++++++++++++++++++
 arch/x86/kernel/cet_prctl.c                   |  85 +++++
 arch/x86/kernel/cpu/common.c                  |  25 ++
 arch/x86/kernel/cpu/cpuid-deps.c              |   2 +
 arch/x86/kernel/fpu/core.c                    |  26 +-
 arch/x86/kernel/fpu/init.c                    |  10 -
 arch/x86/kernel/fpu/signal.c                  |  81 ++++-
 arch/x86/kernel/fpu/xstate.c                  | 169 +++++----
 arch/x86/kernel/idt.c                         |   4 +
 arch/x86/kernel/process.c                     |   8 +-
 arch/x86/kernel/process_64.c                  |  41 +++
 arch/x86/kernel/relocate_kernel_64.S          |   2 +-
 arch/x86/kernel/signal.c                      |  10 +-
 arch/x86/kernel/signal_compat.c               |   2 +-
 arch/x86/kernel/traps.c                       |  57 +++
 arch/x86/kvm/vmx/vmx.c                        |   2 +-
 arch/x86/lib/x86-opcode-map.txt               |  26 +-
 arch/x86/mm/fault.c                           |  18 +
 arch/x86/mm/pgtable.c                         |  41 +++
 drivers/gpu/drm/i915/gvt/gtt.c                |   2 +-
 fs/Kconfig.binfmt                             |   3 +
 fs/Makefile                                   |   1 +
 fs/binfmt_elf.c                               |  20 ++
 fs/gnu_property.c                             | 178 ++++++++++
 fs/proc/task_mmu.c                            |   3 +
 include/asm-generic/pgtable.h                 |  33 ++
 include/linux/elf.h                           |  11 +
 include/linux/mm.h                            |  26 ++
 include/uapi/asm-generic/siginfo.h            |   3 +-
 include/uapi/linux/elf.h                      |  14 +
 mm/gup.c                                      |   8 +-
 mm/huge_memory.c                              |  12 +-
 mm/memory.c                                   |   7 +-
 mm/mmap.c                                     |  11 +
 .../arch/x86/include/asm/disabled-features.h  |   8 +-
 tools/objtool/arch/x86/lib/x86-opcode-map.txt |  26 +-
 62 files changed, 1920 insertions(+), 166 deletions(-)
 create mode 100644 Documentation/x86/intel_cet.rst
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/cet.c
 create mode 100644 arch/x86/kernel/cet_prctl.c
 create mode 100644 fs/gnu_property.c

-- 
2.17.1

^ permalink raw reply

* Re: [PATCH] syscalls: Update the syscall #defines to match uapi
From: Arnd Bergmann @ 2019-08-13 19:41 UTC (permalink / raw)
  To: Alistair Francis
  Cc: Alistair Francis, Linux Kernel Mailing List, Linux API,
	Deepa Dinamani
In-Reply-To: <CAKmqyKNH7G=_gs2Hfc3OZMFaHzUwU8fSomfu_r92hJrnJHJT3A@mail.gmail.com>

On Tue, Aug 13, 2019 at 9:01 PM Alistair Francis <alistair23@gmail.com> wrote:
> On Mon, Aug 12, 2019 at 2:49 AM Arnd Bergmann <arnd@arndb.de> wrote:

> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > index 2bcef4c70183..e4bf5e480d60 100644
> > > --- a/include/linux/syscalls.h
> > > +++ b/include/linux/syscalls.h
> > > @@ -512,7 +512,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu
> > >  asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
> > >                                struct stat __user *statbuf, int flag);
> > >  asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
> > > -#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
> > > +#if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64)
> > >  asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
> > >  asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
> > >                                struct stat64 __user *statbuf, int flag);
> >
> > I think this is wrong: when __ARCH_WANT_NEW_STAT is set, we are
> > on a 64-bit architecture and only want the sys_newfstat{,at} system
> > calls, not sys_fstat{,at}64 that gets used on 32-bit machines.
>
> Ah, that would make sense then. I don't think you will see the error then.

So we don't need this patch to build riscv32 kernels, right? It's possible
that it was the result of an incorrect forward port of some other patch,
as older riscv32 kernels did provide stat64(), but newer ones only have
statx().

       Arnd

^ permalink raw reply

* Re: [PATCH v5 2/6] mm/page_idle: Add support for handling swapped PG_Idle pages
From: Konstantin Khlebnikov @ 2019-08-13 19:24 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Michal Hocko,
	Константин Хлебников,
	Linux Kernel Mailing List, Minchan Kim, Alexey Dobriyan,
	Andrew Morton, Borislav Petkov, Brendan Gregg, Catalin Marinas,
	Christian Hansen, dancol, fmayer, H. Peter Anvin, Ingo Molnar,
	Jonathan Corbet, Kees Cook, kernel-team, Linux API, linux-doc,
	linux-fsdevel
In-Reply-To: <20190813153659.GD14622@google.com>

On Tue, Aug 13, 2019 at 6:37 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Tue, Aug 13, 2019 at 05:04:50PM +0200, Michal Hocko wrote:
> > On Wed 07-08-19 13:15:55, Joel Fernandes (Google) wrote:
> > > Idle page tracking currently does not work well in the following
> > > scenario:
> > >  1. mark page-A idle which was present at that time.
> > >  2. run workload
> > >  3. page-A is not touched by workload
> > >  4. *sudden* memory pressure happen so finally page A is finally swapped out
> > >  5. now see the page A - it appears as if it was accessed (pte unmapped
> > >     so idle bit not set in output) - but it's incorrect.
> > >
> > > To fix this, we store the idle information into a new idle bit of the
> > > swap PTE during swapping of anonymous pages.
> > >
> > > Also in the future, madvise extensions will allow a system process
> > > manager (like Android's ActivityManager) to swap pages out of a process
> > > that it knows will be cold. To an external process like a heap profiler
> > > that is doing idle tracking on another process, this procedure will
> > > interfere with the idle page tracking similar to the above steps.
> >
> > This could be solved by checking the !present/swapped out pages
> > right? Whoever decided to put the page out to the swap just made it
> > idle effectively.  So the monitor can make some educated guess for
> > tracking. If that is fundamentally not possible then please describe
> > why.
>
> But the monitoring process (profiler) does not have control over the 'whoever
> made it effectively idle' process.
>
> As you said it will be a guess, it will not be accurate.

Yep. Without saving idle bit in swap entry (and presuming that all swap is idle)
profiler could miss access. This patch adds accurate tracking almost for free.
After that profiler could work with any pace without races.

>
> I am curious what is your concern with using a bit in the swap PTE?
>
> (Adding Konstantin as well since we may be interested in this, since we also
> suggested this idea).
>
> thanks,
>
>  - Joel
>
>

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Joel Fernandes @ 2019-08-13 19:18 UTC (permalink / raw)
  To: Daniel Gruss
  Cc: Jann Horn, Michal Hocko, kernel list, Alexey Dobriyan,
	Andrew Morton, Borislav Petkov, Brendan Gregg, Catalin Marinas,
	Christian Hansen, Daniel Colascione, fmayer, H. Peter Anvin,
	Ingo Molnar, Jonathan Corbet, Kees Cook, kernel-team, Linux API,
	linux-doc, linux-fsdevel, Linux-MM, Mike Rapoport
In-Reply-To: <d6ae7f06-f0ef-ec00-a020-98e7cfada281@iaik.tugraz.at>

On Tue, Aug 13, 2019 at 05:34:16PM +0200, Daniel Gruss wrote:
> On 8/13/19 5:29 PM, Jann Horn wrote:
> > On Tue, Aug 13, 2019 at 12:09 PM Michal Hocko <mhocko@kernel.org> wrote:
> >> On Mon 12-08-19 20:14:38, Jann Horn wrote:
> >>> On Wed, Aug 7, 2019 at 7:16 PM Joel Fernandes (Google)
> >>> <joel@joelfernandes.org> wrote:
> >>>> The page_idle tracking feature currently requires looking up the pagemap
> >>>> for a process followed by interacting with /sys/kernel/mm/page_idle.
> >>>> Looking up PFN from pagemap in Android devices is not supported by
> >>>> unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
> >>>>
> >>>> This patch adds support to directly interact with page_idle tracking at
> >>>> the PID level by introducing a /proc/<pid>/page_idle file.  It follows
> >>>> the exact same semantics as the global /sys/kernel/mm/page_idle, but now
> >>>> looking up PFN through pagemap is not needed since the interface uses
> >>>> virtual frame numbers, and at the same time also does not require
> >>>> SYS_ADMIN.
> >>>>
> >>>> In Android, we are using this for the heap profiler (heapprofd) which
> >>>> profiles and pin points code paths which allocates and leaves memory
> >>>> idle for long periods of time. This method solves the security issue
> >>>> with userspace learning the PFN, and while at it is also shown to yield
> >>>> better results than the pagemap lookup, the theory being that the window
> >>>> where the address space can change is reduced by eliminating the
> >>>> intermediate pagemap look up stage. In virtual address indexing, the
> >>>> process's mmap_sem is held for the duration of the access.
> >>>
> >>> What happens when you use this interface on shared pages, like memory
> >>> inherited from the zygote, library file mappings and so on? If two
> >>> profilers ran concurrently for two different processes that both map
> >>> the same libraries, would they end up messing up each other's data?
> >>
> >> Yup PageIdle state is shared. That is the page_idle semantic even now
> >> IIRC.
> >>
> >>> Can this be used to observe which library pages other processes are
> >>> accessing, even if you don't have access to those processes, as long
> >>> as you can map the same libraries? I realize that there are already a
> >>> bunch of ways to do that with side channels and such; but if you're
> >>> adding an interface that allows this by design, it seems to me like
> >>> something that should be gated behind some sort of privilege check.
> >>
> >> Hmm, you need to be priviledged to get the pfn now and without that you
> >> cannot get to any page so the new interface is weakening the rules.
> >> Maybe we should limit setting the idle state to processes with the write
> >> status. Or do you think that even observing idle status is useful for
> >> practical side channel attacks? If yes, is that a problem of the
> >> profiler which does potentially dangerous things?
> > 
> > I suppose read-only access isn't a real problem as long as the
> > profiler isn't writing the idle state in a very tight loop... but I
> > don't see a usecase where you'd actually want that? As far as I can
> > tell, if you can't write the idle state, being able to read it is
> > pretty much useless.
> > 
> > If the profiler only wants to profile process-private memory, then
> > that should be implementable in a safe way in principle, I think, but
> > since Joel said that they want to profile CoW memory as well, I think
> > that's inherently somewhat dangerous.
> 
> I agree that allowing profiling of shared pages would leak information.

Will think more about it. If we limit it to private pages, then it could
become useless. Consider a scenario where:
A process allocates a some memory, then forks a bunch of worker processes
that read that memory and perform some work with them. Per-PID page idle
tracking is now run on the parent processes. Now it should appear that the
pages are actively accessed (not-idle). If we don't track shared pages, then
we cannot detect if those pages are really due to memory leaking, or if they
are there for a purpose and are actively used.

> To me the use case is not entirely clear. This is not a feature that
> would normally be run in everyday computer usage, right?

Generally, this to be used as a debugging feature that helps developers
detect memory leaks in their programs.

thanks,

 - Joel

^ permalink raw reply

* Re: [PATCH] syscalls: Update the syscall #defines to match uapi
From: Alistair Francis @ 2019-08-13 18:57 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Alistair Francis, Linux Kernel Mailing List, Linux API,
	Deepa Dinamani
In-Reply-To: <CAK8P3a2wYMsBRm1X-TFo1d7-B7Xug9gwqF77HitoE7wmOqD7rw@mail.gmail.com>

On Mon, Aug 12, 2019 at 2:49 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sat, Aug 10, 2019 at 3:11 AM Alistair Francis
> <alistair.francis@wdc.com> wrote:
> >
> > Update the #defines around sys_fstat64() and sys_fstatat64() to match
> > the #defines around the __NR3264_fstatat and __NR3264_fstat definitions
> > in include/uapi/asm-generic/unistd.h. This avoids compiler failures if
> > one is defined.
>
> What is the compiler failure you get?

I don't have it infornt of me but it was along the lines of
sys_fstat64/sys_fstatat64 not being defined when __ARCH_WANT_NEW_STAT
is defined but __ARCH_WANT_STAT64 isn't.

>
> > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> > ---
> >  include/linux/syscalls.h | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index 2bcef4c70183..e4bf5e480d60 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -512,7 +512,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu
> >  asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
> >                                struct stat __user *statbuf, int flag);
> >  asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
> > -#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
> > +#if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64)
> >  asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
> >  asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
> >                                struct stat64 __user *statbuf, int flag);
>
> I think this is wrong: when __ARCH_WANT_NEW_STAT is set, we are
> on a 64-bit architecture and only want the sys_newfstat{,at} system
> calls, not sys_fstat{,at}64 that gets used on 32-bit machines.

Ah, that would make sense then. I don't think you will see the error then.

Alistair

>
> The #if check in the syscalls.h file also matches the definition of
> the function.
>
>        Arnd

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Jann Horn @ 2019-08-13 15:40 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: kernel list, Alexey Dobriyan, Andrew Morton, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Jonathan Corbet, Kees Cook, kernel-team, Linux API, linux-doc,
	linux-fsdevel, Linux-MM, Michal Hocko, Mike Rapoport, Minchan Kim,
	namhyu
In-Reply-To: <20190813153020.GC14622@google.com>

On Tue, Aug 13, 2019 at 5:30 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> On Mon, Aug 12, 2019 at 08:14:38PM +0200, Jann Horn wrote:
> [snip]
> > > +/* Helper to get the start and end frame given a pos and count */
> > > +static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
> > > +                               unsigned long *start, unsigned long *end)
> > > +{
> > > +       unsigned long max_frame;
> > > +
> > > +       /* If an mm is not given, assume we want physical frames */
> > > +       max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
> > > +
> > > +       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
> > > +               return -EINVAL;
> > > +
> > > +       *start = pos * BITS_PER_BYTE;
> > > +       if (*start >= max_frame)
> > > +               return -ENXIO;
> > > +
> > > +       *end = *start + count * BITS_PER_BYTE;
> > > +       if (*end > max_frame)
> > > +               *end = max_frame;
> > > +       return 0;
> > > +}
> >
> > You could add some overflow checks for the multiplications. I haven't
> > seen any place where it actually matters, but it seems unclean; and in
> > particular, on a 32-bit architecture where the maximum user address is
> > very high (like with a 4G:4G split), it looks like this function might
> > theoretically return with `*start > *end`, which could be confusing to
> > callers.
>
> I could store the multiplication result in unsigned long long (since we are
> bounds checking with max_frame, start > end would not occur). Something like
> the following (with extraneous casts). But I'll think some more about the
> point you are raising.

check_mul_overflow() exists and could make that a bit cleaner.


> > This means that BITMAP_CHUNK_SIZE is UAPI on big-endian systems,
> > right? My opinion is that it would be slightly nicer to design the
> > UAPI such that incrementing virtual addresses are mapped to
> > incrementing offsets in the buffer (iow, either use bytewise access or
> > use little-endian), but I'm not going to ask you to redesign the UAPI
> > this late.
>
> That would also be slow and consume more memory in userspace buffers.
> Currently, a 64-bit (8 byte) chunk accounts for 64 pages worth or 256KB.

I still wanted to use one bit per page; I just wanted to rearrange the
bits. So the first byte would always contain 8 bits corresponding to
the first 8 pages, instead of corresponding to pages 56-63 on some
systems depending on endianness. Anyway, this is a moot point, since
as you said...

> Also I wanted to keep the interface consistent with the global
> /sys/kernel/mm/page_idle interface.

Sorry, I missed that this is already UAPI in the global interface. I
agree, using a different API for the per-process interface would be a
bad idea.

^ permalink raw reply

* Re: [PATCH v5 2/6] mm/page_idle: Add support for handling swapped PG_Idle pages
From: Joel Fernandes @ 2019-08-13 15:36 UTC (permalink / raw)
  To: Michal Hocko, khlebnikov
  Cc: linux-kernel, Minchan Kim, Alexey Dobriyan, Andrew Morton,
	Borislav Petkov, Brendan Gregg, Catalin Marinas, Christian Hansen,
	dancol, fmayer, H. Peter Anvin, Ingo Molnar, Jonathan Corbet,
	Kees Cook, kernel-team, linux-api, linux-doc, linux-fsdevel,
	linux-mm, Mike Rapoport, namhyung, paulmck, Robin Murphy,
	Roman Gushchin, Stephen Rothwell <sfr@
In-Reply-To: <20190813150450.GN17933@dhcp22.suse.cz>

On Tue, Aug 13, 2019 at 05:04:50PM +0200, Michal Hocko wrote:
> On Wed 07-08-19 13:15:55, Joel Fernandes (Google) wrote:
> > Idle page tracking currently does not work well in the following
> > scenario:
> >  1. mark page-A idle which was present at that time.
> >  2. run workload
> >  3. page-A is not touched by workload
> >  4. *sudden* memory pressure happen so finally page A is finally swapped out
> >  5. now see the page A - it appears as if it was accessed (pte unmapped
> >     so idle bit not set in output) - but it's incorrect.
> > 
> > To fix this, we store the idle information into a new idle bit of the
> > swap PTE during swapping of anonymous pages.
> >
> > Also in the future, madvise extensions will allow a system process
> > manager (like Android's ActivityManager) to swap pages out of a process
> > that it knows will be cold. To an external process like a heap profiler
> > that is doing idle tracking on another process, this procedure will
> > interfere with the idle page tracking similar to the above steps.
> 
> This could be solved by checking the !present/swapped out pages
> right? Whoever decided to put the page out to the swap just made it
> idle effectively.  So the monitor can make some educated guess for
> tracking. If that is fundamentally not possible then please describe
> why.

But the monitoring process (profiler) does not have control over the 'whoever
made it effectively idle' process.

As you said it will be a guess, it will not be accurate.

I am curious what is your concern with using a bit in the swap PTE?

(Adding Konstantin as well since we may be interested in this, since we also
suggested this idea).

thanks,

 - Joel

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Daniel Gruss @ 2019-08-13 15:34 UTC (permalink / raw)
  To: Jann Horn, Michal Hocko, Joel Fernandes (Google)
  Cc: kernel list, Alexey Dobriyan, Andrew Morton, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Joel Fernandes, Jonathan Corbet, Kees Cook, kernel-team,
	Linux API, linux-doc, linux-fsdevel, Linux-MM, Mike Rapoport,
	Minchan Kim <minchan>
In-Reply-To: <CAG48ez2cuqe_VYhhaqw8Hcyswv47cmz2XmkqNdvkXEhokMVaXg@mail.gmail.com>

On 8/13/19 5:29 PM, Jann Horn wrote:
> On Tue, Aug 13, 2019 at 12:09 PM Michal Hocko <mhocko@kernel.org> wrote:
>> On Mon 12-08-19 20:14:38, Jann Horn wrote:
>>> On Wed, Aug 7, 2019 at 7:16 PM Joel Fernandes (Google)
>>> <joel@joelfernandes.org> wrote:
>>>> The page_idle tracking feature currently requires looking up the pagemap
>>>> for a process followed by interacting with /sys/kernel/mm/page_idle.
>>>> Looking up PFN from pagemap in Android devices is not supported by
>>>> unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
>>>>
>>>> This patch adds support to directly interact with page_idle tracking at
>>>> the PID level by introducing a /proc/<pid>/page_idle file.  It follows
>>>> the exact same semantics as the global /sys/kernel/mm/page_idle, but now
>>>> looking up PFN through pagemap is not needed since the interface uses
>>>> virtual frame numbers, and at the same time also does not require
>>>> SYS_ADMIN.
>>>>
>>>> In Android, we are using this for the heap profiler (heapprofd) which
>>>> profiles and pin points code paths which allocates and leaves memory
>>>> idle for long periods of time. This method solves the security issue
>>>> with userspace learning the PFN, and while at it is also shown to yield
>>>> better results than the pagemap lookup, the theory being that the window
>>>> where the address space can change is reduced by eliminating the
>>>> intermediate pagemap look up stage. In virtual address indexing, the
>>>> process's mmap_sem is held for the duration of the access.
>>>
>>> What happens when you use this interface on shared pages, like memory
>>> inherited from the zygote, library file mappings and so on? If two
>>> profilers ran concurrently for two different processes that both map
>>> the same libraries, would they end up messing up each other's data?
>>
>> Yup PageIdle state is shared. That is the page_idle semantic even now
>> IIRC.
>>
>>> Can this be used to observe which library pages other processes are
>>> accessing, even if you don't have access to those processes, as long
>>> as you can map the same libraries? I realize that there are already a
>>> bunch of ways to do that with side channels and such; but if you're
>>> adding an interface that allows this by design, it seems to me like
>>> something that should be gated behind some sort of privilege check.
>>
>> Hmm, you need to be priviledged to get the pfn now and without that you
>> cannot get to any page so the new interface is weakening the rules.
>> Maybe we should limit setting the idle state to processes with the write
>> status. Or do you think that even observing idle status is useful for
>> practical side channel attacks? If yes, is that a problem of the
>> profiler which does potentially dangerous things?
> 
> I suppose read-only access isn't a real problem as long as the
> profiler isn't writing the idle state in a very tight loop... but I
> don't see a usecase where you'd actually want that? As far as I can
> tell, if you can't write the idle state, being able to read it is
> pretty much useless.
> 
> If the profiler only wants to profile process-private memory, then
> that should be implementable in a safe way in principle, I think, but
> since Joel said that they want to profile CoW memory as well, I think
> that's inherently somewhat dangerous.

I agree that allowing profiling of shared pages would leak information.
To me the use case is not entirely clear. This is not a feature that
would normally be run in everyday computer usage, right?

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Joel Fernandes @ 2019-08-13 15:30 UTC (permalink / raw)
  To: Jann Horn
  Cc: kernel list, Alexey Dobriyan, Andrew Morton, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Jonathan Corbet, Kees Cook, kernel-team, Linux API, linux-doc,
	linux-fsdevel, Linux-MM, Michal Hocko, Mike Rapoport, Minchan Kim,
	namhyu
In-Reply-To: <CAG48ez0ysprvRiENhBkLeV9YPTN_MB18rbu2HDa2jsWo5FYR8g@mail.gmail.com>

On Mon, Aug 12, 2019 at 08:14:38PM +0200, Jann Horn wrote:
[snip] 
> > +/* Helper to get the start and end frame given a pos and count */
> > +static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
> > +                               unsigned long *start, unsigned long *end)
> > +{
> > +       unsigned long max_frame;
> > +
> > +       /* If an mm is not given, assume we want physical frames */
> > +       max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
> > +
> > +       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
> > +               return -EINVAL;
> > +
> > +       *start = pos * BITS_PER_BYTE;
> > +       if (*start >= max_frame)
> > +               return -ENXIO;
> > +
> > +       *end = *start + count * BITS_PER_BYTE;
> > +       if (*end > max_frame)
> > +               *end = max_frame;
> > +       return 0;
> > +}
> 
> You could add some overflow checks for the multiplications. I haven't
> seen any place where it actually matters, but it seems unclean; and in
> particular, on a 32-bit architecture where the maximum user address is
> very high (like with a 4G:4G split), it looks like this function might
> theoretically return with `*start > *end`, which could be confusing to
> callers.

I could store the multiplication result in unsigned long long (since we are
bounds checking with max_frame, start > end would not occur). Something like
the following (with extraneous casts). But I'll think some more about the
point you are raising.

diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1ea21bbb36cb..8fd7a5559986 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -135,6 +135,7 @@ static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
 				unsigned long *start, unsigned long *end)
 {
 	unsigned long max_frame;
+	unsigned long long tmp;
 
 	/* If an mm is not given, assume we want physical frames */
 	max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
@@ -142,13 +143,16 @@ static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
 	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
 		return -EINVAL;
 
-	*start = pos * BITS_PER_BYTE;
-	if (*start >= max_frame)
+	tmp = pos * BITS_PER_BYTE;
+	if (tmp >= (unsigned long long)max_frame)
 		return -ENXIO;
+	*start = (unsigned long)tmp;
 
-	*end = *start + count * BITS_PER_BYTE;
-	if (*end > max_frame)
+	tmp = *start + count * BITS_PER_BYTE;
+	if (tmp > (unsigned long long)max_frame)
 		*end = max_frame;
+	else
+		*end = (unsigned long)tmp;
 	return 0;
 }
 
> [...]
> >         for (; pfn < end_pfn; pfn++) {
> >                 bit = pfn % BITMAP_CHUNK_BITS;
> >                 if (!bit)
> >                         *out = 0ULL;
> > -               page = page_idle_get_page(pfn);
> > -               if (page) {
> > -                       if (page_is_idle(page)) {
> > -                               /*
> > -                                * The page might have been referenced via a
> > -                                * pte, in which case it is not idle. Clear
> > -                                * refs and recheck.
> > -                                */
> > -                               page_idle_clear_pte_refs(page);
> > -                               if (page_is_idle(page))
> > -                                       *out |= 1ULL << bit;
> > -                       }
> > +               page = page_idle_get_page_pfn(pfn);
> > +               if (page && page_idle_pte_check(page)) {
> > +                       *out |= 1ULL << bit;
> >                         put_page(page);
> >                 }
> 
> The `page && !page_idle_pte_check(page)` case looks like it's missing
> a put_page(); you probably intended to write something like this?
> 
>     page = page_idle_get_page_pfn(pfn);
>     if (page) {
>         if (page_idle_pte_check(page))
>             *out |= 1ULL << bit;
>         put_page(page);
>     }

Ah, you're right. Will fix, good eyes and thank you!

> [...]
> > +/*  page_idle tracking for /proc/<pid>/page_idle */
> > +
> > +struct page_node {
> > +       struct page *page;
> > +       unsigned long addr;
> > +       struct list_head list;
> > +};
> > +
> > +struct page_idle_proc_priv {
> > +       unsigned long start_addr;
> > +       char *buffer;
> > +       int write;
> > +
> > +       /* Pre-allocate and provide nodes to pte_page_idle_proc_add() */
> > +       struct page_node *page_nodes;
> > +       int cur_page_node;
> > +       struct list_head *idle_page_list;
> > +};
> 
> A linked list is a weird data structure to use if the list elements
> are just consecutive array elements.

Not all of the pages will be considered in the later parts of the code, some
pages are skipped.

However, in v4 I added an array to allocate all the page_node structures,
since Andrew did not want GFP_ATOMIC allocations of individual list nodes.

So I think I could get rid of the linked list and leave the unused
page_node(s) as NULL, but I don't know I have to check if that is possible. I
could be missing a corner case, regardless I'll make a note of this and try!

> > +/*
> > + * Add page to list to be set as idle later.
> > + */
> > +static void pte_page_idle_proc_add(struct page *page,
> > +                              unsigned long addr, struct mm_walk *walk)
> > +{
> > +       struct page *page_get = NULL;
> > +       struct page_node *pn;
> > +       int bit;
> > +       unsigned long frames;
> > +       struct page_idle_proc_priv *priv = walk->private;
> > +       u64 *chunk = (u64 *)priv->buffer;
> > +
> > +       if (priv->write) {
> > +               VM_BUG_ON(!page);
> > +
> > +               /* Find whether this page was asked to be marked */
> > +               frames = (addr - priv->start_addr) >> PAGE_SHIFT;
> > +               bit = frames % BITMAP_CHUNK_BITS;
> > +               chunk = &chunk[frames / BITMAP_CHUNK_BITS];
> > +               if (((*chunk >> bit) & 1) == 0)
> > +                       return;
> 
> This means that BITMAP_CHUNK_SIZE is UAPI on big-endian systems,
> right? My opinion is that it would be slightly nicer to design the
> UAPI such that incrementing virtual addresses are mapped to
> incrementing offsets in the buffer (iow, either use bytewise access or
> use little-endian), but I'm not going to ask you to redesign the UAPI
> this late.

That would also be slow and consume more memory in userspace buffers.
Currently, a 64-bit (8 byte) chunk accounts for 64 pages worth or 256KB.

Also I wanted to keep the interface consistent with the global
/sys/kernel/mm/page_idle interface.

> [...]
> > +ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
> > +                              size_t count, loff_t *pos, int write)
> > +{
> [...]
> > +       down_read(&mm->mmap_sem);
> [...]
> > +
> > +       if (!write && !walk_error)
> > +               ret = copy_to_user(ubuff, buffer, count);
> > +
> > +       up_read(&mm->mmap_sem);
> 
> I'd move the up_read() above the copy_to_user(); copy_to_user() can
> block, and there's no reason to hold the mmap_sem across
> copy_to_user().

Will do.

> Sorry about only chiming in at v5 with all this.

No problem, I'm glad you did!

thanks,

 - Joel

^ permalink raw reply related

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Jann Horn @ 2019-08-13 15:29 UTC (permalink / raw)
  To: Michal Hocko, Daniel Gruss, Joel Fernandes (Google)
  Cc: kernel list, Alexey Dobriyan, Andrew Morton, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Joel Fernandes, Jonathan Corbet, Kees Cook, kernel-team,
	Linux API, linux-doc, linux-fsdevel, Linux-MM, Mike Rapoport,
	Minchan Kim
In-Reply-To: <20190813100856.GF17933@dhcp22.suse.cz>

On Tue, Aug 13, 2019 at 12:09 PM Michal Hocko <mhocko@kernel.org> wrote:
> On Mon 12-08-19 20:14:38, Jann Horn wrote:
> > On Wed, Aug 7, 2019 at 7:16 PM Joel Fernandes (Google)
> > <joel@joelfernandes.org> wrote:
> > > The page_idle tracking feature currently requires looking up the pagemap
> > > for a process followed by interacting with /sys/kernel/mm/page_idle.
> > > Looking up PFN from pagemap in Android devices is not supported by
> > > unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
> > >
> > > This patch adds support to directly interact with page_idle tracking at
> > > the PID level by introducing a /proc/<pid>/page_idle file.  It follows
> > > the exact same semantics as the global /sys/kernel/mm/page_idle, but now
> > > looking up PFN through pagemap is not needed since the interface uses
> > > virtual frame numbers, and at the same time also does not require
> > > SYS_ADMIN.
> > >
> > > In Android, we are using this for the heap profiler (heapprofd) which
> > > profiles and pin points code paths which allocates and leaves memory
> > > idle for long periods of time. This method solves the security issue
> > > with userspace learning the PFN, and while at it is also shown to yield
> > > better results than the pagemap lookup, the theory being that the window
> > > where the address space can change is reduced by eliminating the
> > > intermediate pagemap look up stage. In virtual address indexing, the
> > > process's mmap_sem is held for the duration of the access.
> >
> > What happens when you use this interface on shared pages, like memory
> > inherited from the zygote, library file mappings and so on? If two
> > profilers ran concurrently for two different processes that both map
> > the same libraries, would they end up messing up each other's data?
>
> Yup PageIdle state is shared. That is the page_idle semantic even now
> IIRC.
>
> > Can this be used to observe which library pages other processes are
> > accessing, even if you don't have access to those processes, as long
> > as you can map the same libraries? I realize that there are already a
> > bunch of ways to do that with side channels and such; but if you're
> > adding an interface that allows this by design, it seems to me like
> > something that should be gated behind some sort of privilege check.
>
> Hmm, you need to be priviledged to get the pfn now and without that you
> cannot get to any page so the new interface is weakening the rules.
> Maybe we should limit setting the idle state to processes with the write
> status. Or do you think that even observing idle status is useful for
> practical side channel attacks? If yes, is that a problem of the
> profiler which does potentially dangerous things?

I suppose read-only access isn't a real problem as long as the
profiler isn't writing the idle state in a very tight loop... but I
don't see a usecase where you'd actually want that? As far as I can
tell, if you can't write the idle state, being able to read it is
pretty much useless.

If the profiler only wants to profile process-private memory, then
that should be implementable in a safe way in principle, I think, but
since Joel said that they want to profile CoW memory as well, I think
that's inherently somewhat dangerous.

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Jann Horn @ 2019-08-13 15:19 UTC (permalink / raw)
  To: Joel Fernandes, Daniel Gruss
  Cc: Michal Hocko, kernel list, Alexey Dobriyan, Andrew Morton,
	Borislav Petkov, Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Jonathan Corbet, Kees Cook, kernel-team, Linux API, linux-doc,
	linux-fsdevel, Linux-MM, Mike Rapoport, Minchan Kim
In-Reply-To: <20190813142527.GD258732@google.com>

On Tue, Aug 13, 2019 at 4:25 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> On Tue, Aug 13, 2019 at 12:08:56PM +0200, Michal Hocko wrote:
> > On Mon 12-08-19 20:14:38, Jann Horn wrote:
> > > On Wed, Aug 7, 2019 at 7:16 PM Joel Fernandes (Google)
> > > <joel@joelfernandes.org> wrote:
> > > > The page_idle tracking feature currently requires looking up the pagemap
> > > > for a process followed by interacting with /sys/kernel/mm/page_idle.
> > > > Looking up PFN from pagemap in Android devices is not supported by
> > > > unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
> > > >
> > > > This patch adds support to directly interact with page_idle tracking at
> > > > the PID level by introducing a /proc/<pid>/page_idle file.  It follows
> > > > the exact same semantics as the global /sys/kernel/mm/page_idle, but now
> > > > looking up PFN through pagemap is not needed since the interface uses
> > > > virtual frame numbers, and at the same time also does not require
> > > > SYS_ADMIN.
> > > >
> > > > In Android, we are using this for the heap profiler (heapprofd) which
> > > > profiles and pin points code paths which allocates and leaves memory
> > > > idle for long periods of time. This method solves the security issue
> > > > with userspace learning the PFN, and while at it is also shown to yield
> > > > better results than the pagemap lookup, the theory being that the window
> > > > where the address space can change is reduced by eliminating the
> > > > intermediate pagemap look up stage. In virtual address indexing, the
> > > > process's mmap_sem is held for the duration of the access.
> > >
> > > What happens when you use this interface on shared pages, like memory
> > > inherited from the zygote, library file mappings and so on? If two
> > > profilers ran concurrently for two different processes that both map
> > > the same libraries, would they end up messing up each other's data?
> >
> > Yup PageIdle state is shared. That is the page_idle semantic even now
> > IIRC.
>
> Yes, that's right. This patch doesn't change that semantic. Idle page
> tracking at the core is a global procedure which is based on pages that can
> be shared.
>
> One of the usecases of the heap profiler is to enable profiling of pages that
> are shared between zygote and any processes that are forked. In this case,
> I am told by our team working on the heap profiler, that the monitoring of
> shared pages will help.
>
> > > Can this be used to observe which library pages other processes are
> > > accessing, even if you don't have access to those processes, as long
> > > as you can map the same libraries? I realize that there are already a
> > > bunch of ways to do that with side channels and such; but if you're
> > > adding an interface that allows this by design, it seems to me like
> > > something that should be gated behind some sort of privilege check.
> >
> > Hmm, you need to be priviledged to get the pfn now and without that you
> > cannot get to any page so the new interface is weakening the rules.
> > Maybe we should limit setting the idle state to processes with the write
> > status. Or do you think that even observing idle status is useful for
> > practical side channel attacks? If yes, is that a problem of the
> > profiler which does potentially dangerous things?
>
> The heap profiler is currently unprivileged. Would it help the concern Jann
> raised, if the new interface was limited to only anonymous private/shared
> pages and not to file pages? Or, is this even a real concern?

+Daniel Gruss in case he wants to provide some more detail; he has
been involved in a lot of the public research around this topic.

It is a bit of a concern when code that wasn't hardened as rigorously
as cryptographic library code operates on secret values.
A paper was published this year that abused mincore() in combination
with tricks for flushing the page cache to obtain information about
when shared read-only memory was accessed:
<https://arxiv.org/pdf/1901.01161.pdf>. In response to that, the
semantics of mincore() were changed to prevent that information from
leaking (see commit 134fca9063ad4851de767d1768180e5dede9a881).

On the other hand, an attacker could also use things like cache timing
attacks instead of abusing operating system features; that is more
hardware-specific, but it has a higher spatial granularity (typically
64 bytes instead of 4096 bytes). Timing-granularity-wise, I'm not sure
whether the proposed interface would be more or less bad than existing
cache side-channels on common architectures. There are papers that
demonstrate things like being able to distinguish some classes of
keyboard keys from others on an Android phone:
<https://www.usenix.org/system/files/conference/usenixsecurity16/sec16_paper_lipp.pdf>

I don't think limiting it to anonymous pages is necessarily enough to
completely solve this; in a normal Linux environment, it might be good
enough, but on Android, I'm worried about the CoW private memory from
the zygote.

^ permalink raw reply

* Re: [PATCH v5 2/6] mm/page_idle: Add support for handling swapped PG_Idle pages
From: Michal Hocko @ 2019-08-13 15:04 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, Minchan Kim, Alexey Dobriyan, Andrew Morton,
	Borislav Petkov, Brendan Gregg, Catalin Marinas, Christian Hansen,
	dancol, fmayer, H. Peter Anvin, Ingo Molnar, joelaf,
	Jonathan Corbet, Kees Cook, kernel-team, linux-api, linux-doc,
	linux-fsdevel, linux-mm, Mike Rapoport, namhyung, paulmck,
	Robin Murphy, Roman Gushchin, Ste
In-Reply-To: <20190807171559.182301-2-joel@joelfernandes.org>

On Wed 07-08-19 13:15:55, Joel Fernandes (Google) wrote:
> Idle page tracking currently does not work well in the following
> scenario:
>  1. mark page-A idle which was present at that time.
>  2. run workload
>  3. page-A is not touched by workload
>  4. *sudden* memory pressure happen so finally page A is finally swapped out
>  5. now see the page A - it appears as if it was accessed (pte unmapped
>     so idle bit not set in output) - but it's incorrect.
> 
> To fix this, we store the idle information into a new idle bit of the
> swap PTE during swapping of anonymous pages.
>
> Also in the future, madvise extensions will allow a system process
> manager (like Android's ActivityManager) to swap pages out of a process
> that it knows will be cold. To an external process like a heap profiler
> that is doing idle tracking on another process, this procedure will
> interfere with the idle page tracking similar to the above steps.

This could be solved by checking the !present/swapped out pages
right? Whoever decided to put the page out to the swap just made it
idle effectively.  So the monitor can make some educated guess for
tracking. If that is fundamentally not possible then please describe
why.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Michal Hocko @ 2019-08-13 14:57 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Andrew Morton, linux-kernel, Alexey Dobriyan, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen, dancol, fmayer,
	H. Peter Anvin, Ingo Molnar, Jonathan Corbet, Kees Cook,
	kernel-team, linux-api, linux-doc, linux-fsdevel, linux-mm,
	Mike Rapoport, minchan, namhyung, paulmck, Robin Murphy,
	Roman Gushchin, Stephen Rothwell
In-Reply-To: <20190813144517.GE258732@google.com>

On Tue 13-08-19 10:45:17, Joel Fernandes wrote:
> On Tue, Aug 13, 2019 at 04:14:32PM +0200, Michal Hocko wrote:
> [snip] 
> > > > If the API is flawed then this is likely going
> > > > to kick us later and will be hard to fix. I am still not convinced about
> > > > the swap part of the thing TBH.
> > > 
> > > Ok, then let us discuss it. As I mentioned before, without this we lose the
> > > access information due to MADVISE or swapping. Minchan and Konstantin both
> > > suggested it that's why I also added it (other than me also realizing that it
> > > is neeed).
> > 
> > I have described my concerns about the general idle bit behavior after
> > unmapping pointing to discrepancy with !anon pages. And I believe those
> > haven't been addressed yet.
> 
> You are referring to this post right?
> https://lkml.org/lkml/2019/8/6/637
> 
> Specifically your question was:
> How are you going to handle situation when the page is unmapped  and refaulted again (e.g. a normal reclaim of a pagecache)?
> 
> Currently I don't know how to implement that. Would it work if I stored the
> page-idle bit information in the pte of the file page (after the page is
> unmapped by reclaim?).

It would work as long as we keep page tables around after unmap. As they
are easily reconstructable this is a good candidate for reclaim as well.

> Also, this could be a future extension - the Android heap profiler does not
> need it right now. I know that's not a good argument but it is useful to say
> that it doesn't affect a real world usecase.. the swap issue on the other
> hand, is a real usecase. Since the profiler should not get affected by
> swapping or MADVISE_COLD hints.
> 
> > Besides that I am still not seeing any
> > description of the usecase that would suffer from the lack of the
> > functionality in changelogs.
> 
> You are talking about the swap usecase? The usecase is well layed out in v5
> 2/6. Did you see it? https://lore.kernel.org/patchwork/patch/1112283/

For some reason I've missed it. I will coment on that.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Joel Fernandes @ 2019-08-13 14:45 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-kernel, Alexey Dobriyan, Borislav Petkov,
	Brendan Gregg, Catalin Marinas, Christian Hansen, dancol, fmayer,
	H. Peter Anvin, Ingo Molnar, Jonathan Corbet, Kees Cook,
	kernel-team, linux-api, linux-doc, linux-fsdevel, linux-mm,
	Mike Rapoport, minchan, namhyung, paulmck, Robin Murphy,
	Roman Gushchin, Stephen Rothwell
In-Reply-To: <20190813141432.GL17933@dhcp22.suse.cz>

On Tue, Aug 13, 2019 at 04:14:32PM +0200, Michal Hocko wrote:
[snip] 
> > > If the API is flawed then this is likely going
> > > to kick us later and will be hard to fix. I am still not convinced about
> > > the swap part of the thing TBH.
> > 
> > Ok, then let us discuss it. As I mentioned before, without this we lose the
> > access information due to MADVISE or swapping. Minchan and Konstantin both
> > suggested it that's why I also added it (other than me also realizing that it
> > is neeed).
> 
> I have described my concerns about the general idle bit behavior after
> unmapping pointing to discrepancy with !anon pages. And I believe those
> haven't been addressed yet.

You are referring to this post right?
https://lkml.org/lkml/2019/8/6/637

Specifically your question was:
How are you going to handle situation when the page is unmapped  and refaulted again (e.g. a normal reclaim of a pagecache)?

Currently I don't know how to implement that. Would it work if I stored the
page-idle bit information in the pte of the file page (after the page is
unmapped by reclaim?).

Also, this could be a future extension - the Android heap profiler does not
need it right now. I know that's not a good argument but it is useful to say
that it doesn't affect a real world usecase.. the swap issue on the other
hand, is a real usecase. Since the profiler should not get affected by
swapping or MADVISE_COLD hints.

> Besides that I am still not seeing any
> description of the usecase that would suffer from the lack of the
> functionality in changelogs.

You are talking about the swap usecase? The usecase is well layed out in v5
2/6. Did you see it? https://lore.kernel.org/patchwork/patch/1112283/

thanks,

 - Joel

^ permalink raw reply

* Re: [PATCH v5 1/6] mm/page_idle: Add per-pid idle page tracking using virtual index
From: Joel Fernandes @ 2019-08-13 14:25 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Jann Horn, kernel list, Alexey Dobriyan, Andrew Morton,
	Borislav Petkov, Brendan Gregg, Catalin Marinas, Christian Hansen,
	Daniel Colascione, fmayer, H. Peter Anvin, Ingo Molnar,
	Jonathan Corbet, Kees Cook, kernel-team, Linux API, linux-doc,
	linux-fsdevel, Linux-MM, Mike Rapoport, Minchan Kim
In-Reply-To: <20190813100856.GF17933@dhcp22.suse.cz>

On Tue, Aug 13, 2019 at 12:08:56PM +0200, Michal Hocko wrote:
> On Mon 12-08-19 20:14:38, Jann Horn wrote:
> > On Wed, Aug 7, 2019 at 7:16 PM Joel Fernandes (Google)
> > <joel@joelfernandes.org> wrote:
> > > The page_idle tracking feature currently requires looking up the pagemap
> > > for a process followed by interacting with /sys/kernel/mm/page_idle.
> > > Looking up PFN from pagemap in Android devices is not supported by
> > > unprivileged process and requires SYS_ADMIN and gives 0 for the PFN.
> > >
> > > This patch adds support to directly interact with page_idle tracking at
> > > the PID level by introducing a /proc/<pid>/page_idle file.  It follows
> > > the exact same semantics as the global /sys/kernel/mm/page_idle, but now
> > > looking up PFN through pagemap is not needed since the interface uses
> > > virtual frame numbers, and at the same time also does not require
> > > SYS_ADMIN.
> > >
> > > In Android, we are using this for the heap profiler (heapprofd) which
> > > profiles and pin points code paths which allocates and leaves memory
> > > idle for long periods of time. This method solves the security issue
> > > with userspace learning the PFN, and while at it is also shown to yield
> > > better results than the pagemap lookup, the theory being that the window
> > > where the address space can change is reduced by eliminating the
> > > intermediate pagemap look up stage. In virtual address indexing, the
> > > process's mmap_sem is held for the duration of the access.
> > 
> > What happens when you use this interface on shared pages, like memory
> > inherited from the zygote, library file mappings and so on? If two
> > profilers ran concurrently for two different processes that both map
> > the same libraries, would they end up messing up each other's data?
> 
> Yup PageIdle state is shared. That is the page_idle semantic even now
> IIRC.

Yes, that's right. This patch doesn't change that semantic. Idle page
tracking at the core is a global procedure which is based on pages that can
be shared.

One of the usecases of the heap profiler is to enable profiling of pages that
are shared between zygote and any processes that are forked. In this case,
I am told by our team working on the heap profiler, that the monitoring of
shared pages will help.

> > Can this be used to observe which library pages other processes are
> > accessing, even if you don't have access to those processes, as long
> > as you can map the same libraries? I realize that there are already a
> > bunch of ways to do that with side channels and such; but if you're
> > adding an interface that allows this by design, it seems to me like
> > something that should be gated behind some sort of privilege check.
> 
> Hmm, you need to be priviledged to get the pfn now and without that you
> cannot get to any page so the new interface is weakening the rules.
> Maybe we should limit setting the idle state to processes with the write
> status. Or do you think that even observing idle status is useful for
> practical side channel attacks? If yes, is that a problem of the
> profiler which does potentially dangerous things?

The heap profiler is currently unprivileged. Would it help the concern Jann
raised, if the new interface was limited to only anonymous private/shared
pages and not to file pages? Or, is this even a real concern?

thanks,

 - Joel

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox