* [PATCH v8 06/14] x86/cet/ibt: Add arch_prctl functions for IBT
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
From: "H.J. Lu" <hjl.tools@gmail.com>
Update ARCH_X86_CET_STATUS and ARCH_X86_CET_DISABLE to include
Indirect Branch Tracking features.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/uapi/asm/prctl.h | 2 ++
arch/x86/kernel/cet_prctl.c | 4 ++++
2 files changed, 6 insertions(+)
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index d962f0ec9ccf..02243127dcf6 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -18,5 +18,7 @@
#define ARCH_X86_CET_DISABLE 0x3002
#define ARCH_X86_CET_LOCK 0x3003
#define ARCH_X86_CET_ALLOC_SHSTK 0x3004
+#define ARCH_X86_CET_GET_LEGACY_BITMAP 0x3005 /* deprecated */
+#define ARCH_X86_CET_SET_LEGACY_BITMAP 0x3006 /* deprecated */
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
index 9c9d4262b07e..09d8c4ea935c 100644
--- a/arch/x86/kernel/cet_prctl.c
+++ b/arch/x86/kernel/cet_prctl.c
@@ -20,6 +20,8 @@ static int handle_get_status(unsigned long arg2)
if (current->thread.cet.shstk_enabled)
features |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+ if (current->thread.cet.ibt_enabled)
+ features |= GNU_PROPERTY_X86_FEATURE_1_IBT;
shstk_base = current->thread.cet.shstk_base;
shstk_size = current->thread.cet.shstk_size;
@@ -69,6 +71,8 @@ int prctl_cet(int option, unsigned long arg2)
return -EPERM;
if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
cet_disable_free_shstk(current);
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_IBT)
+ cet_disable_ibt();
return 0;
--
2.17.1
^ permalink raw reply related
* [PATCH v8 05/14] x86/cet/ibt: ELF header parsing for IBT
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
Look in .note.gnu.property of an ELF file and check if Indirect
Branch Tracking needs to be enabled for the task.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/Kconfig | 1 +
arch/x86/kernel/process_64.c | 5 +++++
2 files changed, 6 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 666dc66a382e..7edb544cef00 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1971,6 +1971,7 @@ config X86_INTEL_BRANCH_TRACKING_USER
select X86_INTEL_CET
select ARCH_HAS_AS_LIMIT
select ARCH_USE_GNU_PROPERTY
+ select ARCH_BINFMT_ELF_STATE
---help---
Indirect Branch Tracking provides hardware protection against return-/jmp-
oriented programming attacks.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7ec60b14e96d..a051cfc5242f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -856,6 +856,11 @@ int arch_setup_property(struct arch_elf_state *state)
return r;
}
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_IBT)
+ r = cet_setup_ibt();
+ }
+
return r;
}
#endif
--
2.17.1
^ permalink raw reply related
* [PATCH v8 04/14] mm/mmap: Add IBT bitmap size to address space limit check
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
The indirect branch tracking legacy bitmap takes a large address
space. This causes may_expand_vm() failure on the address limit
check. For a IBT-enabled task, add the bitmap size to the
address limit.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/mmu_context.h | 10 ++++++++++
mm/mmap.c | 19 ++++++++++++++++++-
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index a9a768529540..e18e89ef332d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -360,6 +360,16 @@ static inline unsigned long __get_current_cr3_fast(void)
return cr3;
}
+#ifdef CONFIG_X86_INTEL_BRANCH_TRACKING_USER
+static inline unsigned long arch_as_limit(void)
+{
+ if (current->thread.cet.ibt_enabled)
+ return IBT_BITMAP_SIZE;
+ else
+ return 0;
+}
+#endif
+
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1acded00f003..1bfd9aec9aed 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3283,13 +3283,30 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL;
}
+#ifndef CONFIG_ARCH_HAS_AS_LIMIT
+static inline unsigned long arch_as_limit(void)
+{
+ return 0;
+}
+#endif
+
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
- if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+ unsigned long as_limit = rlimit(RLIMIT_AS);
+ unsigned long as_limit_plus = as_limit + arch_as_limit();
+
+ /* as_limit_plus overflowed */
+ if (as_limit_plus < as_limit)
+ as_limit_plus = RLIM_INFINITY;
+
+ if (as_limit_plus > as_limit)
+ as_limit = as_limit_plus;
+
+ if (mm->total_vm + npages > as_limit >> PAGE_SHIFT)
return false;
if (is_data_mapping(flags) &&
--
2.17.1
^ permalink raw reply related
* [PATCH v8 03/14] x86/cet/ibt: Handle signals for end branch
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
Restore end branch tracking setting from thread header.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/kernel/cet.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 505a69f476e1..db542bd423cc 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -281,6 +281,15 @@ int cet_restore_signal(bool ia32, struct sc_ext *sc_ext)
msr_ia32_u_cet |= MSR_IA32_CET_SHSTK_EN;
}
+ if (current->thread.cet.ibt_enabled) {
+ if (current->thread.cet.ibt_bitmap_used)
+ msr_ia32_u_cet |= (IBT_BITMAP_ADDR |
+ MSR_IA32_CET_LEG_IW_EN);
+
+ msr_ia32_u_cet |= (MSR_IA32_CET_ENDBR_EN |
+ MSR_IA32_CET_NO_TRACK_EN);
+ }
+
wrmsrl(MSR_IA32_PL3_SSP, new_ssp);
wrmsrl(MSR_IA32_U_CET, msr_ia32_u_cet);
return 0;
@@ -321,6 +330,15 @@ int cet_setup_signal(bool ia32, unsigned long rstor_addr, struct sc_ext *sc_ext)
sc_ext->ssp = new_ssp;
}
+ if (current->thread.cet.ibt_enabled) {
+ if (current->thread.cet.ibt_bitmap_used)
+ msr_ia32_u_cet |= (IBT_BITMAP_ADDR |
+ MSR_IA32_CET_LEG_IW_EN);
+
+ msr_ia32_u_cet |= (MSR_IA32_CET_ENDBR_EN |
+ MSR_IA32_CET_NO_TRACK_EN);
+ }
+
modify_fpu_regs_begin();
wrmsrl(MSR_IA32_PL3_SSP, ssp);
wrmsrl(MSR_IA32_U_CET, msr_ia32_u_cet);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 02/14] x86/cet/ibt: User-mode indirect branch tracking support
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
Add user-mode indirect branch tracking enabling/disabling and
supporting routines.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 6 ++++
arch/x86/include/asm/disabled-features.h | 8 ++++-
arch/x86/kernel/cet.c | 36 +++++++++++++++++++
arch/x86/kernel/cpu/common.c | 17 +++++++++
.../arch/x86/include/asm/disabled-features.h | 8 ++++-
5 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 2df357dffd24..2561efe081ad 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -16,6 +16,8 @@ struct cet_status {
unsigned long shstk_size;
unsigned int locked:1;
unsigned int shstk_enabled:1;
+ unsigned int ibt_enabled:1;
+ unsigned int ibt_bitmap_used:1;
};
#ifdef CONFIG_X86_INTEL_CET
@@ -27,6 +29,8 @@ void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(bool ia32, struct sc_ext *sc);
int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
+int cet_setup_ibt(void);
+void cet_disable_ibt(void);
#else
static inline int prctl_cet(int option, unsigned long arg2) { return -EINVAL; }
static inline int cet_setup_shstk(void) { return -EINVAL; }
@@ -37,6 +41,8 @@ static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(bool ia32, struct sc_ext *sc) { return -EINVAL; }
static inline int cet_setup_signal(bool ia32, unsigned long rstor,
struct sc_ext *sc) { return -EINVAL; }
+static inline int cet_setup_ibt(void) { return -EINVAL; }
+static inline void cet_disable_ibt(void) {}
#endif
#define cpu_x86_cet_enabled() \
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 06323ebed643..fc7d3d5a1bf4 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -68,6 +68,12 @@
#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
#endif
+#ifdef CONFIG_X86_INTEL_BRANCH_TRACKING_USER
+#define DISABLE_IBT 0
+#else
+#define DISABLE_IBT (1<<(X86_FEATURE_IBT & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -89,7 +95,7 @@
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
-#define DISABLED_MASK18 0
+#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index e4e20d6ab07b..505a69f476e1 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -13,6 +13,8 @@
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
+#include <linux/vmalloc.h>
+#include <linux/bitops.h>
#include <asm/msr.h>
#include <asm/user.h>
#include <asm/fpu/internal.h>
@@ -325,3 +327,37 @@ int cet_setup_signal(bool ia32, unsigned long rstor_addr, struct sc_ext *sc_ext)
modify_fpu_regs_end();
return 0;
}
+
+int cet_setup_ibt(void)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBT))
+ return -EOPNOTSUPP;
+
+ modify_fpu_regs_begin();
+ rdmsrl(MSR_IA32_U_CET, r);
+ r |= (MSR_IA32_CET_ENDBR_EN | MSR_IA32_CET_NO_TRACK_EN);
+ wrmsrl(MSR_IA32_U_CET, r);
+ modify_fpu_regs_end();
+
+ current->thread.cet.ibt_enabled = 1;
+ return 0;
+}
+
+void cet_disable_ibt(void)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBT))
+ return;
+
+ modify_fpu_regs_begin();
+ rdmsrl(MSR_IA32_U_CET, r);
+ r &= ~(MSR_IA32_CET_ENDBR_EN | MSR_IA32_CET_LEG_IW_EN |
+ MSR_IA32_CET_NO_TRACK_EN | MSR_IA32_CET_BITMAP_MASK);
+ wrmsrl(MSR_IA32_U_CET, r);
+ modify_fpu_regs_end();
+
+ current->thread.cet.ibt_enabled = 0;
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d3addbd3f4d4..092979715d16 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -512,6 +512,23 @@ static __init int setup_disable_shstk(char *s)
__setup("no_cet_shstk", setup_disable_shstk);
#endif
+#ifdef CONFIG_X86_INTEL_BRANCH_TRACKING_USER
+static __init int setup_disable_ibt(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (s[0] != '\0')
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_IBT))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ pr_info("x86: 'no_cet_ibt' specified, disabling Branch Tracking\n");
+ return 1;
+}
+__setup("no_cet_ibt", setup_disable_ibt);
+#endif
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index 06323ebed643..fc7d3d5a1bf4 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -68,6 +68,12 @@
#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
#endif
+#ifdef CONFIG_X86_INTEL_BRANCH_TRACKING_USER
+#define DISABLE_IBT 0
+#else
+#define DISABLE_IBT (1<<(X86_FEATURE_IBT & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -89,7 +95,7 @@
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
-#define DISABLED_MASK18 0
+#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
--
2.17.1
^ permalink raw reply related
* [PATCH v8 01/14] x86/cet/ibt: Add Kconfig option for user-mode Indirect Branch Tracking
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205359.12196-1-yu-cheng.yu@intel.com>
The user-mode indirect branch tracking support is done mostly by GCC
to insert ENDBR64/ENDBR32 instructions at branch targets. The kernel
provides CPUID enumeration and feature setup.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/Kconfig | 16 ++++++++++++++++
arch/x86/Makefile | 7 +++++++
2 files changed, 23 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7d13ba326962..666dc66a382e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1940,6 +1940,9 @@ config X86_INTEL_CET
config ARCH_HAS_SHSTK
def_bool n
+config ARCH_HAS_AS_LIMIT
+ def_bool n
+
config X86_INTEL_SHADOW_STACK_USER
prompt "Intel Shadow Stack for user-mode"
def_bool n
@@ -1961,6 +1964,19 @@ config X86_INTEL_SHADOW_STACK_USER
If unsure, say y.
+config X86_INTEL_BRANCH_TRACKING_USER
+ prompt "Intel Indirect Branch Tracking for user-mode"
+ def_bool n
+ depends on CPU_SUP_INTEL && X86_64
+ select X86_INTEL_CET
+ select ARCH_HAS_AS_LIMIT
+ select ARCH_USE_GNU_PROPERTY
+ ---help---
+ Indirect Branch Tracking provides hardware protection against return-/jmp-
+ oriented programming attacks.
+
+ If unsure, say y
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0b2e9df48907..25372cc4a303 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,13 @@ ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
endif
endif
+# Check compiler ibt support
+ifdef CONFIG_X86_INTEL_BRANCH_TRACKING_USER
+ ifeq ($(call cc-option-yn, -fcf-protection=branch), n)
+ $(error CONFIG_X86_INTEL_BRANCH_TRACKING_USER not supported by compiler)
+ endif
+endif
+
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
--
2.17.1
^ permalink raw reply related
* [PATCH v8 00/14] Control-flow Enforcement: Branch Tracking, PTRACE
From: Yu-cheng Yu @ 2019-08-13 20:53 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
The previous version of CET Branch Tracking/PTRACE patches is here:
https://lkml.org/lkml/2019/6/6/1030
Summary of changes from v7:
Change legacy bitmap to a special mapping (patch #14).
Rebase to v5.3-rc4.
Small fixes in response to comments.
H.J. Lu (5):
x86/cet/ibt: Add arch_prctl functions for IBT
x86/vdso: Insert endbr32/endbr64 to vDSO
x86/vdso/32: Add ENDBR32 to __kernel_vsyscall entry point
x86/vsyscall/64: Add ENDBR64 to vsyscall entry points
x86: Discard .note.gnu.property sections
Yu-cheng Yu (9):
x86/cet/ibt: Add Kconfig option for user-mode Indirect Branch Tracking
x86/cet/ibt: User-mode indirect branch tracking support
x86/cet/ibt: Handle signals for end branch
mm/mmap: Add IBT bitmap size to address space limit check
x86/cet/ibt: ELF header parsing for IBT
x86/cet/ibt: Add ENDBR to op-code-map
x86/vsyscall/64: Fixup shadow stack and branch tracking for vsyscall
x86/cet: Add PTRACE interface for CET
Introduce arch_prctl(ARCH_X86_CET_MARK_LEGACY_CODE)
arch/x86/Kconfig | 17 ++
arch/x86/Makefile | 7 +
arch/x86/entry/vdso/Makefile | 12 +-
arch/x86/entry/vdso/vdso-layout.lds.S | 1 +
arch/x86/entry/vdso/vdso32/system_call.S | 3 +
arch/x86/entry/vsyscall/vsyscall_64.c | 29 +++
arch/x86/entry/vsyscall/vsyscall_emu_64.S | 9 +
arch/x86/entry/vsyscall/vsyscall_trace.h | 1 +
arch/x86/include/asm/cet.h | 9 +
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/fpu/regset.h | 7 +-
arch/x86/include/asm/mmu_context.h | 10 +
arch/x86/include/asm/processor.h | 13 +-
arch/x86/include/uapi/asm/prctl.h | 3 +
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/cet.c | 54 +++++
arch/x86/kernel/cet_bitmap.c | 210 ++++++++++++++++++
arch/x86/kernel/cet_prctl.c | 19 ++
arch/x86/kernel/cpu/common.c | 17 ++
arch/x86/kernel/fpu/regset.c | 41 ++++
arch/x86/kernel/process_64.c | 5 +
arch/x86/kernel/ptrace.c | 16 ++
arch/x86/kernel/vmlinux.lds.S | 10 +
arch/x86/lib/x86-opcode-map.txt | 13 +-
include/uapi/linux/elf.h | 1 +
mm/memory.c | 8 +
mm/mmap.c | 19 +-
.../arch/x86/include/asm/disabled-features.h | 8 +-
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 13 +-
29 files changed, 552 insertions(+), 13 deletions(-)
create mode 100644 arch/x86/kernel/cet_bitmap.c
--
2.17.1
^ permalink raw reply
* [PATCH v8 27/27] x86/cet/shstk: Add Shadow Stack instructions to opcode map
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
Add the following shadow stack management instructions.
INCSSP:
Increment shadow stack pointer by the steps specified.
RDSSP:
Read SSP register into a GPR.
SAVEPREVSSP:
Use "prev ssp" token at top of current shadow stack to
create a "restore token" on previous shadow stack.
RSTORSSP:
Restore from a "restore token" pointed by a GPR to SSP.
WRSS:
Write to kernel-mode shadow stack (kernel-mode instruction).
WRUSS:
Write to user-mode shadow stack (kernel-mode instruction).
SETSSBSY:
Verify the "supervisor token" pointed by IA32_PL0_SSP MSR,
if valid, set the token to busy, and set SSP to the value
of IA32_PL0_SSP MSR.
CLRSSBSY:
Verify the "supervisor token" pointed by a GPR, if valid,
clear the busy bit from the token.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/lib/x86-opcode-map.txt | 26 +++++++++++++------
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 26 +++++++++++++------
2 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..c5e825d44766 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -366,7 +366,7 @@ AVXcode: 1
1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
1c:
1d:
-1e:
+1e: RDSSP Rd (F3),REX.W
1f: NOP Ev
# 0x0f 0x20-0x2f
20: MOV Rd,Cd
@@ -610,7 +610,17 @@ fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff: UD0
EndTable
-Table: 3-byte opcode 1 (0x0f 0x38)
+Table: 3-byte opcode 1 (0x0f 0x01)
+Referrer:
+AVXcode:
+# Skip 0x00-0xe7
+e8: SETSSBSY (f3)
+e9:
+ea: SAVEPREVSSP (f3)
+# Skip 0xeb-0xff
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
@@ -789,12 +799,12 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
-f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSS Pq,Qq (66),REX.W
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
-Table: 3-byte opcode 2 (0x0f 0x3a)
+Table: 3-byte opcode 3 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
@@ -948,7 +958,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5: rdpkru (110),(11B) | wrpkru (111),(11B)
+5: rdpkru (110),(11B) | wrpkru (111),(11B) | RSTORSSP Mq (F3)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -1019,8 +1029,8 @@ GrpTable: Grp15
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE | ptwrite Ey (F3),(11B)
-5: XRSTOR | lfence (11B)
-6: XSAVEOPT | clwb (66) | mfence (11B)
+5: XRSTOR | lfence (11B) | INCSSP Rd (F3),REX.W
+6: XSAVEOPT | clwb (66) | mfence (11B) | CLRSSBSY Mq (F3)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..c5e825d44766 100644
--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
@@ -366,7 +366,7 @@ AVXcode: 1
1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
1c:
1d:
-1e:
+1e: RDSSP Rd (F3),REX.W
1f: NOP Ev
# 0x0f 0x20-0x2f
20: MOV Rd,Cd
@@ -610,7 +610,17 @@ fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff: UD0
EndTable
-Table: 3-byte opcode 1 (0x0f 0x38)
+Table: 3-byte opcode 1 (0x0f 0x01)
+Referrer:
+AVXcode:
+# Skip 0x00-0xe7
+e8: SETSSBSY (f3)
+e9:
+ea: SAVEPREVSSP (f3)
+# Skip 0xeb-0xff
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
@@ -789,12 +799,12 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
-f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSS Pq,Qq (66),REX.W
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
-Table: 3-byte opcode 2 (0x0f 0x3a)
+Table: 3-byte opcode 3 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
@@ -948,7 +958,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5: rdpkru (110),(11B) | wrpkru (111),(11B)
+5: rdpkru (110),(11B) | wrpkru (111),(11B) | RSTORSSP Mq (F3)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -1019,8 +1029,8 @@ GrpTable: Grp15
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE | ptwrite Ey (F3),(11B)
-5: XRSTOR | lfence (11B)
-6: XSAVEOPT | clwb (66) | mfence (11B)
+5: XRSTOR | lfence (11B) | INCSSP Rd (F3),REX.W
+6: XSAVEOPT | clwb (66) | mfence (11B) | CLRSSBSY Mq (F3)
7: clflush | clflushopt (66) | sfence (11B)
EndTable
--
2.17.1
^ permalink raw reply related
* [PATCH v8 26/27] x86/cet/shstk: Add arch_prctl functions for Shadow Stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
arch_prctl(ARCH_X86_CET_STATUS, unsigned long *addr)
Return CET feature status.
The parameter 'addr' is a pointer to a user buffer.
On returning to the caller, the kernel fills the following
information:
*addr = SHSTK/IBT status
*(addr + 1) = SHSTK base address
*(addr + 2) = SHSTK size
arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
Disable CET features specified in 'features'. Return
-EPERM if CET is locked.
arch_prctl(ARCH_X86_CET_LOCK)
Lock in CET feature.
arch_prctl(ARCH_X86_CET_ALLOC_SHSTK, unsigned long *addr)
Allocate a new SHSTK.
The parameter 'addr' is a pointer to a user buffer and indicates
the desired SHSTK size to allocate. On returning to the caller
the buffer contains the address of the new SHSTK.
There is no CET enabling arch_prctl function. By design, CET is
enabled automatically if the binary and the system can support it.
The parameters passed are always unsigned 64-bit. When an ia32
application passing pointers, it should only use the lower 32 bits.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 5 ++
arch/x86/include/uapi/asm/prctl.h | 5 ++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/cet.c | 29 +++++++++++
arch/x86/kernel/cet_prctl.c | 85 +++++++++++++++++++++++++++++++
arch/x86/kernel/process.c | 4 +-
6 files changed, 127 insertions(+), 3 deletions(-)
create mode 100644 arch/x86/kernel/cet_prctl.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 52c506a68848..2df357dffd24 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -14,19 +14,24 @@ struct sc_ext;
struct cet_status {
unsigned long shstk_base;
unsigned long shstk_size;
+ unsigned int locked:1;
unsigned int shstk_enabled:1;
};
#ifdef CONFIG_X86_INTEL_CET
+int prctl_cet(int option, unsigned long arg2);
int cet_setup_shstk(void);
int cet_setup_thread_shstk(struct task_struct *p);
+int cet_alloc_shstk(unsigned long *arg);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(bool ia32, struct sc_ext *sc);
int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
#else
+static inline int prctl_cet(int option, unsigned long arg2) { return -EINVAL; }
static inline int cet_setup_shstk(void) { return -EINVAL; }
static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
+static inline int cet_alloc_shstk(unsigned long *arg) { return -EINVAL; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(bool ia32, struct sc_ext *sc) { return -EINVAL; }
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..d962f0ec9ccf 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,9 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_X86_CET_STATUS 0x3001
+#define ARCH_X86_CET_DISABLE 0x3002
+#define ARCH_X86_CET_LOCK 0x3003
+#define ARCH_X86_CET_ALLOC_SHSTK 0x3004
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index c7d918a87cac..311829335521 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -140,7 +140,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_INTEL_CET) += cet.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o cet_prctl.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index e876150178ca..e4e20d6ab07b 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -127,6 +127,35 @@ static int create_rstor_token(bool ia32, unsigned long ssp,
return 0;
}
+int cet_alloc_shstk(unsigned long *arg)
+{
+ unsigned long len = *arg;
+ unsigned long addr;
+ unsigned long token;
+ unsigned long ssp;
+
+ addr = do_mmap_locked(NULL, 0, len, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK, NULL);
+ if (addr >= TASK_SIZE_MAX)
+ return -ENOMEM;
+
+ /* Restore token is 8 bytes and aligned to 8 bytes */
+ ssp = addr + len;
+ token = ssp;
+
+ if (!in_ia32_syscall())
+ token |= TOKEN_MODE_64;
+ ssp -= 8;
+
+ if (write_user_shstk_64(ssp, token)) {
+ vm_munmap(addr, len);
+ return -EINVAL;
+ }
+
+ *arg = addr;
+ return 0;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index 000000000000..9c9d4262b07e
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/prctl.h>
+#include <linux/compat.h>
+#include <linux/mman.h>
+#include <linux/elfcore.h>
+#include <asm/processor.h>
+#include <asm/prctl.h>
+#include <asm/cet.h>
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int handle_get_status(unsigned long arg2)
+{
+ unsigned int features = 0;
+ unsigned long shstk_base, shstk_size;
+ unsigned long buf[3];
+
+ if (current->thread.cet.shstk_enabled)
+ features |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ shstk_base = current->thread.cet.shstk_base;
+ shstk_size = current->thread.cet.shstk_size;
+
+ buf[0] = (unsigned long)features;
+ buf[1] = shstk_base;
+ buf[2] = shstk_size;
+ return copy_to_user((unsigned long __user *)arg2, buf,
+ sizeof(buf));
+}
+
+static int handle_alloc_shstk(unsigned long arg2)
+{
+ int err = 0;
+ unsigned long arg;
+ unsigned long addr = 0;
+ unsigned long size = 0;
+
+ if (get_user(arg, (unsigned long __user *)arg2))
+ return -EFAULT;
+
+ size = arg;
+ err = cet_alloc_shstk(&arg);
+ if (err)
+ return err;
+
+ addr = arg;
+ if (put_user(addr, (unsigned long __user *)arg2)) {
+ vm_munmap(addr, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int prctl_cet(int option, unsigned long arg2)
+{
+ if (!cpu_x86_cet_enabled())
+ return -EINVAL;
+
+ switch (option) {
+ case ARCH_X86_CET_STATUS:
+ return handle_get_status(arg2);
+
+ case ARCH_X86_CET_DISABLE:
+ if (current->thread.cet.locked)
+ return -EPERM;
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+ cet_disable_free_shstk(current);
+
+ return 0;
+
+ case ARCH_X86_CET_LOCK:
+ current->thread.cet.locked = 1;
+ return 0;
+
+ case ARCH_X86_CET_ALLOC_SHSTK:
+ return handle_alloc_shstk(arg2);
+
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 58b1c52b38b5..e0090f2790df 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -873,7 +873,7 @@ long do_arch_prctl_common(struct task_struct *task, int option,
return get_cpuid_mode();
case ARCH_SET_CPUID:
return set_cpuid_mode(task, cpuid_enabled);
+ default:
+ return prctl_cet(option, cpuid_enabled);
}
-
- return -EINVAL;
}
--
2.17.1
^ permalink raw reply related
* [PATCH v8 25/27] mm/mmap: Add Shadow stack pages to memory accounting
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
Add shadow stack pages to memory accounting.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/mmap.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index b1a921c0de63..1acded00f003 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1703,6 +1703,9 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
if (file && is_file_hugepages(file))
return 0;
+ if (arch_copy_pte_mapping(vm_flags))
+ return 1;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
@@ -3319,6 +3322,8 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
+ else if (arch_copy_pte_mapping(flags))
+ mm->stack_vm += npages;
}
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 24/27] x86/cet/shstk: Handle thread shadow stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
The shadow stack for clone/fork is handled as the following:
(1) If ((clone_flags & (CLONE_VFORK | CLONE_VM)) == CLONE_VM),
the kernel allocates (and frees on thread exit) a new SHSTK
for the child.
It is possible for the kernel to complete the clone syscall
and set the child's SHSTK pointer to NULL and let the child
thread allocate a SHSTK for itself. There are two issues
in this approach: It is not compatible with existing code
that does inline syscall and it cannot handle signals before
the child can successfully allocate a SHSTK.
(2) For (clone_flags & CLONE_VFORK), the child uses the existing
SHSTK.
(3) For all other cases, the SHSTK is copied/reused whenever the
parent or the child does a call/ret.
This patch handles cases (1) & (2). Case (3) is handled in the
SHSTK page fault patches.
A 64-bit SHSTK has a fixed size of RLIMIT_STACK. A compat-mode
thread SHSTK has a fixed size of 1/4 RLIMIT_STACK. This allows
more threads to share a 32-bit address space.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 2 ++
arch/x86/include/asm/mmu_context.h | 3 +++
arch/x86/kernel/cet.c | 41 ++++++++++++++++++++++++++++++
arch/x86/kernel/process.c | 1 +
arch/x86/kernel/process_64.c | 7 +++++
5 files changed, 54 insertions(+)
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 422ccb8adbb7..52c506a68848 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -19,12 +19,14 @@ struct cet_status {
#ifdef CONFIG_X86_INTEL_CET
int cet_setup_shstk(void);
+int cet_setup_thread_shstk(struct task_struct *p);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(bool ia32, struct sc_ext *sc);
int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
#else
static inline int cet_setup_shstk(void) { return -EINVAL; }
+static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(bool ia32, struct sc_ext *sc) { return -EINVAL; }
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 9024236693d2..a9a768529540 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
+#include <asm/cet.h>
#include <asm/debugreg.h>
extern atomic64_t last_mm_ctx_id;
@@ -228,6 +229,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ cet_disable_free_shstk(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index f1cc8f4c57b8..e876150178ca 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -151,6 +151,47 @@ int cet_setup_shstk(void)
return 0;
}
+int cet_setup_thread_shstk(struct task_struct *tsk)
+{
+ unsigned long addr, size;
+ struct cet_user_state *state;
+
+ if (!current->thread.cet.shstk_enabled)
+ return 0;
+
+ state = get_xsave_addr(&tsk->thread.fpu.state.xsave,
+ XFEATURE_CET_USER);
+
+ if (!state)
+ return -EINVAL;
+
+ size = rlimit(RLIMIT_STACK);
+
+ /*
+ * Compat-mode pthreads share a limited address space.
+ * If each function call takes an average of four slots
+ * stack space, we need 1/4 of stack size for shadow stack.
+ */
+ if (in_compat_syscall())
+ size /= 4;
+
+ addr = do_mmap_locked(NULL, 0, size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK, NULL);
+
+ if (addr >= TASK_SIZE_MAX) {
+ tsk->thread.cet.shstk_base = 0;
+ tsk->thread.cet.shstk_size = 0;
+ tsk->thread.cet.shstk_enabled = 0;
+ return -ENOMEM;
+ }
+
+ fpu__prepare_write(&tsk->thread.fpu);
+ state->user_ssp = (u64)(addr + size - sizeof(u64));
+ tsk->thread.cet.shstk_base = addr;
+ tsk->thread.cet.shstk_size = size;
+ return 0;
+}
+
void cet_disable_shstk(void)
{
u64 r;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index a4deb79b1089..58b1c52b38b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -130,6 +130,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
+ cet_disable_free_shstk(tsk);
fpu__drop(fpu);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1232f7a6c023..7ec60b14e96d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -411,6 +411,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (sp)
childregs->sp = sp;
+ /* Allocate a new shadow stack for pthread */
+ if ((clone_flags & (CLONE_VFORK | CLONE_VM)) == CLONE_VM) {
+ err = cet_setup_thread_shstk(p);
+ if (err)
+ goto out;
+ }
+
err = -ENOMEM;
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
--
2.17.1
^ permalink raw reply related
* [PATCH v8 23/27] x86/cet/shstk: ELF header parsing of Shadow Stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
to be enabled for the task.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/Kconfig | 2 ++
arch/x86/include/asm/elf.h | 13 +++++++++++++
arch/x86/kernel/process_64.c | 34 ++++++++++++++++++++++++++++++++++
3 files changed, 49 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index eaf86ef13348..7d13ba326962 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1947,6 +1947,8 @@ config X86_INTEL_SHADOW_STACK_USER
select ARCH_USES_HIGH_VMA_FLAGS
select X86_INTEL_CET
select ARCH_HAS_SHSTK
+ select ARCH_USE_GNU_PROPERTY
+ select ARCH_BINFMT_ELF_STATE
---help---
Shadow stack provides hardware protection against program stack
corruption. Only when all the following are true will an application
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 69c0f892e310..fac79b621e0a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -367,6 +367,19 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
+#ifdef CONFIG_ARCH_BINFMT_ELF_STATE
+struct arch_elf_state {
+ unsigned int gnu_property;
+};
+
+#define INIT_ARCH_ELF_STATE { \
+ .gnu_property = 0, \
+}
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0)
+#endif
+
/* Do not change the values. See get_align_mask() */
enum align_flags {
ALIGN_VA_32 = BIT(0),
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index af64519b2695..1232f7a6c023 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -818,3 +818,37 @@ unsigned long KSTK_ESP(struct task_struct *task)
{
return task_pt_regs(task)->sp;
}
+
+#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
+int arch_parse_property(void *ehdr, void *phdr, struct file *f, bool inter,
+ struct arch_elf_state *state)
+{
+ int r = 0;
+ unsigned int property = 0;
+
+ r = get_gnu_property(ehdr, phdr, f, GNU_PROPERTY_X86_FEATURE_1_AND,
+ &property);
+
+ if (r)
+ return r;
+
+ state->gnu_property = property;
+ return 0;
+}
+
+int arch_setup_property(struct arch_elf_state *state)
+{
+ int r = 0;
+
+ memset(¤t->thread.cet, 0, sizeof(struct cet_status));
+
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+ if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+ r = cet_setup_shstk();
+ if (r < 0)
+ return r;
+ }
+
+ return r;
+}
+#endif
--
2.17.1
^ permalink raw reply related
* [PATCH v8 22/27] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
An ELF file's .note.gnu.property indicates features the executable file
can support. For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
GNU_PROPERTY_X86_FEATURE_1_SHSTK.
With this patch, if an arch needs to setup features from ELF properties,
it needs CONFIG_ARCH_USE_GNU_PROPERTY to be set, and specific
arch_parse_property() and arch_setup_property().
For example, for X86_64:
int arch_setup_property(void *ehdr, void *phdr, struct file *f, bool inter)
{
int r;
uint32_t property;
r = get_gnu_property(ehdr, phdr, f, GNU_PROPERTY_X86_FEATURE_1_AND,
&property);
...
}
This patch is derived from code provided by H.J. Lu <hjl.tools@gmail.com>.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
fs/Kconfig.binfmt | 3 +
fs/Makefile | 1 +
fs/binfmt_elf.c | 20 +++++
fs/gnu_property.c | 178 +++++++++++++++++++++++++++++++++++++++
include/linux/elf.h | 11 +++
include/uapi/linux/elf.h | 14 +++
6 files changed, 227 insertions(+)
create mode 100644 fs/gnu_property.c
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 62dc4f577ba1..d2cfe0729a73 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -36,6 +36,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_USE_GNU_PROPERTY
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
diff --git a/fs/Makefile b/fs/Makefile
index d60089fd689b..939b1eb7e8cc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
+obj-$(CONFIG_ARCH_USE_GNU_PROPERTY) += gnu_property.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4e11b2e04f6..a4e87fcb10a8 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -852,6 +852,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
}
}
+ if (interpreter) {
+ retval = arch_parse_property(&loc->interp_elf_ex,
+ interp_elf_phdata,
+ interpreter, true,
+ &arch_state);
+ } else {
+ retval = arch_parse_property(&loc->elf_ex,
+ elf_phdata,
+ bprm->file, false,
+ &arch_state);
+ }
+
+ if (retval)
+ goto out_free_dentry;
+
/*
* Allow arch code to reject the ELF at this point, whilst it's
* still possible to return an error to the code that invoked
@@ -1080,6 +1095,11 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out_free_dentry;
}
+ retval = arch_setup_property(&arch_state);
+
+ if (retval < 0)
+ goto out_free_dentry;
+
if (interpreter) {
unsigned long interp_map_addr = 0;
diff --git a/fs/gnu_property.c b/fs/gnu_property.c
new file mode 100644
index 000000000000..b22b43f4d6a0
--- /dev/null
+++ b/fs/gnu_property.c
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Extract an ELF file's .note.gnu.property.
+ *
+ * The path from the ELF header to .note.gnu.property is:
+ * elfhdr->elf_phdr->elf_note.
+ *
+ * .note.gnu.property layout:
+ *
+ * struct elf_note {
+ * u32 n_namesz; --> sizeof(n_name[]); always (4)
+ * u32 n_ndescsz;--> sizeof(property[])
+ * u32 n_type; --> always NT_GNU_PROPERTY_TYPE_0 (5)
+ * };
+ * char n_name[4]; --> always 'GNU\0'
+ *
+ * struct {
+ * struct gnu_property {
+ * u32 pr_type;
+ * u32 pr_datasz;
+ * };
+ * u8 pr_data[pr_datasz];
+ * }[];
+ */
+
+#include <linux/elf.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/compat.h>
+
+/*
+ * Search a note's payload for 'pr_type'.
+ */
+static int check_note_payload(void *buf, unsigned long len, u32 pr_type,
+ u32 *property)
+{
+ u32 pr_type_max = 0;
+
+ *property = 0;
+
+ while (len > 0) {
+ struct gnu_property *pr = buf;
+ unsigned long pr_len;
+
+ if (sizeof(*pr) > len)
+ return 0;
+
+ pr_len = sizeof(*pr) + pr->pr_datasz;
+
+ if (pr_len > len)
+ return -ENOEXEC;
+
+ /* property types are in ascending order */
+ if ((pr_type_max != 0) && (pr->pr_type > pr_type_max))
+ return 0;
+
+ if (pr->pr_type > pr_type)
+ return 0;
+
+ if ((pr->pr_type == pr_type) &&
+ (pr->pr_datasz >= sizeof(u32))) {
+ *property = *(u32 *)(buf + sizeof(*pr));
+ return 0;
+ }
+
+ if (pr->pr_type > pr_type_max)
+ pr_type_max = pr->pr_type;
+
+ buf += pr_len;
+ len -= pr_len;
+ }
+
+ return 0;
+}
+
+/*
+ * Look at an ELF file's NT_GNU_PROPERTY for the property of pr_type.
+ *
+ * Input:
+ * buf: the buffer containing the whole note.
+ * len: size of buf.
+ * align: alignment of the note's payload.
+ * pr_type: the property type.
+ *
+ * Output:
+ * The property found.
+ *
+ * Return:
+ * Zero or error.
+ */
+static int check_note(void *buf, unsigned long len, int align,
+ u32 pr_type, u32 *property)
+{
+ struct elf_note *n = buf;
+ char *note_name = buf + sizeof(*n);
+ unsigned long payload_offset;
+ unsigned long payload_len;
+
+ if (len < sizeof(*n) + 4)
+ return -ENOEXEC;
+
+ if ((n->n_namesz != 4) || strncmp("GNU", note_name, 3))
+ return -ENOEXEC;
+
+ payload_offset = round_up(sizeof(*n) + n->n_namesz, align);
+ payload_len = n->n_descsz;
+
+ if (payload_offset + payload_len > len)
+ return -ENOEXEC;
+
+ buf += payload_offset;
+ len -= payload_offset;
+
+ return check_note_payload(buf, len, pr_type, property);
+}
+
+#define find_note(phdr, nr_phdrs, align, pos, len) { \
+ int cnt; \
+ \
+ for (cnt = 0; cnt < nr_phdrs; cnt++) { \
+ if ((phdr)[cnt].p_align != align) \
+ continue; \
+ if ((phdr)[cnt].p_type == PT_GNU_PROPERTY) { \
+ pos = (phdr)[cnt].p_offset; \
+ len = (phdr)[cnt].p_filesz; \
+ } \
+ } \
+}
+
+int get_gnu_property(void *ehdr, void *phdr, struct file *file,
+ u32 pr_type, u32 *property)
+{
+ Elf64_Ehdr *ehdr64 = ehdr;
+ Elf32_Ehdr *ehdr32 = ehdr;
+ void *buf;
+ int align;
+ loff_t pos = 0;
+ unsigned long len = 0;
+ int err = 0;
+
+ /*
+ * Find PT_GNU_PROPERTY from ELF program headers.
+ */
+ if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
+ align = 8;
+ find_note((Elf64_Phdr *)phdr, ehdr64->e_phnum, align, pos, len);
+ } else if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
+ align = 4;
+ find_note((Elf32_Phdr *)phdr, ehdr32->e_phnum, align, pos, len);
+ }
+
+ /*
+ * Read in the whole note. PT_GNU_PROPERTY
+ * is not expected to be larger than a page.
+ */
+ if (len == 0)
+ return 0;
+
+ if (len > PAGE_SIZE)
+ return -ENOEXEC;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ err = kernel_read(file, buf, len, &pos);
+ if (err < len) {
+ if (err >= 0)
+ err = -EIO;
+ goto out;
+ }
+
+ err = check_note(buf, len, align, pr_type, property);
+out:
+ kfree(buf);
+ return err;
+}
diff --git a/include/linux/elf.h b/include/linux/elf.h
index e3649b3e970e..c86cbfd17382 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -56,4 +56,15 @@ static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) {
extern int elf_coredump_extra_notes_size(void);
extern int elf_coredump_extra_notes_write(struct coredump_params *cprm);
#endif
+
+#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
+extern int arch_parse_property(void *ehdr, void *phdr, struct file *f,
+ bool inter, struct arch_elf_state *state);
+extern int arch_setup_property(struct arch_elf_state *state);
+extern int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
+ u32 pr_type, u32 *feature);
+#else
+#define arch_parse_property(ehdr, phdr, file, inter, state) (0)
+#define arch_setup_property(state) (0)
+#endif
#endif /* _LINUX_ELF_H */
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 34c02e4290fe..530ce08467c2 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -36,6 +36,7 @@ typedef __s64 Elf64_Sxword;
#define PT_LOPROC 0x70000000
#define PT_HIPROC 0x7fffffff
#define PT_GNU_EH_FRAME 0x6474e550
+#define PT_GNU_PROPERTY 0x6474e553
#define PT_GNU_STACK (PT_LOOS + 0x474e551)
@@ -443,4 +444,17 @@ typedef struct elf64_note {
Elf64_Word n_type; /* Content type */
} Elf64_Nhdr;
+/* NT_GNU_PROPERTY_TYPE_0 header */
+struct gnu_property {
+ __u32 pr_type;
+ __u32 pr_datasz;
+};
+
+/* .note.gnu.property types */
+#define GNU_PROPERTY_X86_FEATURE_1_AND 0xc0000002
+
+/* Bits of GNU_PROPERTY_X86_FEATURE_1_AND */
+#define GNU_PROPERTY_X86_FEATURE_1_IBT 0x00000001
+#define GNU_PROPERTY_X86_FEATURE_1_SHSTK 0x00000002
+
#endif /* _UAPI_LINUX_ELF_H */
--
2.17.1
^ permalink raw reply related
* [PATCH v8 21/27] x86/cet/shstk: Handle signals for shadow stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
When setting up a signal, the kernel creates a shadow stack restore
token at the current SHSTK address and then stores the token's
address in the signal frame, right after the FPU state. Before
restoring a signal, the kernel verifies and then uses the restore
token to set the SHSTK pointer.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/ia32/ia32_signal.c | 8 ++
arch/x86/include/asm/cet.h | 7 ++
arch/x86/include/asm/fpu/internal.h | 2 +
arch/x86/include/asm/fpu/signal.h | 2 +
arch/x86/include/uapi/asm/sigcontext.h | 15 +++
arch/x86/kernel/cet.c | 141 +++++++++++++++++++++++++
arch/x86/kernel/fpu/signal.c | 67 ++++++++++++
arch/x86/kernel/signal.c | 8 ++
8 files changed, 250 insertions(+)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 1cee10091b9f..73a0c1dc3123 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,6 +34,7 @@
#include <asm/sigframe.h>
#include <asm/sighandling.h>
#include <asm/smap.h>
+#include <asm/cet.h>
/*
* Do a signal return; undo the signal stack.
@@ -235,6 +236,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
ksig->ka.sa.sa_restorer)
sp = (unsigned long) ksig->ka.sa.sa_restorer;
+ sp = fpu__alloc_sigcontext_ext(sp);
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
@@ -295,6 +297,9 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
restorer = &frame->retcode;
}
+ if (setup_fpu_system_states(1, (unsigned long)restorer, fpstate))
+ return -EFAULT;
+
put_user_try {
put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
@@ -384,6 +389,9 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_fpu_system_states(1, (unsigned long)restorer, fpstate);
+
if (err)
return -EFAULT;
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index c952a2ec65fe..422ccb8adbb7 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -6,6 +6,8 @@
#include <linux/types.h>
struct task_struct;
+struct sc_ext;
+
/*
* Per-thread CET status
*/
@@ -19,10 +21,15 @@ struct cet_status {
int cet_setup_shstk(void);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
+int cet_restore_signal(bool ia32, struct sc_ext *sc);
+int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
#else
static inline int cet_setup_shstk(void) { return -EINVAL; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
+static inline int cet_restore_signal(bool ia32, struct sc_ext *sc) { return -EINVAL; }
+static inline int cet_setup_signal(bool ia32, unsigned long rstor,
+ struct sc_ext *sc) { return -EINVAL; }
#endif
#define cpu_x86_cet_enabled() \
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2ca5c36a77d5..8a40d676c448 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -472,6 +472,8 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
__copy_kernel_to_fpregs(fpstate, -1);
}
+extern int setup_fpu_system_states(int is_ia32, unsigned long restorer,
+ void __user *fp);
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
/*
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..630a658aeea3 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -25,6 +25,8 @@ extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
extern void convert_to_fxsr(struct fxregs_state *fxsave,
const struct user_i387_ia32_struct *env);
+unsigned long fpu__alloc_sigcontext_ext(unsigned long sp);
+
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size);
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..e3b08d1c0d3b 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -196,6 +196,21 @@ struct _xstate {
/* New processor state extensions go here: */
};
+/*
+ * Sigcontext extension (struct sc_ext) is located after
+ * sigcontext->fpstate. Because currently only the shadow
+ * stack pointer is saved there and the shadow stack depends
+ * on XSAVES, we can find sc_ext from sigcontext->fpstate.
+ *
+ * The 64-bit fpstate has a size of fpu_user_xstate_size, plus
+ * FP_XSTATE_MAGIC2_SIZE when XSAVE* is used. The struct sc_ext
+ * is located at the end of sigcontext->fpstate, aligned to 8.
+ */
+struct sc_ext {
+ unsigned long total_size;
+ unsigned long ssp;
+};
+
/*
* The 32-bit signal frame:
*/
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 5e7af0cc75f9..f1cc8f4c57b8 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -19,6 +19,8 @@
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/cet.h>
+#include <asm/special_insns.h>
+#include <uapi/asm/sigcontext.h>
static int set_shstk_ptr(unsigned long addr)
{
@@ -51,6 +53,80 @@ static unsigned long get_shstk_addr(void)
return ptr;
}
+#define TOKEN_MODE_MASK 3UL
+#define TOKEN_MODE_64 1UL
+#define IS_TOKEN_64(token) ((token & TOKEN_MODE_MASK) == TOKEN_MODE_64)
+#define IS_TOKEN_32(token) ((token & TOKEN_MODE_MASK) == 0)
+
+/*
+ * Verify the restore token at the address of 'ssp' is
+ * valid and then set shadow stack pointer according to the
+ * token.
+ */
+static int verify_rstor_token(bool ia32, unsigned long ssp,
+ unsigned long *new_ssp)
+{
+ unsigned long token;
+
+ *new_ssp = 0;
+
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+
+ if (get_user(token, (unsigned long __user *)ssp))
+ return -EFAULT;
+
+ /* Is 64-bit mode flag correct? */
+ if (!ia32 && !IS_TOKEN_64(token))
+ return -EINVAL;
+ else if (ia32 && !IS_TOKEN_32(token))
+ return -EINVAL;
+
+ token &= ~TOKEN_MODE_MASK;
+
+ /*
+ * Restore address properly aligned?
+ */
+ if ((!ia32 && !IS_ALIGNED(token, 8)) || !IS_ALIGNED(token, 4))
+ return -EINVAL;
+
+ /*
+ * Token was placed properly?
+ */
+ if ((ALIGN_DOWN(token, 8) - 8) != ssp)
+ return -EINVAL;
+
+ *new_ssp = token;
+ return 0;
+}
+
+/*
+ * Create a restore token on the shadow stack.
+ * A token is always 8-byte and aligned to 8.
+ */
+static int create_rstor_token(bool ia32, unsigned long ssp,
+ unsigned long *new_ssp)
+{
+ unsigned long addr;
+
+ *new_ssp = 0;
+
+ if ((!ia32 && !IS_ALIGNED(ssp, 8)) || !IS_ALIGNED(ssp, 4))
+ return -EINVAL;
+
+ addr = ALIGN_DOWN(ssp, 8) - 8;
+
+ /* Is the token for 64-bit? */
+ if (!ia32)
+ ssp |= TOKEN_MODE_64;
+
+ if (write_user_shstk_64(addr, ssp))
+ return -EFAULT;
+
+ *new_ssp = addr;
+ return 0;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
@@ -114,3 +190,68 @@ void cet_disable_free_shstk(struct task_struct *tsk)
tsk->thread.cet.shstk_enabled = 0;
}
+
+/*
+ * Called from __fpu__restore_sig() under the protection
+ * of fpregs_lock().
+ */
+int cet_restore_signal(bool ia32, struct sc_ext *sc_ext)
+{
+ unsigned long new_ssp = 0;
+ u64 msr_ia32_u_cet = 0;
+ int err;
+
+ if (current->thread.cet.shstk_enabled) {
+ err = verify_rstor_token(ia32, sc_ext->ssp, &new_ssp);
+ if (err)
+ return err;
+
+ msr_ia32_u_cet |= MSR_IA32_CET_SHSTK_EN;
+ }
+
+ wrmsrl(MSR_IA32_PL3_SSP, new_ssp);
+ wrmsrl(MSR_IA32_U_CET, msr_ia32_u_cet);
+ return 0;
+}
+
+/*
+ * Setup the shadow stack for the signal handler: first,
+ * create a restore token to keep track of the current ssp,
+ * and then the return address of the signal handler.
+ */
+int cet_setup_signal(bool ia32, unsigned long rstor_addr, struct sc_ext *sc_ext)
+{
+ unsigned long ssp = 0, new_ssp = 0;
+ u64 msr_ia32_u_cet = 0;
+ int err;
+
+ msr_ia32_u_cet = 0;
+ ssp = 0;
+
+ if (current->thread.cet.shstk_enabled) {
+ ssp = get_shstk_addr();
+ err = create_rstor_token(ia32, ssp, &new_ssp);
+ if (err)
+ return err;
+
+ if (ia32) {
+ ssp = new_ssp - sizeof(u32);
+ err = write_user_shstk_32(ssp, (unsigned int)rstor_addr);
+ } else {
+ ssp = new_ssp - sizeof(u64);
+ err = write_user_shstk_64(ssp, rstor_addr);
+ }
+
+ if (err)
+ return err;
+
+ msr_ia32_u_cet |= MSR_IA32_CET_SHSTK_EN;
+ sc_ext->ssp = new_ssp;
+ }
+
+ modify_fpu_regs_begin();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ wrmsrl(MSR_IA32_U_CET, msr_ia32_u_cet);
+ modify_fpu_regs_end();
+ return 0;
+}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 4ecf1764a971..2673a2567174 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -52,6 +52,58 @@ static inline int check_for_xstate(struct fxregs_state __user *buf,
return 0;
}
+int setup_fpu_system_states(int is_ia32, unsigned long restorer,
+ void __user *fp)
+{
+ int err = 0;
+
+#ifdef CONFIG_X86_64
+ if (cpu_x86_cet_enabled() && fp) {
+ struct sc_ext ext = {0, 0};
+
+ err = cet_setup_signal(is_ia32, restorer, &ext);
+ if (!err) {
+ void __user *p;
+
+ ext.total_size = sizeof(ext);
+
+ p = fp + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
+ p = (void __user *)ALIGN((unsigned long)p, 8);
+
+ if (copy_to_user(p, &ext, sizeof(ext)))
+ return -EFAULT;
+ }
+ }
+#endif
+
+ return err;
+}
+
+static int restore_fpu_system_states(int is_ia32, void __user *fp)
+{
+ int err = 0;
+
+#ifdef CONFIG_X86_64
+ if (cpu_x86_cet_enabled() && fp) {
+ struct sc_ext ext = {0, 0};
+ void __user *p;
+
+ p = fp + fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
+ p = (void __user *)ALIGN((unsigned long)p, 8);
+
+ if (copy_from_user(&ext, p, sizeof(ext)))
+ return -EFAULT;
+
+ if (ext.total_size != sizeof(ext))
+ return -EFAULT;
+
+ err = cet_restore_signal(is_ia32, &ext);
+ }
+#endif
+
+ return err;
+}
+
/*
* Signal frame handlers.
*/
@@ -347,6 +399,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
pagefault_disable();
ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only);
pagefault_enable();
+
+ if (!ret)
+ ret = restore_fpu_system_states(0, buf);
+
if (!ret) {
fpregs_mark_activate();
fpregs_unlock();
@@ -433,6 +489,17 @@ int fpu__restore_sig(void __user *buf, int ia32_frame)
return __fpu__restore_sig(buf, buf_fx, size);
}
+unsigned long fpu__alloc_sigcontext_ext(unsigned long sp)
+{
+ /*
+ * sigcontext_ext is at: fpu + fpu_user_xstate_size +
+ * FP_XSTATE_MAGIC2_SIZE, then aligned to 8.
+ */
+ if (cpu_x86_cet_enabled())
+ sp -= (sizeof(struct sc_ext) + 8);
+ return sp;
+}
+
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ce9421ec285f..92a2c2b2dcd0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -46,6 +46,7 @@
#include <asm/sigframe.h>
#include <asm/signal.h>
+#include <asm/cet.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -264,6 +265,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = (unsigned long) ka->sa.sa_restorer;
}
+ sp = fpu__alloc_sigcontext_ext(sp);
sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
&buf_fx, &math_size);
*fpstate = (void __user *)sp;
@@ -493,6 +495,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_fpu_system_states(0, (unsigned long)ksig->ka.sa.sa_restorer, fp);
+
if (err)
return -EFAULT;
@@ -579,6 +584,9 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (!err)
+ err = setup_fpu_system_states(0, (unsigned long)ksig->ka.sa.sa_restorer, fpstate);
+
if (err)
return -EFAULT;
--
2.17.1
^ permalink raw reply related
* [PATCH v8 20/27] x86/cet/shstk: Introduce WRUSS instruction
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
WRUSS is a new kernel-mode instruction but writes directly to user
shadow stack memory. This is used to construct a return address on
the shadow stack for the signal handler.
This instruction can fault if the user shadow stack is invalid shadow
stack memory. In that case, the kernel does a fixup.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/special_insns.h | 32 ++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 219be88a59d2..10f821d6b469 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -246,6 +246,38 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_INTEL_CET
+#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ asm_volatile_goto("1: wrussd %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EPERM;
+}
+#else
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ WARN_ONCE(1, "%s used but not supported.\n", __func__);
+ return -EFAULT;
+}
+#endif
+
+static inline int write_user_shstk_64(unsigned long addr, unsigned long val)
+{
+ asm_volatile_goto("1: wrussq %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EPERM;
+}
+#endif /* CONFIG_X86_INTEL_CET */
+
#define nop() asm volatile ("nop")
--
2.17.1
^ permalink raw reply related
* [PATCH v8 19/27] x86/cet/shstk: User-mode shadow stack support
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
This patch adds basic shadow stack enabling/disabling routines.
A task's shadow stack is allocated from memory with VM_SHSTK flag set
and read-only protection. It has a fixed size of RLIMIT_STACK.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 34 +++++
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/processor.h | 5 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/cet.c | 116 ++++++++++++++++++
arch/x86/kernel/cpu/common.c | 25 ++++
arch/x86/kernel/process.c | 1 +
.../arch/x86/include/asm/disabled-features.h | 8 +-
8 files changed, 197 insertions(+), 2 deletions(-)
create mode 100644 arch/x86/include/asm/cet.h
create mode 100644 arch/x86/kernel/cet.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index 000000000000..c952a2ec65fe
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_status {
+ unsigned long shstk_base;
+ unsigned long shstk_size;
+ unsigned int shstk_enabled:1;
+};
+
+#ifdef CONFIG_X86_INTEL_CET
+int cet_setup_shstk(void);
+void cet_disable_shstk(void);
+void cet_disable_free_shstk(struct task_struct *p);
+#else
+static inline int cet_setup_shstk(void) { return -EINVAL; }
+static inline void cet_disable_shstk(void) {}
+static inline void cet_disable_free_shstk(struct task_struct *p) {}
+#endif
+
+#define cpu_x86_cet_enabled() \
+ (cpu_feature_enabled(X86_FEATURE_SHSTK) || \
+ cpu_feature_enabled(X86_FEATURE_IBT))
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index a5ea841cc6d2..06323ebed643 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -62,6 +62,12 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+#define DISABLE_SHSTK 0
+#else
+#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -81,7 +87,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e0a3b43d027..0f9bc7fd1351 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -24,6 +24,7 @@ struct vm86;
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
+#include <asm/cet.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -490,6 +491,10 @@ struct thread_struct {
unsigned int sig_on_uaccess_err:1;
unsigned int uaccess_err:1; /* uaccess failed */
+#ifdef CONFIG_X86_INTEL_CET
+ struct cet_status cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3578ad248bc9..c7d918a87cac 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -140,6 +140,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..5e7af0cc75f9
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * cet.c - Control-flow Enforcement (CET)
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <asm/msr.h>
+#include <asm/user.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/cet.h>
+
+static int set_shstk_ptr(unsigned long addr)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -1;
+
+ if ((addr >= TASK_SIZE_MAX) || (!IS_ALIGNED(addr, 4)))
+ return -1;
+
+ modify_fpu_regs_begin();
+ rdmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_PL3_SSP, addr);
+ wrmsrl(MSR_IA32_U_CET, r | MSR_IA32_CET_SHSTK_EN);
+ modify_fpu_regs_end();
+ return 0;
+}
+
+static unsigned long get_shstk_addr(void)
+{
+ unsigned long ptr;
+
+ if (!current->thread.cet.shstk_enabled)
+ return 0;
+
+ modify_fpu_regs_begin();
+ rdmsrl(MSR_IA32_PL3_SSP, ptr);
+ modify_fpu_regs_end();
+ return ptr;
+}
+
+int cet_setup_shstk(void)
+{
+ unsigned long addr, size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -EOPNOTSUPP;
+
+ size = rlimit(RLIMIT_STACK);
+ addr = do_mmap_locked(NULL, 0, size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK, NULL);
+
+ /*
+ * Return actual error from do_mmap().
+ */
+ if (addr >= TASK_SIZE_MAX)
+ return addr;
+
+ set_shstk_ptr(addr + size - sizeof(u64));
+ current->thread.cet.shstk_base = addr;
+ current->thread.cet.shstk_size = size;
+ current->thread.cet.shstk_enabled = 1;
+ return 0;
+}
+
+void cet_disable_shstk(void)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return;
+
+ modify_fpu_regs_begin();
+ rdmsrl(MSR_IA32_U_CET, r);
+ r &= ~(MSR_IA32_CET_SHSTK_EN);
+ wrmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ modify_fpu_regs_end();
+ current->thread.cet.shstk_enabled = 0;
+}
+
+void cet_disable_free_shstk(struct task_struct *tsk)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+ !tsk->thread.cet.shstk_enabled)
+ return;
+
+ if (tsk->mm && (tsk == current))
+ cet_disable_shstk();
+
+ /*
+ * Free only when tsk is current or shares mm
+ * with current but has its own shstk.
+ */
+ if (tsk->mm && (tsk->mm == current->mm) &&
+ (tsk->thread.cet.shstk_base)) {
+ vm_munmap(tsk->thread.cet.shstk_base,
+ tsk->thread.cet.shstk_size);
+ tsk->thread.cet.shstk_base = 0;
+ tsk->thread.cet.shstk_size = 0;
+ }
+
+ tsk->thread.cet.shstk_enabled = 0;
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f125bf7ecb6f..d3addbd3f4d4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -53,6 +53,7 @@
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
+#include <asm/cet.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -488,6 +489,29 @@ static __init int setup_disable_pku(char *arg)
__setup("nopku", setup_disable_pku);
#endif /* CONFIG_X86_64 */
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ if (cpu_x86_cet_enabled())
+ cr4_set_bits(X86_CR4_CET);
+}
+
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+static __init int setup_disable_shstk(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (s[0] != '\0')
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_SHSTK))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_SHSTK);
+ pr_info("x86: 'no_cet_shstk' specified, disabling Shadow Stack\n");
+ return 1;
+}
+__setup("no_cet_shstk", setup_disable_shstk);
+#endif
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -1481,6 +1505,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
x86_init_rdrand(c);
x86_init_cache_qos(c);
setup_pku(c);
+ setup_cet(c);
/*
* Clear/Set all flags overridden by options, need do it
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d360bf4d696b..a4deb79b1089 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -42,6 +42,7 @@
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
#include <asm/proto.h>
+#include <asm/cet.h>
#include "process.h"
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index a5ea841cc6d2..06323ebed643 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -62,6 +62,12 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+#define DISABLE_SHSTK 0
+#else
+#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -81,7 +87,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
--
2.17.1
^ permalink raw reply related
* [PATCH v8 18/27] mm: Introduce do_mmap_locked()
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
There are a few places that need do_mmap() with mm->mmap_sem held.
Create an in-line function for that.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
include/linux/mm.h | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc58585014c9..275c385f53c6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2394,6 +2394,24 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
+static inline unsigned long do_mmap_locked(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long prot,
+ unsigned long flags, vm_flags_t vm_flags, struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long populate;
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap(file, addr, len, prot, flags, vm_flags, 0,
+ &populate, uf);
+ up_write(&mm->mmap_sem);
+
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
+}
+
/* These take the mm semaphore themselves */
extern int __must_check vm_brk(unsigned long, unsigned long);
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 17/27] mm: Update can_follow_write_pte/pmd for shadow stack
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
can_follow_write_pte/pmd look for the (RO & DIRTY) PTE/PMD to
verify an exclusive RO page still exists after a broken COW.
A shadow stack PTE is RO & PAGE_DIRTY_SW when it is shared,
otherwise RO & PAGE_DIRTY_HW.
Introduce pte_exclusive() and pmd_exclusive() to also verify a
shadow stack PTE is exclusive.
Also rename can_follow_write_pte/pmd() to can_follow_write() to
make their meaning clear; i.e. "Can we write to the page?", not
"Is the PTE writable?"
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/mm/pgtable.c | 18 ++++++++++++++++++
include/asm-generic/pgtable.h | 12 ++++++++++++
mm/gup.c | 8 +++++---
mm/huge_memory.c | 8 +++++---
4 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6f3959ca2a08..326715fd0c50 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -898,4 +898,22 @@ inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return (vm_flags & VM_SHSTK);
}
+
+inline bool pte_exclusive(pte_t pte, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pte_dirty_hw(pte);
+ else
+ return pte_dirty(pte);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+inline bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pmd_dirty_hw(pmd);
+ else
+ return pmd_dirty(pmd);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_X86_INTEL_SHADOW_STACK_USER */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 438ce73b57ea..b58f40525ebc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1203,10 +1203,22 @@ static inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return false;
}
+
+static inline bool pte_exclusive(pte_t pte, struct vm_area_struct *vma)
+{
+ return pte_dirty(pte);
+}
+
+static inline bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma)
+{
+ return pmd_dirty(pmd);
+}
#else
pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma);
bool arch_copy_pte_mapping(vm_flags_t vm_flags);
+bool pte_exclusive(pte_t pte, struct vm_area_struct *vma);
+bool pmd_exclusive(pmd_t pmd, struct vm_area_struct *vma);
#endif
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..d7b298c5f6cb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -179,10 +179,12 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
* FOLL_FORCE can write to even unwritable pte's, but only
* after we've gone through a COW cycle and they are dirty.
*/
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+static inline bool can_follow_write(pte_t pte, unsigned int flags,
+ struct vm_area_struct *vma)
{
return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+ pte_exclusive(pte, vma));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -220,7 +222,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
+ if ((flags & FOLL_WRITE) && !can_follow_write(pte, flags, vma)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 39d66c628121..947eb0121671 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1444,10 +1444,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
* FOLL_FORCE can write to even unwritable pmd's, but only
* after we've gone through a COW cycle and they are dirty.
*/
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+static inline bool can_follow_write(pmd_t pmd, unsigned int flags,
+ struct vm_area_struct *vma)
{
return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+ pmd_exclusive(pmd, vma));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1460,7 +1462,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+ if (flags & FOLL_WRITE && !can_follow_write(*pmd, flags, vma))
goto out;
/* Avoid dumping huge zero page */
--
2.17.1
^ permalink raw reply related
* [PATCH v8 16/27] mm: Handle THP/HugeTLB shadow stack page fault
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
This patch implements THP shadow stack (SHSTK) copying in the same
way as in the previous patch for regular PTE.
In copy_huge_pmd(), clear the dirty bit from the PMD to cause a page
fault upon the next SHSTK access to the PMD. At that time, fix the
PMD and copy/re-use the page.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/mm/pgtable.c | 8 ++++++++
include/asm-generic/pgtable.h | 6 ++++++
mm/huge_memory.c | 4 ++++
3 files changed, 18 insertions(+)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0c10d0c5e329..6f3959ca2a08 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -886,6 +886,14 @@ inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
return pte;
}
+inline pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pmd_mkdirty_shstk(pmd);
+ else
+ return pmd;
+}
+
inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return (vm_flags & VM_SHSTK);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 89b0fa132f1f..438ce73b57ea 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1194,12 +1194,18 @@ static inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
return pte;
}
+static inline pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma)
+{
+ return pmd;
+}
+
static inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
{
return false;
}
#else
pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
+pmd_t pmd_set_vma_features(pmd_t pmd, struct vm_area_struct *vma);
bool arch_copy_pte_mapping(vm_flags_t vm_flags);
#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1334ede667a8..39d66c628121 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -611,6 +611,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
@@ -1253,6 +1254,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@@ -1335,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
@@ -1407,6 +1410,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_set_vma_features(entry, vma);
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 15/27] mm: Handle shadow stack page fault
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
When a task does fork(), its shadow stack (SHSTK) must be duplicated
for the child. This patch implements a flow similar to copy-on-write
of an anonymous page, but for SHSTK.
A SHSTK PTE must be RO and dirty. This dirty bit requirement is used
to effect the copying. In copy_one_pte(), clear the dirty bit from a
SHSTK PTE to cause a page fault upon the next SHSTK access. At that
time, fix the PTE and copy/re-use the page.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/mm/pgtable.c | 15 +++++++++++++++
include/asm-generic/pgtable.h | 15 +++++++++++++++
mm/memory.c | 7 ++++++-
3 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 44816ff6411f..0c10d0c5e329 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -876,3 +876,18 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHSTK)
+ return pte_mkdirty_shstk(pte);
+ else
+ return pte;
+}
+
+inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
+{
+ return (vm_flags & VM_SHSTK);
+}
+#endif /* CONFIG_X86_INTEL_SHADOW_STACK_USER */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 75d9d68a6de7..89b0fa132f1f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1188,4 +1188,19 @@ static inline bool arch_has_pfn_modify_check(void)
#define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED)
#endif
+#ifndef CONFIG_ARCH_HAS_SHSTK
+static inline pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma)
+{
+ return pte;
+}
+
+static inline bool arch_copy_pte_mapping(vm_flags_t vm_flags)
+{
+ return false;
+}
+#else
+pte_t pte_set_vma_features(pte_t pte, struct vm_area_struct *vma);
+bool arch_copy_pte_mapping(vm_flags_t vm_flags);
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..be93a73b5152 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -754,7 +754,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
+ if ((is_cow_mapping(vm_flags) && pte_write(pte)) ||
+ arch_copy_pte_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -2273,6 +2274,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2348,6 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = pte_set_vma_features(entry, vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
@@ -2866,6 +2869,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = mk_pte(page, vma->vm_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ pte = pte_set_vma_features(pte, vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
@@ -3008,6 +3012,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_set_vma_features(entry, vma);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 14/27] x86/mm: Shadow stack page fault error checking
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
If a page fault is triggered by a shadow stack access (e.g. call/ret)
or shadow stack management instructions (e.g. wrussq), then bit[6] of
the page fault error code is set.
In access_error(), verify a shadow stack page fault is within a
shadow stack memory area. It is always an error otherwise.
For a valid shadow stack access, set FAULT_FLAG_WRITE to effect
copy-on-write.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/traps.h | 2 ++
arch/x86/mm/fault.c | 18 ++++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 8691261faeb0..918b0e48b2eb 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -166,6 +166,7 @@ enum {
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
@@ -174,5 +175,6 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
};
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ceacd1156db..75ec38d125fc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1187,6 +1187,17 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
(error_code & X86_PF_INSTR), foreign))
return 1;
+ /*
+ * Verify X86_PF_SHSTK is within a shadow stack VMA.
+ * It is always an error if there is a shadow stack
+ * fault outside a shadow stack VMA.
+ */
+ if (error_code & X86_PF_SHSTK) {
+ if (!(vma->vm_flags & VM_SHSTK))
+ return 1;
+ return 0;
+ }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1344,6 +1355,13 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * If the fault is caused by a shadow stack access,
+ * i.e. CALL/RET/SAVEPREVSSP/RSTORSSP, then set
+ * FAULT_FLAG_WRITE to effect copy-on-write.
+ */
+ if (hw_error_code & X86_PF_SHSTK)
+ flags |= FAULT_FLAG_WRITE;
if (hw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (hw_error_code & X86_PF_INSTR)
--
2.17.1
^ permalink raw reply related
* [PATCH v8 13/27] x86/mm: Modify ptep_set_wrprotect and pmdp_set_wrprotect for _PAGE_DIRTY_SW
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
When Shadow Stack is enabled, the [R/O + PAGE_DIRTY_HW] setting is
reserved only for the Shadow Stack. Non-Shadow Stack R/O PTEs use
[R/O + PAGE_DIRTY_SW].
When a PTE goes from [R/W + PAGE_DIRTY_HW] to [R/O + PAGE_DIRTY_SW],
it could become a transient Shadow Stack PTE in two cases.
The first case is that some processors can start a write but end up
seeing a read-only PTE by the time they get to the Dirty bit,
creating a transient Shadow Stack PTE. However, this will not occur
on processors supporting Shadow Stack therefore we don't need a TLB
flush here.
The second case is that when the software, without atomic, tests &
replaces PAGE_DIRTY_HW with PAGE_DIRTY_SW, a transient Shadow Stack
PTE can exist. This is prevented with cmpxchg.
Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided
many insights to the issue. Jann Horn provided the cmpxchg solution.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/pgtable.h | 58 ++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1448fb38f248..81c8c5ec221e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1222,7 +1222,36 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ pte_t new_pte, pte = READ_ONCE(*ptep);
+
+ /*
+ * Some processors can start a write, but end up
+ * seeing a read-only PTE by the time they get
+ * to the Dirty bit. In this case, they will
+ * set the Dirty bit, leaving a read-only, Dirty
+ * PTE which looks like a Shadow Stack PTE.
+ *
+ * However, this behavior has been improved and
+ * will not occur on processors supporting
+ * Shadow Stacks. Without this guarantee, a
+ * transition to a non-present PTE and flush the
+ * TLB would be needed.
+ *
+ * When changing a writable PTE to read-only and
+ * if the PTE has _PAGE_DIRTY_HW set, we move
+ * that bit to _PAGE_DIRTY_SW so that the PTE is
+ * not a valid Shadow Stack PTE.
+ */
+ do {
+ new_pte = pte_wrprotect(pte);
+ new_pte.pte |= (new_pte.pte & _PAGE_DIRTY_HW) >>
+ _PAGE_BIT_DIRTY_HW << _PAGE_BIT_DIRTY_SW;
+ new_pte.pte &= ~_PAGE_DIRTY_HW;
+ } while (!try_cmpxchg(ptep, &pte, new_pte));
+#else
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+#endif
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
@@ -1285,7 +1314,36 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ pmd_t new_pmd, pmd = READ_ONCE(*pmdp);
+
+ /*
+ * Some processors can start a write, but end up
+ * seeing a read-only PMD by the time they get
+ * to the Dirty bit. In this case, they will
+ * set the Dirty bit, leaving a read-only, Dirty
+ * PMD which looks like a Shadow Stack PMD.
+ *
+ * However, this behavior has been improved and
+ * will not occur on processors supporting
+ * Shadow Stacks. Without this guarantee, a
+ * transition to a non-present PMD and flush the
+ * TLB would be needed.
+ *
+ * When changing a writable PMD to read-only and
+ * if the PMD has _PAGE_DIRTY_HW set, we move
+ * that bit to _PAGE_DIRTY_SW so that the PMD is
+ * not a valid Shadow Stack PMD.
+ */
+ do {
+ new_pmd = pmd_wrprotect(pmd);
+ new_pmd.pmd |= (new_pmd.pmd & _PAGE_DIRTY_HW) >>
+ _PAGE_BIT_DIRTY_HW << _PAGE_BIT_DIRTY_SW;
+ new_pmd.pmd &= ~_PAGE_DIRTY_HW;
+ } while (!try_cmpxchg(pmdp, &pmd, new_pmd));
+#else
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+#endif
}
#define pud_write pud_write
--
2.17.1
^ permalink raw reply related
* [PATCH v8 12/27] drm/i915/gvt: Update _PAGE_DIRTY to _PAGE_DIRTY_BITS
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
Update _PAGE_DIRTY to _PAGE_DIRTY_BITS in split_2MB_gtt_entry().
In order to support Control-flow Enforcement (CET), _PAGE_DIRTY is
now _PAGE_DIRTY_HW or _PAGE_DIRTY_SW.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 4b04af569c05..e467ca182633 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1201,7 +1201,7 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
}
/* Clear dirty field. */
- se->val64 &= ~_PAGE_DIRTY;
+ se->val64 &= ~_PAGE_DIRTY_BITS;
ops->clear_pse(se);
ops->clear_ips(se);
--
2.17.1
^ permalink raw reply related
* [PATCH v8 11/27] x86/mm: Introduce _PAGE_DIRTY_SW
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
A RO and dirty PTE exists in the following cases:
(a) A page is modified and then shared with a fork()'ed child;
(b) A R/O page that has been COW'ed;
(c) A SHSTK page.
The processor does not read the dirty bit for (a) and (b), but
checks the dirty bit for (c). To prevent the use of non-SHSTK
memory as SHSTK, we introduce a spare bit of the 64-bit PTE as
_PAGE_BIT_DIRTY_SW and use that for (a) and (b). This results
to the following possible PTE settings:
Modified PTE: (R/W + DIRTY_HW)
Modified and shared PTE: (R/O + DIRTY_SW)
R/O PTE COW'ed: (R/O + DIRTY_SW)
SHSTK PTE: (R/O + DIRTY_HW)
SHSTK PTE COW'ed: (R/O + DIRTY_HW)
SHSTK PTE shared: (R/O + DIRTY_SW)
Note that _PAGE_BIT_DRITY_SW is only used in R/O PTEs but
not R/W PTEs.
When this patch is applied, there are six free bits left in
the 64-bit PTE. There is no more free bit in the 32-bit
PTE (except for PAE) and shadow stack is not implemented
for the 32-bit kernel.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/pgtable.h | 129 ++++++++++++++++++++++-----
arch/x86/include/asm/pgtable_types.h | 21 ++++-
2 files changed, 128 insertions(+), 22 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 8e38d87fce6e..1448fb38f248 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -120,9 +120,9 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
}
@@ -159,9 +159,9 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
}
static inline int pmd_young(pmd_t pmd)
@@ -169,9 +169,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -310,9 +310,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ if (pte_flags(pte) & from)
+ pte = pte_set_flags(pte_clear_flags(pte, from), to);
+ return pte;
+}
+#else
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ return pte;
+}
+#endif
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -322,6 +336,7 @@ static inline pte_t pte_mkold(pte_t pte)
static inline pte_t pte_wrprotect(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pte_clear_flags(pte, _PAGE_RW);
}
@@ -332,9 +347,24 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
+ pteval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ pte_write(pte)) ? _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pte_t pte_mkdirty_shstk(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_DIRTY_SW);
return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+static inline bool pte_dirty_hw(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY_HW;
+}
+#endif
+
static inline pte_t pte_mkyoung(pte_t pte)
{
return pte_set_flags(pte, _PAGE_ACCESSED);
@@ -342,6 +372,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
static inline pte_t pte_mkwrite(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pte_set_flags(pte, _PAGE_RW);
}
@@ -389,6 +420,20 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ if (pmd_flags(pmd) & from)
+ pmd = pmd_set_flags(pmd_clear_flags(pmd, from), to);
+ return pmd;
+}
+#else
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ return pmd;
+}
+#endif
+
static inline pmd_t pmd_mkold(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -396,19 +441,36 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pmd_clear_flags(pmd, _PAGE_RW);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
+ pmdval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pmd_flags(pmd) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pmd_t pmd_mkdirty_shstk(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_DIRTY_SW);
return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+static inline bool pmd_dirty_hw(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_HW;
+}
+#endif
+
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DEVMAP);
@@ -426,6 +488,7 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pmd_set_flags(pmd, _PAGE_RW);
}
@@ -443,6 +506,20 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ if (pud_flags(pud) & from)
+ pud = pud_set_flags(pud_clear_flags(pud, from), to);
+ return pud;
+}
+#else
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ return pud;
+}
+#endif
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -450,17 +527,22 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pud_clear_flags(pud, _PAGE_RW);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
+ pudval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pud_flags(pud) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+
+ return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -480,6 +562,7 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pud_set_flags(pud, _PAGE_RW);
}
@@ -611,19 +694,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
+ if ((pte_write(pte) && !(pgprot_val(newprot) & _PAGE_RW)))
+ return pte_move_flags(__pte(val), _PAGE_DIRTY_HW,
+ _PAGE_DIRTY_SW);
return __pte(val);
}
-static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- pmdval_t val = pmd_val(pmd), oldval = val;
-
- val &= _HPAGE_CHG_MASK;
- val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
- val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
- return __pmd(val);
-}
-
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -1178,6 +1254,19 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd), oldval = val;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
+ if ((pmd_write(pmd) && !(pgprot_val(newprot) & _PAGE_RW)))
+ return pmd_move_flags(__pmd(val), _PAGE_DIRTY_HW,
+ _PAGE_DIRTY_SW);
+ return __pmd(val);
+}
+
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e647e3c75578..cd95afc82e9f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,6 +23,7 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW5 57 /* available for programmer */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
@@ -34,6 +35,7 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_DIRTY_SW _PAGE_BIT_SOFTW5 /* was written to */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -108,6 +110,21 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
+/*
+ * _PAGE_DIRTY_HW: set by the processor when a page is written.
+ * _PAGE_DIRTY_SW: a spare bit tracking a written, but now R/O page.
+ * [R/W + _PAGE_DIRTY_HW] <-> [R/O + _PAGE_DIRTY_SW].
+ * _PAGE_SOFT_DIRTY: a spare bit used to track written pages since a time point
+ * set by the system admin; see Documentation/admin-guide/mm/soft-dirty.rst.
+ */
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_SW)
+#else
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY_HW | _PAGE_DIRTY_SW)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
@@ -121,9 +138,9 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+ _PAGE_DIRTY_SW | _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
--
2.17.1
^ permalink raw reply related
* [PATCH v8 10/27] x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
From: Yu-cheng Yu @ 2019-08-13 20:52 UTC (permalink / raw)
To: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
Andy Lutomirski, Balbir Singh, Borislav Petkov, Cyrill Gorcunov,
Dave Hansen, Eugene Syromiatnikov, Florian Weimer, H.J. Lu,
Jann Horn, Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit
Cc: Yu-cheng Yu
In-Reply-To: <20190813205225.12032-1-yu-cheng.yu@intel.com>
Before introducing _PAGE_DIRTY_SW for non-hardware, memory management
purposes in the next patch, rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
_PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty bits
more clear. There are no functional changes in this patch.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/pgtable.h | 6 +++---
arch/x86/include/asm/pgtable_types.h | 17 +++++++++--------
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kvm/vmx/vmx.c | 2 +-
4 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0bc530c4eb13..8e38d87fce6e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -332,7 +332,7 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -406,7 +406,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -460,7 +460,7 @@ static inline pud_t pud_wrprotect(pud_t pud)
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b5e49e6bac63..e647e3c75578 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -15,7 +15,7 @@
#define _PAGE_BIT_PWT 3 /* page write through */
#define _PAGE_BIT_PCD 4 /* page cache disabled */
#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_DIRTY_HW 6 /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -45,7 +45,7 @@
#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_DIRTY_HW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_HW)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
@@ -73,7 +73,7 @@
_PAGE_PKEY_BIT3)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY_HW | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
@@ -111,9 +111,9 @@
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* Set of bits not changed in pte_modify. The pte's
@@ -122,7 +122,7 @@
* pte_modify() does modify it.
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
_PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
@@ -167,7 +167,8 @@ enum page_cache_mode {
_PAGE_ACCESSED)
#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY_HW | _PAGE_ACCESSED | \
+ _PAGE_GLOBAL)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
@@ -186,7 +187,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | _PAGE_ENC)
+ _PAGE_DIRTY_HW | _PAGE_ENC)
#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c51ccff5cd01..60b75e8f4c14 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -15,7 +15,7 @@
*/
#define PTR(x) (x << 3)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 42ed3faa6af8..226875fbfa45 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3395,7 +3395,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW | _PAGE_PSE);
r = kvm_write_guest_page(kvm, identity_map_pfn,
&tmp, i * sizeof(tmp), sizeof(tmp));
if (r < 0)
--
2.17.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox