* [PATCH 07/10] mm: Prevent mprotect from changing shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/mprotect.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 625608bc8962..128dcb880c12 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -446,6 +446,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
error = -ENOMEM;
if (!vma)
goto out;
+
+ /*
+ * Do not allow changing shadow stack memory.
+ */
+ if (vma->vm_flags & VM_SHSTK) {
+ error = -EINVAL;
+ goto out;
+ }
+
prev = vma->vm_prev;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 06/10] x86/cet: Add arch_prctl functions for shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
The following operations are provided.
ARCH_CET_STATUS:
return the current CET status
ARCH_CET_DISABLE:
disable CET features
ARCH_CET_LOCK:
lock out CET features
ARCH_CET_EXEC:
set CET features for exec()
ARCH_CET_ALLOC_SHSTK:
allocate a new shadow stack
ARCH_CET_PUSH_SHSTK:
put a return address on shadow stack
ARCH_CET_ALLOC_SHSTK and ARCH_CET_PUSH_SHSTK are intended only for
the implementation of GLIBC ucontext related APIs.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 7 ++
arch/x86/include/uapi/asm/prctl.h | 15 +++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/cet.c | 18 +++-
arch/x86/kernel/cet_prctl.c | 203 ++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/elf.c | 24 ++++-
arch/x86/kernel/process.c | 7 ++
7 files changed, 270 insertions(+), 6 deletions(-)
create mode 100644 arch/x86/kernel/cet_prctl.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index c8fd87e13859..a2a53fe4d5e6 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -12,24 +12,31 @@ struct task_struct;
struct cet_stat {
unsigned long shstk_base;
unsigned long shstk_size;
+ unsigned long exec_shstk_size;
unsigned int shstk_enabled:1;
+ unsigned int locked:1;
+ unsigned int exec_shstk:2;
};
#ifdef CONFIG_X86_INTEL_CET
+int prctl_cet(int option, unsigned long arg2);
unsigned long cet_get_shstk_ptr(void);
int cet_push_shstk(int ia32, unsigned long ssp, unsigned long val);
int cet_setup_shstk(void);
int cet_setup_thread_shstk(struct task_struct *p);
+int cet_alloc_shstk(unsigned long *arg);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(unsigned long ssp);
int cet_setup_signal(int ia32, unsigned long addr);
#else
+static inline int prctl_cet(int option, unsigned long arg2) { return 0; }
static inline unsigned long cet_get_shstk_ptr(void) { return 0; }
static inline int cet_push_shstk(int ia32, unsigned long ssp,
unsigned long val) { return 0; }
static inline int cet_setup_shstk(void) { return 0; }
static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
+static inline int cet_alloc_shstk(unsigned long *arg) { return -EINVAL; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(unsigned long ssp) { return 0; }
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..f9965403b655 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,19 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_CET_STATUS 0x3001
+#define ARCH_CET_DISABLE 0x3002
+#define ARCH_CET_LOCK 0x3003
+#define ARCH_CET_EXEC 0x3004
+#define ARCH_CET_ALLOC_SHSTK 0x3005
+#define ARCH_CET_PUSH_SHSTK 0x3006
+
+/*
+ * Settings for ARCH_CET_EXEC
+ */
+#define CET_EXEC_ELF_PROPERTY 0
+#define CET_EXEC_ALWAYS_OFF 1
+#define CET_EXEC_ALWAYS_ON 2
+#define CET_EXEC_MAX CET_EXEC_ALWAYS_ON
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cbf983f44b61..80464f925a6a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,7 +138,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_INTEL_CET) += cet.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o cet_prctl.o
obj-$(CONFIG_ARCH_HAS_PROGRAM_PROPERTIES) += elf.o
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 156f5d88ffd5..1b7089dcf1ea 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -83,6 +83,19 @@ static unsigned long shstk_mmap(unsigned long addr, unsigned long len)
return addr;
}
+/*
+ * Map a new shadow stack in the current address space.
+ *
+ * On entry *arg holds the requested size.  On success *arg is replaced
+ * by the address returned from shstk_mmap() and 0 is returned.  When
+ * shstk_mmap() returns a value at or above TASK_SIZE, *arg is left
+ * untouched and -ENOMEM is returned.
+ */
+int cet_alloc_shstk(unsigned long *arg)
+{
+	unsigned long base = shstk_mmap(0, *arg);
+
+	if (base >= TASK_SIZE)
+		return -ENOMEM;
+
+	*arg = base;
+	return 0;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
@@ -90,7 +103,10 @@ int cet_setup_shstk(void)
if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
return -EOPNOTSUPP;
- size = SHSTK_SIZE;
+ size = current->thread.cet.exec_shstk_size;
+ if ((size > TASK_SIZE) || (size == 0))
+ size = SHSTK_SIZE;
+
addr = shstk_mmap(0, size);
if (addr >= TASK_SIZE)
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index 000000000000..326996e2ea80
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/prctl.h>
+#include <linux/compat.h>
+#include <asm/processor.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+#include <asm/elf_property.h>
+#include <asm/cet.h>
+
+/*
+ * Handler of prctl for CET:
+ *
+ * ARCH_CET_STATUS: return the current status
+ * ARCH_CET_DISABLE: disable features
+ * ARCH_CET_LOCK: lock out cet features until exec()
+ * ARCH_CET_EXEC: set default features for exec()
+ * ARCH_CET_ALLOC_SHSTK: allocate shadow stack
+ * ARCH_CET_PUSH_SHSTK: put a return address on shadow stack
+ */
+
+/*
+ * ARCH_CET_STATUS: report the CET state of the current task.
+ *
+ * Three words are written to the user buffer at @arg2:
+ *   [0] GNU_PROPERTY_X86_FEATURE_1_SHSTK if the shadow stack is enabled
+ *   [1] GNU_PROPERTY_X86_FEATURE_1_SHSTK if exec_shstk is
+ *       CET_EXEC_ALWAYS_ON
+ *   [2] exec_shstk_size
+ * Compat callers receive 32-bit words, other callers unsigned long.
+ *
+ * Returns 0 on success or -EFAULT when the buffer cannot be written.
+ */
+static int handle_get_status(unsigned long arg2)
+{
+	unsigned int features = 0, cet_exec = 0;
+	unsigned long shstk_size;
+	unsigned long left;
+
+	if (current->thread.cet.shstk_enabled)
+		features |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+	if (current->thread.cet.exec_shstk == CET_EXEC_ALWAYS_ON)
+		cet_exec |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+	shstk_size = current->thread.cet.exec_shstk_size;
+
+	if (in_compat_syscall()) {
+		unsigned int buf[3];
+
+		buf[0] = features;
+		buf[1] = cet_exec;
+		buf[2] = (unsigned int)shstk_size;
+		left = copy_to_user((unsigned int __user *)arg2, buf,
+				    sizeof(buf));
+	} else {
+		unsigned long buf[3];
+
+		buf[0] = (unsigned long)features;
+		buf[1] = (unsigned long)cet_exec;
+		buf[2] = shstk_size;
+		left = copy_to_user((unsigned long __user *)arg2, buf,
+				    sizeof(buf));
+	}
+
+	/*
+	 * copy_to_user() returns the number of bytes it could not copy,
+	 * not an errno.  Translate it so the prctl never returns a
+	 * positive byte count to its caller.
+	 */
+	return left ? -EFAULT : 0;
+}
+
+/*
+ * ARCH_CET_EXEC: set the CET defaults applied at exec().
+ *
+ * @arg2 points to three user words: { features, cet_exec, shstk_size },
+ * 32-bit each for compat callers and unsigned long otherwise.
+ *
+ * Returns 0 on success, -EFAULT if the words cannot be read, -EINVAL
+ * for an out-of-range setting or size (or a shadow stack request on a
+ * CPU without X86_FEATURE_SHSTK), and -EPERM when trying to move away
+ * from CET_EXEC_ALWAYS_ON.
+ */
+static int handle_set_exec(unsigned long arg2)
+{
+	unsigned int features, cet_exec;
+	unsigned long shstk_size;
+
+	if (in_compat_syscall()) {
+		unsigned int words[3];
+
+		if (copy_from_user(words, (unsigned int __user *)arg2,
+				   sizeof(words)))
+			return -EFAULT;
+
+		features = words[0];
+		cet_exec = words[1];
+		shstk_size = (unsigned long)words[2];
+	} else {
+		unsigned long words[3];
+
+		if (copy_from_user(words, (unsigned long __user *)arg2,
+				   sizeof(words)))
+			return -EFAULT;
+
+		features = (unsigned int)words[0];
+		cet_exec = (unsigned int)words[1];
+		shstk_size = words[2];
+	}
+
+	if (cet_exec > CET_EXEC_MAX)
+		return -EINVAL;
+	if (shstk_size >= TASK_SIZE)
+		return -EINVAL;
+
+	/* The exec setting is only touched when SHSTK is named in features. */
+	if (features & GNU_PROPERTY_X86_FEATURE_1_SHSTK) {
+		if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+			return -EINVAL;
+
+		/* Once ALWAYS_ON has been selected it cannot be relaxed. */
+		if (current->thread.cet.exec_shstk == CET_EXEC_ALWAYS_ON &&
+		    cet_exec != CET_EXEC_ALWAYS_ON)
+			return -EPERM;
+
+		current->thread.cet.exec_shstk = cet_exec;
+	}
+
+	/* The size is recorded regardless of the features word. */
+	current->thread.cet.exec_shstk_size = shstk_size;
+	return 0;
+}
+
+/*
+ * ARCH_CET_PUSH_SHSTK: put a return address on the shadow stack.
+ *
+ * @arg2 points to two user words: { ssp, ret_addr }, 32-bit each for
+ * ia32 callers and unsigned long otherwise.
+ *
+ * Returns 0 on success, -EFAULT if the words cannot be read, or the
+ * negative value reported by cet_push_shstk().
+ */
+static int handle_push_shstk(unsigned long arg2)
+{
+	unsigned long ssp = 0, ret_addr = 0;
+	int ia32, err;
+
+	ia32 = in_ia32_syscall();
+
+	if (ia32) {
+		unsigned int buf[2];
+
+		err = copy_from_user(buf, (unsigned int __user *)arg2,
+				     sizeof(buf));
+		if (!err) {
+			ssp = (unsigned long)buf[0];
+			ret_addr = (unsigned long)buf[1];
+		}
+	} else {
+		unsigned long buf[2];
+
+		err = copy_from_user(buf, (unsigned long __user *)arg2,
+				     sizeof(buf));
+		if (!err) {
+			ssp = buf[0];
+			ret_addr = buf[1];
+		}
+	}
+	if (err)
+		return -EFAULT;
+
+	/*
+	 * cet_push_shstk() already returns 0 or a negative value.  Pass it
+	 * through unchanged: negating it would hand a positive,
+	 * success-looking result back to the caller.
+	 */
+	return cet_push_shstk(ia32, ssp, ret_addr);
+}
+
+/*
+ * ARCH_CET_ALLOC_SHSTK: allocate a new shadow stack.
+ *
+ * @arg2 points to one user word holding the requested size (32-bit for
+ * ia32 callers, unsigned long otherwise).  On success that word is
+ * overwritten with the address of the new shadow stack.
+ *
+ * Returns 0 on success, -EFAULT if the word cannot be read or written,
+ * or the negative error reported by cet_alloc_shstk().
+ */
+static int handle_alloc_shstk(unsigned long arg2)
+{
+	unsigned long shstk_size = 0;
+	int ia32 = in_ia32_syscall();
+	int err;
+
+	if (ia32) {
+		unsigned int size;
+
+		err = get_user(size, (unsigned int __user *)arg2);
+		if (!err)
+			shstk_size = size;
+	} else {
+		err = get_user(shstk_size, (unsigned long __user *)arg2);
+	}
+	if (err)
+		return -EFAULT;
+
+	/*
+	 * cet_alloc_shstk() returns 0 or a negative errno; return it as is
+	 * so the caller never sees a positive value on failure.
+	 */
+	err = cet_alloc_shstk(&shstk_size);
+	if (err)
+		return err;
+
+	/*
+	 * NOTE(review): if this write-back faults, the mapping created
+	 * above stays in place but user space never learns its address -
+	 * confirm whether it should be unmapped on this path.
+	 */
+	if (ia32)
+		err = put_user(shstk_size, (unsigned int __user *)arg2);
+	else
+		err = put_user(shstk_size, (unsigned long __user *)arg2);
+
+	return err ? -EFAULT : 0;
+}
+
+/*
+ * Entry point for the ARCH_CET_* arch_prctl() options.
+ *
+ * @option: one of the ARCH_CET_* values
+ * @arg2:   option specific; a feature mask for ARCH_CET_DISABLE, a user
+ *          pointer for the options that are forwarded to a handle_*()
+ *          helper, and unused for ARCH_CET_LOCK
+ *
+ * Returns -EINVAL when X86_FEATURE_SHSTK is not enabled or @option is
+ * unknown, otherwise the result of the selected operation.
+ */
+int prctl_cet(int option, unsigned long arg2)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -EINVAL;
+
+ switch (option) {
+ case ARCH_CET_STATUS:
+ return handle_get_status(arg2);
+
+ case ARCH_CET_DISABLE:
+ /* Disabling is refused once the task has been locked. */
+ if (current->thread.cet.locked)
+ return -EPERM;
+ /* Only the SHSTK bit is acted on; other bits in arg2 are ignored. */
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+ cet_disable_free_shstk(current);
+
+ return 0;
+
+ case ARCH_CET_LOCK:
+ /* No unlock option is handled in this switch. */
+ current->thread.cet.locked = 1;
+ return 0;
+
+ case ARCH_CET_EXEC:
+ return handle_set_exec(arg2);
+
+ case ARCH_CET_ALLOC_SHSTK:
+ return handle_alloc_shstk(arg2);
+
+ case ARCH_CET_PUSH_SHSTK:
+ return handle_push_shstk(arg2);
+
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/arch/x86/kernel/elf.c b/arch/x86/kernel/elf.c
index 8e2719d8dc86..de08d41971f6 100644
--- a/arch/x86/kernel/elf.c
+++ b/arch/x86/kernel/elf.c
@@ -8,7 +8,10 @@
#include <asm/cet.h>
#include <asm/elf_property.h>
+#include <asm/prctl.h>
+#include <asm/processor.h>
#include <uapi/linux/elf-em.h>
+#include <uapi/linux/prctl.h>
#include <linux/binfmts.h>
#include <linux/elf.h>
#include <linux/slab.h>
@@ -208,13 +211,26 @@ int arch_setup_features(void *ehdr_p, void *phdr_p,
current->thread.cet.shstk_enabled = 0;
current->thread.cet.shstk_base = 0;
current->thread.cet.shstk_size = 0;
+ current->thread.cet.locked = 0;
if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
- if (shstk) {
- err = cet_setup_shstk();
- if (err < 0)
- goto out;
+ int exec = current->thread.cet.exec_shstk;
+
+ if (exec != CET_EXEC_ALWAYS_OFF) {
+ if (shstk || (exec == CET_EXEC_ALWAYS_ON)) {
+ err = cet_setup_shstk();
+ if (err < 0)
+ goto out;
+ }
}
}
+
+ /*
+ * Lockout CET features if no interpreter
+ */
+ if (!interp)
+ current->thread.cet.locked = 1;
+
+ err = 0;
out:
return err;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ae56caee41f9..54ad1863c6d2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -794,6 +794,13 @@ long do_arch_prctl_common(struct task_struct *task, int option,
return get_cpuid_mode();
case ARCH_SET_CPUID:
return set_cpuid_mode(task, cpuid_enabled);
+ case ARCH_CET_STATUS:
+ case ARCH_CET_DISABLE:
+ case ARCH_CET_LOCK:
+ case ARCH_CET_EXEC:
+ case ARCH_CET_ALLOC_SHSTK:
+ case ARCH_CET_PUSH_SHSTK:
+ return prctl_cet(option, cpuid_enabled);
}
return -EINVAL;
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 04/10] x86/cet: Handle thread shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
When fork() specifies CLONE_VM but not CLONE_VFORK, the child
needs a separate program stack and a separate shadow stack.
This patch handles allocation and freeing of the thread shadow
stack.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 2 ++
arch/x86/include/asm/mmu_context.h | 3 +++
arch/x86/kernel/cet.c | 34 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/process.c | 1 +
arch/x86/kernel/process_64.c | 7 +++++++
5 files changed, 47 insertions(+)
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 5507469cb803..c8fd87e13859 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -19,6 +19,7 @@ struct cet_stat {
unsigned long cet_get_shstk_ptr(void);
int cet_push_shstk(int ia32, unsigned long ssp, unsigned long val);
int cet_setup_shstk(void);
+int cet_setup_thread_shstk(struct task_struct *p);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
int cet_restore_signal(unsigned long ssp);
@@ -28,6 +29,7 @@ static inline unsigned long cet_get_shstk_ptr(void) { return 0; }
static inline int cet_push_shstk(int ia32, unsigned long ssp,
unsigned long val) { return 0; }
static inline int cet_setup_shstk(void) { return 0; }
+static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline int cet_restore_signal(unsigned long ssp) { return 0; }
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cf9911b5a53c..42395257efc3 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
+#include <asm/cet.h>
extern atomic64_t last_mm_ctx_id;
@@ -228,6 +229,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ cet_disable_free_shstk(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 6f445ce94c83..156f5d88ffd5 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -103,6 +103,40 @@ int cet_setup_shstk(void)
return 0;
}
+/*
+ * Allocate a shadow stack for the task @tsk and point its saved user
+ * shadow stack pointer at it.
+ *
+ * Returns 0 on success or when the calling task has no shadow stack
+ * enabled, -EINVAL when @tsk's xsave area has no
+ * XFEATURE_MASK_SHSTK_USER state, and -ENOMEM when the mapping fails.
+ */
+int cet_setup_thread_shstk(struct task_struct *tsk)
+{
+ unsigned long addr, size;
+ struct cet_user_state *state;
+
+ /* The decision is based on the caller (current), not on @tsk. */
+ if (!current->thread.cet.shstk_enabled)
+ return 0;
+
+ state = get_xsave_addr(&tsk->thread.fpu.state.xsave,
+ XFEATURE_MASK_SHSTK_USER);
+
+ if (!state)
+ return -EINVAL;
+
+ /* Reuse the size already recorded for @tsk, else the default. */
+ size = tsk->thread.cet.shstk_size;
+ if (size == 0)
+ size = SHSTK_SIZE;
+
+ addr = shstk_mmap(0, size);
+
+ /* A failed mapping leaves @tsk with the shadow stack turned off. */
+ if (addr >= TASK_SIZE) {
+ tsk->thread.cet.shstk_base = 0;
+ tsk->thread.cet.shstk_size = 0;
+ tsk->thread.cet.shstk_enabled = 0;
+ return -ENOMEM;
+ }
+
+ /* Initial pointer is the last u64 slot of the new mapping. */
+ state->user_ssp = (u64)(addr + size - sizeof(u64));
+ tsk->thread.cet.shstk_base = addr;
+ tsk->thread.cet.shstk_size = size;
+ tsk->thread.cet.shstk_enabled = 1;
+ return 0;
+}
+
void cet_disable_shstk(void)
{
u64 r;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b3b0b482983a..ae56caee41f9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -127,6 +127,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
+ cet_disable_free_shstk(tsk);
fpu__drop(fpu);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 12bb445fb98d..6e493b0bcedd 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -317,6 +317,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (sp)
childregs->sp = sp;
+ /* Allocate a new shadow stack for pthread */
+ if ((clone_flags & (CLONE_VFORK | CLONE_VM)) == CLONE_VM) {
+ err = cet_setup_thread_shstk(p);
+ if (err)
+ goto out;
+ }
+
err = -ENOMEM;
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 09/10] mm: Prevent madvise from changing shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/madvise.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..2a6988badd6b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -839,6 +839,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (vma && start > vma->vm_start)
prev = vma;
+ /*
+ * Don't do anything on shadow stack.
+ */
+ if (vma->vm_flags & VM_SHSTK) {
+ error = -EINVAL;
+ goto out_no_plug;
+ }
+
blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
@@ -876,6 +884,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
}
out:
blk_finish_plug(&plug);
+out_no_plug:
if (write)
up_write(&current->mm->mmap_sem);
else
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 10/10] mm: Prevent munmap and remap_file_pages of shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/mmap.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index fc41c0543d7f..e7d1fcb7ec58 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2810,6 +2810,16 @@ EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long end = addr + len;
+	int shstk = 0;
+
+	/*
+	 * Do not munmap shadow stack: refuse the request when any VMA
+	 * overlapping [addr, addr + len) is a shadow stack.
+	 *
+	 * find_vma() returns the first VMA that ends above addr.  That VMA
+	 * may start beyond the requested range, and further VMAs may follow
+	 * inside the range, so testing only that one VMA both rejects
+	 * harmless requests and misses shadow stacks later in the range.
+	 * If addr + len wraps, the walk is skipped and the request is
+	 * passed on (presumably vm_munmap() rejects it - TODO confirm).
+	 *
+	 * NOTE(review): mmap_sem is released before vm_munmap() runs, so
+	 * the address space may change in between; confirm whether this
+	 * check belongs inside the unmap path instead.
+	 */
+	down_read(&mm->mmap_sem);
+	for (vma = find_vma(mm, addr); vma && vma->vm_start < end;
+	     vma = vma->vm_next) {
+		if (vma->vm_flags & VM_SHSTK) {
+			shstk = 1;
+			break;
+		}
+	}
+	up_read(&mm->mmap_sem);
+	if (shstk)
+		return -EINVAL;
+
profile_munmap(addr);
return vm_munmap(addr, len);
}
@@ -2851,6 +2861,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
+ if (vma->vm_flags & VM_SHSTK)
+ goto out;
+
if (start < vma->vm_start)
goto out;
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 02/10] x86/cet: Introduce WRUSS instruction
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
WRUSS is a new kernel-mode instruction but writes directly
to user shadow stack memory. This is used to construct
a return address on the shadow stack for the signal
handler.
This instruction can fault if the user shadow stack is
invalid shadow stack memory. In that case, the kernel does
fixup.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/special_insns.h | 44 +++++++++++++++++++++++++++
arch/x86/lib/x86-opcode-map.txt | 2 +-
arch/x86/mm/fault.c | 13 +++++++-
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 2 +-
4 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 317fc59b512c..8ce532fcc171 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -237,6 +237,50 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_INTEL_CET
+
+#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
+/*
+ * Write the 32-bit value @val to the user shadow stack at @addr.
+ *
+ * The instruction is emitted as raw bytes (66 0f 38 f5 /r, the opcode
+ * listed as WRUSS in the x86 opcode map), with @val in (e)si and @addr
+ * in (r)di as required by the operand constraints.
+ *
+ * Returns 0 on success.  If the instruction faults, the exception table
+ * entry redirects execution to the fixup code, which returns -1.
+ */
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ int err;
+
+ asm volatile("1:.byte 0x66, 0x0f, 0x38, 0xf5, 0x37\n"
+ "xor %[err],%[err]\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: mov $-1,%[err]; jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=a" (err)
+ : [val] "S" (val), [addr] "D" (addr)
+ : "memory");
+ return err;
+}
+#else
+/* No 32-bit user ABI configured: nothing is written and 0 is returned. */
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Write the 64-bit value @val to the user shadow stack at @addr.
+ *
+ * Same byte sequence as the 32-bit variant with an additional 0x48
+ * (REX.W) prefix byte.  Returns 0 on success and -1 when the
+ * instruction faults and the fixup code runs.
+ */
+static inline int write_user_shstk_64(unsigned long addr, unsigned long val)
+{
+ int err;
+
+ asm volatile("1:.byte 0x66, 0x48, 0x0f, 0x38, 0xf5, 0x37\n"
+ "xor %[err],%[err]\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: mov $-1,%[err]; jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=a" (err)
+ : [val] "S" (val), [addr] "D" (addr)
+ : "memory");
+ return err;
+}
+#endif /* CONFIG_X86_INTEL_CET */
+
#define nop() asm volatile ("nop")
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..72bb7c48a7df 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -789,7 +789,7 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2b3b9170109c..f157338862f8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -640,6 +640,17 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
return 0;
}
+/*
+ * WRUSS is a kernel instruction, but it writes to user shadow stack
+ * memory.  When it faults, both X86_PF_USER and X86_PF_SHSTK are set
+ * in the error code.
+ *
+ * Returns non-zero when @error_code has both bits set and @regs does
+ * not describe a user-mode context.
+ */
+static int is_wruss(struct pt_regs *regs, unsigned long error_code)
+{
+ return (((error_code & (X86_PF_USER | X86_PF_SHSTK)) ==
+ (X86_PF_USER | X86_PF_SHSTK)) && !user_mode(regs));
+}
+
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
@@ -851,7 +862,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & X86_PF_USER) {
+ if ((error_code & X86_PF_USER) && !is_wruss(regs, error_code)) {
/*
* It's possible to have interrupts off here:
*/
diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
index e0b85930dd77..72bb7c48a7df 100644
--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
@@ -789,7 +789,7 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
f3: Grp17 (1A)
-f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSS Pq,Qq (66),REX.W
f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 03/10] x86/cet: Signal handling for shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:38 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
Set and restore shadow stack pointer for signals.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/ia32/ia32_signal.c | 5 ++++
arch/x86/include/asm/cet.h | 7 +++++
arch/x86/include/uapi/asm/sigcontext.h | 4 +++
arch/x86/kernel/cet.c | 51 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/signal.c | 11 ++++++++
5 files changed, 78 insertions(+)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 86b1341cba9a..26a776baff7c 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,6 +34,7 @@
#include <asm/sigframe.h>
#include <asm/sighandling.h>
#include <asm/smap.h>
+#include <asm/cet.h>
/*
* Do a signal return; undo the signal stack.
@@ -74,6 +75,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
unsigned int tmpflags, err = 0;
void __user *buf;
u32 tmp;
+ u32 ssp;
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -104,9 +106,11 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
get_user_ex(tmp, &sc->fpstate);
buf = compat_ptr(tmp);
+ get_user_ex(ssp, &sc->ssp);
} get_user_catch(err);
err |= fpu__restore_sig(buf, 1);
+ err |= cet_restore_signal((unsigned long)ssp);
force_iret();
@@ -194,6 +198,7 @@ static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc,
put_user_ex(current->thread.trap_nr, &sc->trapno);
put_user_ex(current->thread.error_code, &sc->err);
put_user_ex(regs->ip, &sc->ip);
+ put_user_ex((u32)cet_get_shstk_ptr(), &sc->ssp);
put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
put_user_ex(regs->flags, &sc->flags);
put_user_ex(regs->sp, &sc->sp_at_signal);
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 9d5bc1efc9b7..5507469cb803 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -17,14 +17,21 @@ struct cet_stat {
#ifdef CONFIG_X86_INTEL_CET
unsigned long cet_get_shstk_ptr(void);
+int cet_push_shstk(int ia32, unsigned long ssp, unsigned long val);
int cet_setup_shstk(void);
void cet_disable_shstk(void);
void cet_disable_free_shstk(struct task_struct *p);
+int cet_restore_signal(unsigned long ssp);
+int cet_setup_signal(int ia32, unsigned long addr);
#else
static inline unsigned long cet_get_shstk_ptr(void) { return 0; }
+static inline int cet_push_shstk(int ia32, unsigned long ssp,
+ unsigned long val) { return 0; }
static inline int cet_setup_shstk(void) { return 0; }
static inline void cet_disable_shstk(void) {}
static inline void cet_disable_free_shstk(struct task_struct *p) {}
+static inline int cet_restore_signal(unsigned long ssp) { return 0; }
+static inline int cet_setup_signal(int ia32, unsigned long addr) { return 0; }
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..6c8997a0156a 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -230,6 +230,7 @@ struct sigcontext_32 {
__u32 fpstate; /* Zero when no FPU/extended context */
__u32 oldmask;
__u32 cr2;
+ __u32 ssp;
};
/*
@@ -262,6 +263,7 @@ struct sigcontext_64 {
__u64 trapno;
__u64 oldmask;
__u64 cr2;
+ __u64 ssp;
/*
* fpstate is really (struct _fpstate *) or (struct _xstate *)
@@ -320,6 +322,7 @@ struct sigcontext {
struct _fpstate __user *fpstate;
__u32 oldmask;
__u32 cr2;
+ __u32 ssp;
};
# else /* __x86_64__: */
struct sigcontext {
@@ -377,6 +380,7 @@ struct sigcontext {
__u64 trapno;
__u64 oldmask;
__u64 cr2;
+ __u64 ssp;
struct _fpstate __user *fpstate; /* Zero when no FPU context */
# ifdef __ILP32__
__u32 __fpstate_pad;
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 8abbfd44322a..6f445ce94c83 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -17,6 +17,7 @@
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/cet.h>
+#include <asm/special_insns.h>
#define SHSTK_SIZE (0x8000 * (test_thread_flag(TIF_IA32) ? 4 : 8))
@@ -47,6 +48,24 @@ unsigned long cet_get_shstk_ptr(void)
return ptr;
}
+/*
+ * Set the shadow stack pointer to @ssp and store @val at that address.
+ *
+ * @ia32: non-zero to use a 4-byte entry (only honoured when
+ *        CONFIG_IA32_EMULATION is enabled), otherwise an 8-byte entry
+ * @ssp:  new shadow stack pointer; must be aligned to the entry size
+ * @val:  value to store; must be below TASK_SIZE
+ *
+ * Returns -EINVAL for a bad @val or misaligned @ssp, otherwise the
+ * result of write_user_shstk_32()/write_user_shstk_64().
+ */
+int cet_push_shstk(int ia32, unsigned long ssp, unsigned long val)
+{
+ if (val >= TASK_SIZE)
+ return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) && ia32) {
+ if (!IS_ALIGNED(ssp, 4))
+ return -EINVAL;
+ /*
+ * The pointer is updated before the write, so it stays at @ssp
+ * even if the write below fails.
+ * NOTE(review): the result of cet_set_shstk_ptr() is not checked
+ * here - confirm that is intended.
+ */
+ cet_set_shstk_ptr(ssp);
+ return write_user_shstk_32(ssp, (unsigned int)val);
+ } else {
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+ /* Same ordering as above: pointer first, then the write. */
+ cet_set_shstk_ptr(ssp);
+ return write_user_shstk_64(ssp, val);
+ }
+}
+
static unsigned long shstk_mmap(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
@@ -121,3 +140,35 @@ void cet_disable_free_shstk(struct task_struct *tsk)
tsk->thread.cet.shstk_enabled = 0;
}
+
+/*
+ * Set the shadow stack pointer of the current task to @ssp.
+ *
+ * Does nothing and returns 0 when the shadow stack is not enabled for
+ * the current task; otherwise returns the result of
+ * cet_set_shstk_ptr().
+ */
+int cet_restore_signal(unsigned long ssp)
+{
+	if (current->thread.cet.shstk_enabled)
+		return cet_set_shstk_ptr(ssp);
+
+	return 0;
+}
+
+/*
+ * Put the restorer address on the shadow stack of the current task.
+ *
+ * @ia32:       non-zero for a 4-byte entry, zero for an entry of
+ *              sizeof(unsigned long)
+ * @rstor_addr: restorer address to push
+ *
+ * Returns 0 when the shadow stack is not enabled, -EINVAL when the new
+ * entry would fall outside [shstk_base, shstk_base + shstk_size), and
+ * otherwise the result of cet_push_shstk().
+ */
+int cet_setup_signal(int ia32, unsigned long rstor_addr)
+{
+	struct cet_stat *cet = &current->thread.cet;
+	unsigned long ssp;
+
+	if (!cet->shstk_enabled)
+		return 0;
+
+	/* Step down by one entry from the current shadow stack pointer. */
+	ssp = cet_get_shstk_ptr();
+	ssp -= ia32 ? sizeof(u32) : sizeof(rstor_addr);
+
+	/* The new slot must lie inside the recorded shadow stack. */
+	if (ssp < cet->shstk_base ||
+	    ssp >= (cet->shstk_base + cet->shstk_size))
+		return -EINVAL;
+
+	return cet_push_shstk(ia32, ssp, rstor_addr);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index da270b95fe4d..86fb897cae19 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -46,6 +46,7 @@
#include <asm/sigframe.h>
#include <asm/signal.h>
+#include <asm/cet.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -102,6 +103,7 @@ static int restore_sigcontext(struct pt_regs *regs,
void __user *buf;
unsigned int tmpflags;
unsigned int err = 0;
+ unsigned long ssp = 0;
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -148,9 +150,11 @@ static int restore_sigcontext(struct pt_regs *regs,
get_user_ex(buf_val, &sc->fpstate);
buf = (void __user *)buf_val;
+ get_user_ex(ssp, &sc->ssp);
} get_user_catch(err);
err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
+ err |= cet_restore_signal(ssp);
force_iret();
@@ -193,6 +197,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
put_user_ex(current->thread.trap_nr, &sc->trapno);
put_user_ex(current->thread.error_code, &sc->err);
put_user_ex(regs->ip, &sc->ip);
+ put_user_ex(cet_get_shstk_ptr(), &sc->ssp);
#ifdef CONFIG_X86_32
put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
put_user_ex(regs->flags, &sc->flags);
@@ -742,6 +747,12 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
user_disable_single_step(current);
failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ unsigned long rstor = (unsigned long)ksig->ka.sa.sa_restorer;
+ int ia32 = is_ia32_frame(ksig);
+
+ failed = cet_setup_signal(ia32, rstor);
+ }
if (!failed) {
/*
* Clear the direction flag as per the ABI for function entry.
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 01/10] x86/cet: User-mode shadow stack support
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143807.3611-1-yu-cheng.yu@intel.com>
This patch adds basic shadow stack enabling/disabling routines.
A task's shadow stack is allocated from memory with VM_SHSTK
flag set and read-only protection. The shadow stack is
allocated with a fixed size; the default size can be changed by
the system admin.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cet.h | 32 ++++++++
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/msr-index.h | 14 ++++
arch/x86/include/asm/processor.h | 5 ++
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/cet.c | 123 +++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/common.c | 24 ++++++
arch/x86/kernel/process.c | 2 +
fs/proc/task_mmu.c | 3 +
9 files changed, 212 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/include/asm/cet.h
create mode 100644 arch/x86/kernel/cet.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index 000000000000..9d5bc1efc9b7
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_stat {
+ unsigned long shstk_base;
+ unsigned long shstk_size;
+ unsigned int shstk_enabled:1;
+};
+
+#ifdef CONFIG_X86_INTEL_CET
+unsigned long cet_get_shstk_ptr(void);
+int cet_setup_shstk(void);
+void cet_disable_shstk(void);
+void cet_disable_free_shstk(struct task_struct *p);
+#else
+static inline unsigned long cet_get_shstk_ptr(void) { return 0; }
+static inline int cet_setup_shstk(void) { return 0; }
+static inline void cet_disable_shstk(void) {}
+static inline void cet_disable_free_shstk(struct task_struct *p) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33833d1909af..3624a11e5ba6 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -56,6 +56,12 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+#define DISABLE_SHSTK 0
+#else
+#define DISABLE_SHSTK (1<<(X86_FEATURE_SHSTK & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -75,7 +81,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP|DISABLE_SHSTK)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fda2114197b3..428d13828ba9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -770,4 +770,18 @@
#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x6a0
+#define MSR_IA32_S_CET 0x6a2
+#define MSR_IA32_PL0_SSP 0x6a4
+#define MSR_IA32_PL3_SSP 0x6a7
+#define MSR_IA32_INT_SSP_TAB 0x6a8
+
+/* MSR_IA32_U_CET and MSR_IA32_S_CET bits */
+#define MSR_IA32_CET_SHSTK_EN 0x0000000000000001
+#define MSR_IA32_CET_WRSS_EN 0x0000000000000002
+#define MSR_IA32_CET_ENDBR_EN 0x0000000000000004
+#define MSR_IA32_CET_LEG_IW_EN 0x0000000000000008
+#define MSR_IA32_CET_NO_TRACK_EN 0x0000000000000010
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 21a114914ba4..e632dd7adaac 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -24,6 +24,7 @@ struct vm86;
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
+#include <asm/cet.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -507,6 +508,10 @@ struct thread_struct {
unsigned int sig_on_uaccess_err:1;
unsigned int uaccess_err:1; /* uaccess failed */
+#ifdef CONFIG_X86_INTEL_CET
+ struct cet_stat cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 02d6f5cf4e70..7ea5e099d558 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,6 +138,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..8abbfd44322a
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * cet.c - Control Flow Enforcement (CET)
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <asm/msr.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/cet.h>
+
+#define SHSTK_SIZE (0x8000 * (test_thread_flag(TIF_IA32) ? 4 : 8))
+
+static inline int cet_set_shstk_ptr(unsigned long addr)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -1;
+
+ if ((addr >= TASK_SIZE) || (!IS_ALIGNED(addr, 4)))
+ return -1;
+
+ rdmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_U_CET, r | MSR_IA32_CET_SHSTK_EN);
+ wrmsrl(MSR_IA32_PL3_SSP, addr);
+ return 0;
+}
+
+unsigned long cet_get_shstk_ptr(void)
+{
+ unsigned long ptr;
+
+ if (!current->thread.cet.shstk_enabled)
+ return 0;
+
+ rdmsrl(MSR_IA32_PL3_SSP, ptr);
+ return ptr;
+}
+
+static unsigned long shstk_mmap(unsigned long addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long populate;
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap(NULL, addr, len, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_SHSTK,
+ 0, &populate, NULL);
+ up_write(&mm->mmap_sem);
+
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
+}
+
+int cet_setup_shstk(void)
+{
+ unsigned long addr, size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return -EOPNOTSUPP;
+
+ size = SHSTK_SIZE;
+ addr = shstk_mmap(0, size);
+
+ if (addr >= TASK_SIZE)
+ return -ENOMEM;
+
+ cet_set_shstk_ptr(addr + size - sizeof(void *));
+ current->thread.cet.shstk_base = addr;
+ current->thread.cet.shstk_size = size;
+ current->thread.cet.shstk_enabled = 1;
+ return 0;
+}
+
+void cet_disable_shstk(void)
+{
+ u64 r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return;
+
+ rdmsrl(MSR_IA32_U_CET, r);
+ r &= ~(MSR_IA32_CET_SHSTK_EN);
+ wrmsrl(MSR_IA32_U_CET, r);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ current->thread.cet.shstk_enabled = 0;
+}
+
+void cet_disable_free_shstk(struct task_struct *tsk)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+ !tsk->thread.cet.shstk_enabled)
+ return;
+
+ if (tsk == current)
+ cet_disable_shstk();
+
+ /*
+ * Free only when tsk is current or shares mm
+ * with current but has its own shstk.
+ */
+ if (tsk->mm && (tsk->mm == current->mm) &&
+ (tsk->thread.cet.shstk_base)) {
+ vm_munmap(tsk->thread.cet.shstk_base,
+ tsk->thread.cet.shstk_size);
+ tsk->thread.cet.shstk_base = 0;
+ tsk->thread.cet.shstk_size = 0;
+ }
+
+ tsk->thread.cet.shstk_enabled = 0;
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 38276f58d3bf..f54fabdaef60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -401,6 +401,29 @@ static __init int setup_disable_pku(char *arg)
__setup("nopku", setup_disable_pku);
#endif /* CONFIG_X86_64 */
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK))
+ cr4_set_bits(X86_CR4_CET);
+}
+
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+static __init int setup_disable_shstk(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_SHSTK))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_SHSTK);
+ pr_info("x86: 'noshstk' specified, disabling Shadow Stack\n");
+ return 1;
+}
+__setup("noshstk", setup_disable_shstk);
+#endif
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -1313,6 +1336,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
x86_init_rdrand(c);
x86_init_cache_qos(c);
setup_pku(c);
+ setup_cet(c);
/*
* Clear/Set all flags overridden by options, need do it
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 30ca2d1a9231..b3b0b482983a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,6 +39,7 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
+#include <asm/cet.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -136,6 +137,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ cet_disable_shstk();
fpu__clear(&tsk->thread.fpu);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c486ad4b43f0..6aca93ecec0e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -679,6 +679,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
[ilog2(VM_PKEY_BIT3)] = "",
+#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+ [ilog2(VM_SHSTK)] = "ss"
#endif
};
size_t i;
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 00/10] Control Flow Enforcement - Part (3)
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
This series introduces CET - Shadow stack
At the high level, shadow stack is:
Allocated from a task's address space with vm_flags VM_SHSTK;
Its PTEs must be read-only and dirty;
Fixed-size, but the default size can be changed by the sys admin.
For a forked child, the shadow stack is duplicated when the next
shadow stack access takes place.
For a pthread child, a new shadow stack is allocated.
The signal handler uses the same shadow stack as the main program.
Yu-cheng Yu (10):
x86/cet: User-mode shadow stack support
x86/cet: Introduce WRUSS instruction
x86/cet: Signal handling for shadow stack
x86/cet: Handle thread shadow stack
x86/cet: ELF header parsing of Control Flow Enforcement
x86/cet: Add arch_prctl functions for shadow stack
mm: Prevent mprotect from changing shadow stack
mm: Prevent mremap of shadow stack
mm: Prevent madvise from changing shadow stack
mm: Prevent munmap and remap_file_pages of shadow stack
arch/x86/Kconfig | 4 +
arch/x86/ia32/ia32_signal.c | 5 +
arch/x86/include/asm/cet.h | 48 ++++++
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/elf.h | 5 +
arch/x86/include/asm/mmu_context.h | 3 +
arch/x86/include/asm/msr-index.h | 14 ++
arch/x86/include/asm/processor.h | 5 +
arch/x86/include/asm/special_insns.h | 44 +++++
arch/x86/include/uapi/asm/elf_property.h | 16 ++
arch/x86/include/uapi/asm/prctl.h | 15 ++
arch/x86/include/uapi/asm/sigcontext.h | 4 +
arch/x86/kernel/Makefile | 4 +
arch/x86/kernel/cet.c | 224 ++++++++++++++++++++++++
arch/x86/kernel/cet_prctl.c | 203 ++++++++++++++++++++++
arch/x86/kernel/cpu/common.c | 24 +++
arch/x86/kernel/elf.c | 236 ++++++++++++++++++++++++++
arch/x86/kernel/process.c | 10 ++
arch/x86/kernel/process_64.c | 7 +
arch/x86/kernel/signal.c | 11 ++
arch/x86/lib/x86-opcode-map.txt | 2 +-
arch/x86/mm/fault.c | 13 +-
fs/binfmt_elf.c | 16 ++
fs/proc/task_mmu.c | 3 +
include/uapi/linux/elf.h | 1 +
mm/madvise.c | 9 +
mm/mmap.c | 13 ++
mm/mprotect.c | 9 +
mm/mremap.c | 5 +-
tools/objtool/arch/x86/lib/x86-opcode-map.txt | 2 +-
30 files changed, 958 insertions(+), 5 deletions(-)
create mode 100644 arch/x86/include/asm/cet.h
create mode 100644 arch/x86/include/uapi/asm/elf_property.h
create mode 100644 arch/x86/kernel/cet.c
create mode 100644 arch/x86/kernel/cet_prctl.c
create mode 100644 arch/x86/kernel/elf.c
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH 9/9] x86/cet: Handle THP/HugeTLB shadow stack page copying
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
This patch implements THP shadow stack memory copying in the same
way as the previous patch for regular PTE.
In copy_huge_pmd(), we clear the dirty bit from the PMD. On the
next shadow stack access to the PMD, a page fault occurs. At
that time, the page is copied/re-used and the PMD is fixed.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/huge_memory.c | 10 +++++++++-
mm/hugetlb.c | 2 +-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3a1815f8e11..c6e72ccc4274 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -600,6 +600,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pmd_mkdirty_shstk(entry);
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
@@ -976,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
- pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmdp_set_wrprotect_flush(vma, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1196,6 +1198,8 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pte_mkdirty_shstk(entry);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@@ -1280,6 +1284,8 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pmd_mkdirty_shstk(entry);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
@@ -1350,6 +1356,8 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pmd_mkdirty_shstk(entry);
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 218679138255..d694cfab9f90 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3293,7 +3293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*
* See Documentation/vm/mmu_notifier.txt
*/
- huge_ptep_set_wrprotect(src, addr, src_pte);
+ huge_ptep_set_wrprotect_flush(vma, addr, src_pte);
}
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 7/9] x86/mm: Shadow stack page fault error checking
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
If a page fault is triggered by a shadow stack access (e.g.
call/ret) or shadow stack management instructions (e.g.
wrussq), then bit[6] of the page fault error code is set.
In access_error(), we check if a shadow stack page fault
is within a shadow stack memory area.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/traps.h | 2 ++
arch/x86/mm/fault.c | 11 +++++++++++
2 files changed, 13 insertions(+)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 5196050ff3d5..58ea2f5722e9 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -157,6 +157,7 @@ enum {
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
@@ -165,5 +166,6 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
};
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 73bd8c95ac71..2b3b9170109c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1166,6 +1166,17 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
(error_code & X86_PF_INSTR), foreign))
return 1;
+ /*
+ * Verify X86_PF_SHSTK is within a shadow stack VMA.
+ * It is always an error if there is a shadow stack
+ * fault outside a shadow stack VMA.
+ */
+ if (error_code & X86_PF_SHSTK) {
+ if (!(vma->vm_flags & VM_SHSTK))
+ return 1;
+ return 0;
+ }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 8/9] x86/cet: Handle shadow stack page fault
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
When a task does fork(), its shadow stack must be duplicated for
the child. However, the child may not actually use all pages
of the copied shadow stack. This patch implements a flow that
is similar to copy-on-write of an anonymous page, but for shadow
stack memory. A shadow stack PTE needs to be RO and dirty. We
use this dirty bit requirement to effect the copying of shadow
stack pages.
In copy_one_pte(), we clear the dirty bit from the shadow stack
PTE. On the next shadow stack access to the PTE, a page fault
occurs. At that time, we then copy/re-use the page and fix the
PTE.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
mm/memory.c | 32 +++++++++++++++++++++++++++++---
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464e0fd2..275c7fb3fc96 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1022,7 +1022,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
- ptep_set_wrprotect(src_mm, addr, src_pte);
+ ptep_set_wrprotect_flush(vma, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -2444,7 +2444,13 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pte_mkdirty_shstk(entry);
+ else
+ entry = pte_mkdirty(entry);
+
+ entry = maybe_mkwrite(entry, vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2517,7 +2523,11 @@ static int wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pte_mkdirty_shstk(entry);
+ else
+ entry = pte_mkdirty(entry);
+ entry = maybe_mkwrite(entry, vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
@@ -3192,6 +3202,14 @@ static int do_anonymous_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
+ /*
+ * If this is within a shadow stack mapping, mark
+ * the PTE dirty. We don't use pte_mkdirty(),
+ * because the PTE must have _PAGE_DIRTY_HW set.
+ */
+ if (is_shstk_mapping(vma->vm_flags))
+ entry = pte_mkdirty_shstk(entry);
+
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
@@ -3974,6 +3992,14 @@ static int handle_pte_fault(struct vm_fault *vmf)
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
+
+ /*
+ * Shadow stack PTEs are copy-on-access, so always hand them to
+ * do_wp_page(), whether or not this is a write fault.
+ */
+ if (is_shstk_mapping(vmf->vma->vm_flags))
+ return do_wp_page(vmf);
+
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 5/9] x86/mm: Introduce _PAGE_DIRTY_SW
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
The PTE DIRTY bit indicates a few conditions:
(1) When the processor writes to a memory page, the page's
PTE is (R/W + _PAGE_DIRTY_HW);
(2) When a modified page is shared from fork(), its PTE is
(R/O + _PAGE_DIRTY_HW);
(3) When access_remote_vm() has tried to write to a read-
only page with (FOLL_FORCE | FOLL_WRITE), the PTE is
(R/O + _PAGE_DIRTY_HW);
(4) A shadow stack memory page is required to be set as
(R/O + _PAGE_DIRTY_HW);
In case (1) above, the DIRTY bit is set by the processor;
for other cases, it is set by the software. However, the
processor reads the DIRTY bit only in case (4), to ensure
a valid shadow stack page.
To make (R/O + _PAGE_DIRTY_HW) exclusively for shadow stack,
we introduce _PAGE_BIT_DIRTY_SW, a spare bit of the 64-bit
PTE, to replace _PAGE_BIT_DIRTY for case (2), (3) and (4).
This results in the following possible PTE settings:
Modified PTE: (R/W + _PAGE_DIRTY_HW)
Modified and shared PTE: (R/O + _PAGE_DIRTY_SW)
R/O PTE was (FOLL_FORCE | FOLL_WRITE): (R/O + _PAGE_DIRTY_SW)
Shadow stack PTE: (R/O + _PAGE_DIRTY_HW)
Shared shadow stack PTE: (R/O + _PAGE_DIRTY_SW)
Note that _PAGE_BIT_DIRTY_SW is used only in R/O PTEs, never
in R/W PTEs.
When this patch is applied, there are six free bits left in
the 64-bit PTE. There are no free bits left in the 32-bit
PTE (except for PAE), and shadow stack is not implemented
for the 32-bit kernel.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/pgtable.h | 91 ++++++++++++++++++++++++++++++++----
arch/x86/include/asm/pgtable_types.h | 14 +++++-
include/asm-generic/pgtable.h | 12 +++++
3 files changed, 105 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 00b5e79c09a6..0996f8a6979a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -116,9 +116,9 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
}
@@ -140,9 +140,9 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
}
static inline int pmd_young(pmd_t pmd)
@@ -150,9 +150,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -281,9 +281,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ if (pte_flags(pte) & from)
+ pte = pte_set_flags(pte_clear_flags(pte, from), to);
+ return pte;
+}
+#else
+static inline pte_t pte_move_flags(pte_t pte, pteval_t from, pteval_t to)
+{
+ return pte;
+}
+#endif
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -293,6 +307,7 @@ static inline pte_t pte_mkold(pte_t pte)
static inline pte_t pte_wrprotect(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pte_clear_flags(pte, _PAGE_RW);
}
@@ -302,9 +317,18 @@ static inline pte_t pte_mkexec(pte_t pte)
}
static inline pte_t pte_mkdirty(pte_t pte)
+{
+ /*
+ * With user shadow stack configured, (R/O + _PAGE_DIRTY_HW) is
+ * reserved for shadow stack PTEs, so a non-writable PTE records
+ * dirtiness in _PAGE_DIRTY_SW. Writable PTEs, and all PTEs when the
+ * feature is not configured, use _PAGE_DIRTY_HW.
+ */
+ pteval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ pte_write(pte)) ? _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pte_t pte_mkdirty_shstk(pte_t pte)
{
return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+#endif
static inline pte_t pte_mkyoung(pte_t pte)
{
@@ -313,6 +337,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
static inline pte_t pte_mkwrite(pte_t pte)
{
+ pte = pte_move_flags(pte, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pte_set_flags(pte, _PAGE_RW);
}
@@ -360,6 +385,20 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ if (pmd_flags(pmd) & from)
+ pmd = pmd_set_flags(pmd_clear_flags(pmd, from), to);
+ return pmd;
+}
+#else
+static inline pmd_t pmd_move_flags(pmd_t pmd, pmdval_t from, pmdval_t to)
+{
+ return pmd;
+}
+#endif
+
static inline pmd_t pmd_mkold(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -367,18 +406,29 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pmd_clear_flags(pmd, _PAGE_RW);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ /*
+ * With user shadow stack configured, a PMD without _PAGE_RW records
+ * dirtiness in _PAGE_DIRTY_SW so that (R/O + _PAGE_DIRTY_HW) stays
+ * exclusive to shadow stack mappings. Otherwise use _PAGE_DIRTY_HW.
+ */
+ pmdval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pmd_flags(pmd) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+ return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_HAS_SHSTK
+static inline pmd_t pmd_mkdirty_shstk(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
+#endif
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
@@ -397,6 +447,7 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
+ pmd = pmd_move_flags(pmd, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pmd_set_flags(pmd, _PAGE_RW);
}
@@ -419,6 +470,20 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ if (pud_flags(pud) & from)
+ pud = pud_set_flags(pud_clear_flags(pud, from), to);
+ return pud;
+}
+#else
+static inline pud_t pud_move_flags(pud_t pud, pudval_t from, pudval_t to)
+{
+ return pud;
+}
+#endif
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -426,17 +491,22 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_HW, _PAGE_DIRTY_SW);
return pud_clear_flags(pud, _PAGE_RW);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
+ /*
+ * With user shadow stack configured, a PUD without _PAGE_RW records
+ * dirtiness in _PAGE_DIRTY_SW so that (R/O + _PAGE_DIRTY_HW) stays
+ * exclusive to shadow stack mappings. Otherwise use _PAGE_DIRTY_HW.
+ */
+ pudval_t dirty = (!IS_ENABLED(CONFIG_X86_INTEL_SHADOW_STACK_USER) ||
+ (pud_flags(pud) & _PAGE_RW)) ?
+ _PAGE_DIRTY_HW:_PAGE_DIRTY_SW;
+
+ return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -456,6 +526,7 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
+ pud = pud_move_flags(pud, _PAGE_DIRTY_SW, _PAGE_DIRTY_HW);
return pud_set_flags(pud, _PAGE_RW);
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2ac5d46d7c49..0907adb56197 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,6 +23,7 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW5 57 /* available for programmer */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
@@ -34,6 +35,7 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_DIRTY_SW _PAGE_BIT_SOFTW5 /* was written to */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -109,6 +111,14 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
+#if defined(CONFIG_X86_INTEL_SHADOW_STACK_USER)
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_SW)
+#else
+#define _PAGE_DIRTY_SW (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY_HW | _PAGE_DIRTY_SW)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
@@ -122,9 +132,9 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
- _PAGE_SOFT_DIRTY)
+ _PAGE_DIRTY_SW | _PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..3f6f998509f0 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1097,4 +1097,16 @@ static inline void init_espfix_bsp(void) { }
#endif
#endif
+#ifndef CONFIG_ARCH_HAS_SHSTK
+static inline pte_t pte_mkdirty_shstk(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mkdirty_shstk(pmd_t pmd)
+{
+ return pmd;
+}
+#endif
+
#endif /* _ASM_GENERIC_PGTABLE_H */
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 4/9] x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
From: Yu-cheng Yu @ 2018-06-07 14:37 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
We are going to create _PAGE_DIRTY_SW for non-hardware, memory
management purposes. Rename _PAGE_DIRTY to _PAGE_DIRTY_HW and
_PAGE_BIT_DIRTY to _PAGE_BIT_DIRTY_HW to make these PTE dirty
bits more clear. There are no functional changes in this
patch.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/pgtable.h | 6 +++---
arch/x86/include/asm/pgtable_types.h | 17 +++++++++--------
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kvm/vmx.c | 2 +-
4 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f1633de5a675..00b5e79c09a6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -303,7 +303,7 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -377,7 +377,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -436,7 +436,7 @@ static inline pud_t pud_wrprotect(pud_t pud)
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return pud_set_flags(pud, _PAGE_DIRTY_HW | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 1e5a40673953..2ac5d46d7c49 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -15,7 +15,7 @@
#define _PAGE_BIT_PWT 3 /* page write through */
#define _PAGE_BIT_PCD 4 /* page cache disabled */
#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_DIRTY_HW 6 /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -45,7 +45,7 @@
#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_DIRTY_HW (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY_HW)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
@@ -73,7 +73,7 @@
_PAGE_PKEY_BIT3)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY_HW | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
@@ -112,9 +112,9 @@
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* Set of bits not changed in pte_modify. The pte's
@@ -123,7 +123,7 @@
* pte_modify() does modify it.
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_HW | \
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
@@ -168,7 +168,8 @@ enum page_cache_mode {
_PAGE_ACCESSED)
#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY_HW | _PAGE_ACCESSED | \
+ _PAGE_GLOBAL)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
@@ -187,7 +188,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | _PAGE_ENC)
+ _PAGE_DIRTY_HW | _PAGE_ENC)
#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 11eda21eb697..e7665a4767b3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -17,7 +17,7 @@
*/
#define PTR(x) (x << 3)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY_HW)
/*
* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 40aa29204baf..52bd01f23316 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5235,7 +5235,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ _PAGE_ACCESSED | _PAGE_DIRTY_HW | _PAGE_PSE);
r = kvm_write_guest_page(kvm, identity_map_pfn,
&tmp, i * sizeof(tmp), sizeof(tmp));
if (r < 0)
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 0/9] Control Flow Enforcement - Part (2)
From: Yu-cheng Yu @ 2018-06-07 14:36 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
Summary of changes:
Shadow stack kernel config option;
Control protection exception; and
Shadow stack memory management.
The shadow stack PTE needs to be read-only and dirty. Changes
are made to:
Use the read-only and hardware dirty combination exclusively
for shadow stack;
Use a PTE spare bit to indicate other PTE dirty conditions;
Shadow stack page fault handling.
Yu-cheng Yu (9):
x86/cet: Control protection exception handler
x86/cet: Add Kconfig option for user-mode shadow stack
mm: Introduce VM_SHSTK for shadow stack memory
x86/mm: Change _PAGE_DIRTY to _PAGE_DIRTY_HW
x86/mm: Introduce _PAGE_DIRTY_SW
x86/mm: Introduce ptep_set_wrprotect_flush and related functions
x86/mm: Shadow stack page fault error checking
x86/cet: Handle shadow stack page fault
x86/cet: Handle THP/HugeTLB shadow stack page copying
arch/x86/Kconfig | 24 ++++++
arch/x86/entry/entry_32.S | 5 ++
arch/x86/entry/entry_64.S | 2 +-
arch/x86/include/asm/pgtable.h | 149 ++++++++++++++++++++++++++++++-----
arch/x86/include/asm/pgtable_types.h | 31 +++++---
arch/x86/include/asm/traps.h | 5 ++
arch/x86/kernel/idt.c | 1 +
arch/x86/kernel/relocate_kernel_64.S | 2 +-
arch/x86/kernel/traps.c | 61 ++++++++++++++
arch/x86/kvm/vmx.c | 2 +-
arch/x86/mm/fault.c | 11 +++
include/asm-generic/pgtable.h | 38 +++++++++
include/linux/mm.h | 8 ++
mm/huge_memory.c | 10 ++-
mm/hugetlb.c | 2 +-
mm/internal.h | 8 ++
mm/memory.c | 32 +++++++-
17 files changed, 353 insertions(+), 38 deletions(-)
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH 3/9] mm: Introduce VM_SHSTK for shadow stack memory
From: Yu-cheng Yu @ 2018-06-07 14:36 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
VM_SHSTK indicates a shadow stack memory area.
A shadow stack PTE must be read-only and dirty. For non shadow
stack, we use a spare bit of the 64-bit PTE for dirty. The PTE
changes are in the next patch.
There are no more spare bits in the 32-bit PTE (except for PAE) and
the shadow stack is not implemented for the 32-bit kernel.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
include/linux/mm.h | 8 ++++++++
mm/internal.h | 8 ++++++++
2 files changed, 16 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 02a616e2f17d..bf4388a8cc41 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -221,11 +221,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#if defined(CONFIG_X86)
@@ -257,6 +259,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_MPX VM_NONE
#endif
+#ifdef CONFIG_X86_INTEL_SHADOW_STACK_USER
+# define VM_SHSTK VM_HIGH_ARCH_5
+#else
+# define VM_SHSTK VM_NONE
+#endif
+
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
diff --git a/mm/internal.h b/mm/internal.h
index 502d14189794..44c64711a309 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -280,6 +280,14 @@ static inline bool is_data_mapping(vm_flags_t flags)
return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}
+/*
+ * Shadow stack area
+ */
+static inline bool is_shstk_mapping(vm_flags_t flags)
+{
+ return (flags & VM_SHSTK);
+}
+
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 1/9] x86/cet: Control protection exception handler
From: Yu-cheng Yu @ 2018-06-07 14:36 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
A control protection exception is triggered when a control flow transfer
attempt violates shadow stack or indirect branch tracking constraints.
attempt violated shadow stack or indirect branch tracking constraints.
For example, the return address for a RET instruction differs from the
safe copy on the shadow stack; or a JMP instruction arrives at a non-
ENDBR instruction.
The control protection exception handler works in a similar way as the
general protection fault handler.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/entry/entry_32.S | 5 ++++
arch/x86/entry/entry_64.S | 2 +-
arch/x86/include/asm/traps.h | 3 +++
arch/x86/kernel/idt.c | 1 +
arch/x86/kernel/traps.c | 61 ++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bef8e2b202a8..14b63ef0d7d8 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1070,6 +1070,11 @@ ENTRY(general_protection)
jmp common_exception
END(general_protection)
+ENTRY(control_protection)
+ pushl $do_control_protection
+ jmp common_exception
+END(control_protection)
+
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
ASM_CLAC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3166b9674429..5230f705d229 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -999,7 +999,7 @@ idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
idtentry coprocessor_error do_coprocessor_error has_error_code=0
idtentry alignment_check do_alignment_check has_error_code=1
idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
+idtentry control_protection do_control_protection has_error_code=1
/*
* Reload gs selector with exception handling
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 3de69330e6c5..5196050ff3d5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -26,6 +26,7 @@ asmlinkage void invalid_TSS(void);
asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
+asmlinkage void control_protection(void);
asmlinkage void page_fault(void);
asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
@@ -77,6 +78,7 @@ dotraplinkage void do_stack_segment(struct pt_regs *, long);
dotraplinkage void do_double_fault(struct pt_regs *, long);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
+dotraplinkage void do_control_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
@@ -142,6 +144,7 @@ enum {
X86_TRAP_AC, /* 17, Alignment Check */
X86_TRAP_MC, /* 18, Machine Check */
X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */
+ X86_TRAP_CP = 21, /* 21 Control Protection Fault */
X86_TRAP_IRET = 32, /* 32, IRET Exception */
};
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 2c3a1b4294eb..d00493709cec 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -85,6 +85,7 @@ static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_MF, coprocessor_error),
INTG(X86_TRAP_AC, alignment_check),
INTG(X86_TRAP_XF, simd_coprocessor_error),
+ INTG(X86_TRAP_CP, control_protection),
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 03f3d7695dac..4e8769a19aaf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -577,6 +577,67 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
NOKPROBE_SYMBOL(do_general_protection);
+static const char *control_protection_err[] =
+{
+ "near-ret",
+ "far-ret/iret",
+ "endbranch",
+ "rstorssp",
+ "setssbsy",
+ "unknown",
+};
+
+/*
+ * When a control protection exception occurs, send a signal
+ * to the responsible application. Currently, control
+ * protection is only enabled for the user mode. This
+ * exception should not come from the kernel mode.
+ */
+dotraplinkage void
+do_control_protection(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ cond_local_irq_enable(regs);
+
+ tsk = current;
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ !cpu_feature_enabled(X86_FEATURE_IBT)) {
+ goto exit;
+ }
+
+ if (!user_mode(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+ if (notify_die(DIE_TRAP, "control protection fault", regs,
+ error_code, X86_TRAP_CP, SIGSEGV) != NOTIFY_STOP)
+ die("control protection fault", regs, error_code);
+ return;
+ }
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ unsigned int max_idx, err_idx;
+
+ max_idx = ARRAY_SIZE(control_protection_err) - 1;
+ err_idx = min((unsigned int)error_code - 1, max_idx);
+ pr_info("%s[%d] control protection ip:%lx sp:%lx error:%lx(%s)",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code,
+ control_protection_err[err_idx]);
+ print_vma_addr(" in ", regs->ip);
+ pr_cont("\n");
+ }
+
+exit:
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_control_protection);
+
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 2/9] x86/cet: Add Kconfig option for user-mode shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:36 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143705.3531-1-yu-cheng.yu@intel.com>
Introduce Kconfig option X86_INTEL_SHADOW_STACK_USER.
An application has shadow stack protection when all the following are
true:
(1) The kernel has X86_INTEL_SHADOW_STACK_USER enabled,
(2) The running processor supports the shadow stack,
(3) The application is built with shadow stack enabled tools & libs
and, at runtime, all dependent shared libs can support shadow
stack.
If this kernel config option is enabled, but (2) or (3) above is not
true, the application runs without the shadow stack protection.
Existing legacy applications will continue to work without the shadow
stack protection.
The user-mode shadow stack protection is only implemented for the
64-bit kernel. Thirty-two bit applications are supported under the
compatibility mode.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/Kconfig | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..dd580d4910fc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1925,6 +1925,30 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+config X86_INTEL_CET
+ def_bool n
+
+config ARCH_HAS_SHSTK
+ def_bool n
+
+config X86_INTEL_SHADOW_STACK_USER
+ prompt "Intel Shadow Stack for user-mode"
+ def_bool n
+ depends on CPU_SUP_INTEL && X86_64
+ select X86_INTEL_CET
+ select ARCH_HAS_SHSTK
+ ---help---
+ Shadow stack provides hardware protection against program stack
+ corruption. Only when all the following are true will an application
+ have the shadow stack protection: the kernel supports it (i.e. this
+ feature is enabled), the application is compiled and linked with
+ shadow stack enabled, and the processor supports this feature.
+ When the kernel has this configuration enabled, existing non shadow
+ stack applications will continue to work, but without shadow stack
+ protection.
+
+ If unsure, say y.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [PATCH 1/9] x86/cet: Control protection exception handler
From: Andy Lutomirski @ 2018-06-07 15:46 UTC (permalink / raw)
To: Yu-cheng Yu
Cc: LKML, linux-doc, Linux-MM, linux-arch, X86 ML, H. Peter Anvin,
Thomas Gleixner, Ingo Molnar, H. J. Lu, Shanbhogue, Vedvyas,
Ravi V. Shankar, Dave Hansen, Jonathan Corbet, Oleg Nesterov,
Arnd Bergmann, mike.kravetz
In-Reply-To: <20180607143705.3531-2-yu-cheng.yu@intel.com>
On Thu, Jun 7, 2018 at 7:40 AM Yu-cheng Yu <yu-cheng.yu@intel.com> wrote:
>
> A control protection exception is triggered when a control flow transfer
> attempt violated shadow stack or indirect branch tracking constraints.
> For example, the return address for a RET instruction differs from the
> safe copy on the shadow stack; or a JMP instruction arrives at a non-
> ENDBR instruction.
>
> The control protection exception handler works in a similar way as the
> general protection fault handler.
>
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> ---
> arch/x86/entry/entry_32.S | 5 ++++
> arch/x86/entry/entry_64.S | 2 +-
> arch/x86/include/asm/traps.h | 3 +++
> arch/x86/kernel/idt.c | 1 +
> arch/x86/kernel/traps.c | 61 ++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 71 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index bef8e2b202a8..14b63ef0d7d8 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -1070,6 +1070,11 @@ ENTRY(general_protection)
> jmp common_exception
> END(general_protection)
>
> +ENTRY(control_protection)
> + pushl $do_control_protection
> + jmp common_exception
> +END(control_protection)
Ugh, you're seriously supporting this on 32-bit? Please test double
fault handling very carefully -- the CET interaction with task
switches is so gross that I didn't even bother reading the spec except
to let the architects know that they were a bit nuts to support it at
all.
> +
> #ifdef CONFIG_KVM_GUEST
> ENTRY(async_page_fault)
> ASM_CLAC
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 3166b9674429..5230f705d229 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -999,7 +999,7 @@ idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
> idtentry coprocessor_error do_coprocessor_error has_error_code=0
> idtentry alignment_check do_alignment_check has_error_code=1
> idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
> -
> +idtentry control_protection do_control_protection has_error_code=1
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 03f3d7695dac..4e8769a19aaf 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> +/*
> + * When a control protection exception occurs, send a signal
> + * to the responsible application. Currently, control
> + * protection is only enabled for the user mode. This
> + * exception should not come from the kernel mode.
> + */
> +dotraplinkage void
> +do_control_protection(struct pt_regs *regs, long error_code)
> +{
> + struct task_struct *tsk;
> +
> + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> + cond_local_irq_enable(regs);
> +
> + tsk = current;
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK) &&
> + !cpu_feature_enabled(X86_FEATURE_IBT)) {
static_cpu_has(), please. But your handling here is odd -- I think
that we should at least warn if we get #CP with CET disabled.
> + goto exit;
> + }
> +
> + if (!user_mode(regs)) {
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_CP;
I realize you copied this from elsewhere in the file, but please
either delete these assignments to error_code and trap_nr or at least
hoist them out of the if block.
> + if (notify_die(DIE_TRAP, "control protection fault", regs,
> + error_code, X86_TRAP_CP, SIGSEGV) != NOTIFY_STOP)
Does this notify_die() check serve any purpose at all? Removing all
the old ones would be a project, but let's try not to add new callers.
> + die("control protection fault", regs, error_code);
> + return;
> + }
> +
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_CP;
> +
> + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
> + printk_ratelimit()) {
> + unsigned int max_idx, err_idx;
> +
> + max_idx = ARRAY_SIZE(control_protection_err) - 1;
> + err_idx = min((unsigned int)error_code - 1, max_idx);
What if error_code == 0? Is that also invalid?
> + pr_info("%s[%d] control protection ip:%lx sp:%lx error:%lx(%s)",
> + tsk->comm, task_pid_nr(tsk),
> + regs->ip, regs->sp, error_code,
> + control_protection_err[err_idx]);
> + print_vma_addr(" in ", regs->ip);
> + pr_cont("\n");
> + }
> +
> +exit:
> + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
This is definitely wrong for the feature-disabled, !user_mode case.
Also, are you planning on enabling CET for kernel code too?
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH 2/9] x86/cet: Add Kconfig option for user-mode shadow stack
From: Andy Lutomirski @ 2018-06-07 15:47 UTC (permalink / raw)
To: Yu-cheng Yu
Cc: LKML, linux-doc, Linux-MM, linux-arch, X86 ML, H. Peter Anvin,
Thomas Gleixner, Ingo Molnar, H. J. Lu, Shanbhogue, Vedvyas,
Ravi V. Shankar, Dave Hansen, Jonathan Corbet, Oleg Nesterov,
Arnd Bergmann, mike.kravetz
In-Reply-To: <20180607143705.3531-3-yu-cheng.yu@intel.com>
On Thu, Jun 7, 2018 at 7:40 AM Yu-cheng Yu <yu-cheng.yu@intel.com> wrote:
>
> Introduce Kconfig option X86_INTEL_SHADOW_STACK_USER.
>
> An application has shadow stack protection when all the following are
> true:
>
> (1) The kernel has X86_INTEL_SHADOW_STACK_USER enabled,
> (2) The running processor supports the shadow stack,
> (3) The application is built with shadow stack enabled tools & libs
> and, and at runtime, all dependent shared libs can support shadow
> stack.
>
> If this kernel config option is enabled, but (2) or (3) above is not
> true, the application runs without the shadow stack protection.
> Existing legacy applications will continue to work without the shadow
> stack protection.
>
> The user-mode shadow stack protection is only implemented for the
> 64-bit kernel. Thirty-two bit applications are supported under the
> compatibility mode.
>
The 64-bit only part seems entirely reasonable. So please make the
code 64-bit only :)
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH 5/5] Documentation/x86: Add CET description
From: Yu-cheng Yu @ 2018-06-07 14:35 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143544.3477-1-yu-cheng.yu@intel.com>
Explain how CET works and the noshstk/noibt kernel parameters.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
Documentation/admin-guide/kernel-parameters.txt | 6 +
Documentation/x86/intel_cet.txt | 161 ++++++++++++++++++++++++
2 files changed, 167 insertions(+)
create mode 100644 Documentation/x86/intel_cet.txt
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f2040d46f095..c9a94bec1519 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2649,6 +2649,12 @@
noexec=on: enable non-executable mappings (default)
noexec=off: disable non-executable mappings
+ noibt [X86-64] Disable indirect branch tracking for user-mode
+ applications
+
+ noshstk [X86-64] Disable shadow stack support for user-mode
+ applications
+
nosmap [X86]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
diff --git a/Documentation/x86/intel_cet.txt b/Documentation/x86/intel_cet.txt
new file mode 100644
index 000000000000..1b902a6c49f4
--- /dev/null
+++ b/Documentation/x86/intel_cet.txt
@@ -0,0 +1,161 @@
+-----------------------------------------
+Control Flow Enforcement Technology (CET)
+-----------------------------------------
+
+[1] Overview
+
+Control Flow Enforcement Technology (CET) provides protection against
+return/jump-oriented programming (ROP/JOP) attacks. It can be implemented to
+protect both the kernel and applications. In the first phase, only the
+user-mode protection is implemented for the 64-bit kernel. Thirty-two bit
+applications are supported under the compatibility mode.
+
+CET includes shadow stack (SHSTK) and indirect branch tracking (IBT) and
+they are enabled from two kernel configuration options:
+
+ INTEL_X86_SHADOW_STACK_USER, and
+ INTEL_X86_BRANCH_TRACKING_USER.
+
+There are two command-line options for disabling CET features:
+
+ noshstk - disables shadow stack, and
+ noibt - disables indirect branch tracking.
+
+At run time, /proc/cpuinfo shows the availability of SHSTK and IBT.
+
+[2] Application Enabling
+
+The design of CET user-mode interface provides maximum overall coverage
+and compatibility with existing applications.
+
+To verify the CET capability of an application, use the following command
+and look for SHSTK/IBT in the NT_GNU_PROPERTY_TYPE_0 field:
+
+ readelf -n <application>
+
+CET features are opt-in by each application. To build a CET-capable
+application, the following tools are needed: Binutils v2.30, GCC v8.1,
+and GLIBC v2.29 (or later).
+
+If an application has CET capabilities, is statically linked, and the
+kernel supports CET, it will run with CET enabled. If an application
+needs any shared libraries, the loader checks all dependencies and enables
+CET only when all requirements are met. Once an application starts with
+CET enabled, the protection cannot be turned off until the next exec().
+
+[3] CET system calls
+
+The following arch_prctl() system calls are added for CET:
+
+(3a) arch_prctl(ARCH_CET_STATUS, unsigned long *addr)
+
+ Return CET feature status.
+
+ The parameter 'addr' is a pointer to a user buffer.
+ On returning to the caller, the kernel fills the following
+ information:
+
+ *addr = SHSTK/IBT status
+ *(addr + 1) = SHSTK/IBT default setting on exec()
+ *(addr + 2) = default SHSTK size on exec()
+
+(3b) arch_prctl(ARCH_CET_DISABLE, unsigned long features)
+
+ Disable SHSTK and/or IBT specified in 'features'. Return -EPERM
+ if CET is locked out.
+
+(3c) arch_prctl(ARCH_CET_LOCK)
+
+ Lock out CET features; disable turning off of SHSTK/IBT.
+
+(3d) arch_prctl(ARCH_CET_EXEC, unsigned long *addr)
+
+ Control how CET features should be enabled upon exec() of a new
+ image.
+
+ The parameter 'addr' is a pointer to a user buffer.
+
+ *addr = a bitmap indicating which features are being changed
+ *(addr + 1) = how CET should be enabled upon exec().
+ 0: Check ELF header
+ 1: Always disable
+ 2: Always enable
+ *(addr + 2) = default SHSTK size on exec()
+
+(3e) arch_prctl(ARCH_CET_ALLOC_SHSTK, unsigned long *addr)
+
+ Allocate a new SHSTK.
+
+ The parameter 'addr' is a pointer to a user buffer and indicates
+ the desired SHSTK size to allocate. On returning to the caller
+ the buffer contains the address of the new SHSTK.
+
+(3f) arch_prctl(ARCH_CET_PUSH_SHSTK, unsigned long *addr)
+
+ Push a value onto the SHSTK.
+
+ The parameter 'addr' is a pointer to a user buffer.
+
+ *addr = the SHSTK pointer
+ *(addr + 1) = the value to push (a function return address)
+
+Note: ARCH_CET_ALLOC_SHSTK and ARCH_CET_PUSH_SHSTK are intended for
+ the implementation of GLIBC getcontext(), setcontext(),
+ makecontext(), and swapcontext().
+
+(3g) arch_prctl(ARCH_CET_LEGACY_BITMAP, unsigned long *addr)
+
+ If the current task does not have a legacy bitmap, set one up.
+ Return bitmap information as follows:
+
+ *addr = bitmap base address
+ *(addr + 1) = bitmap size
+
+[4] The implementation of the SHSTK
+
+A task's SHSTK is allocated from memory to a fixed size that can
+support 32K (32768) nested function calls; that is 256 KB for a 64-bit
+application and 128 KB for a 32-bit application. The system admin
+can change the size with the CET command line utility.
+
+The main program and its signal handlers use the same shadow stack.
+
+The SHSTK's vma has VM_SHSTK flag set; its PTEs are required to be
+read-only and dirty. When a SHSTK PTE is not present, or is not both
+read-only and dirty, a SHSTK access triggers a page fault with an
+additional SHSTK bit set in the page fault error code.
+
+When a task forks a child, its SHSTK PTEs are copied and both the
+parent's and the child's SHSTK PTEs are cleared of the dirty bit.
+Upon the next SHSTK access, the resulting SHSTK page fault is handled
+by page copy/re-use.
+
+When a pthread child is created, a separate SHSTK is created for the
+child.
+
+[5] The management of read-only & dirty PTEs for SHSTK
+
+A RO and dirty PTE exists in the following cases:
+
+(5a) A page is modified and then shared with a fork()'ed child;
+(5b) access_remote_vm with (FOLL_WRITE | FOLL_FORCE) on a RO page;
+(5c) A SHSTK page.
+
+The processor does not read the dirty bit for (5a) and (5b), but
+checks the dirty bit for (5c). To prevent accidental use of non-
+SHSTK memory as SHSTK, we introduce the use of a spare bit of the
+64-bit PTE as _PAGE_BIT_DIRTY_SW and exchange it with the dirty
+bit for (5a) and (5b). This results in the following possible
+PTE settings:
+
+Modified PTE: (R/W + DIRTY_HW)
+Modified and shared PTE: (R/O + DIRTY_SW)
+R/O PTE written via (FOLL_FORCE | FOLL_WRITE): (R/O + DIRTY_SW)
+SHSTK stack PTE: (R/O + DIRTY_HW)
+Shared SHSTK PTE: (R/O + DIRTY_SW)
+
+[6] The implementation of IBT
+
+The kernel provides IBT support in mmap() of the legacy code bitmap.
+However, the management of the bitmap is done in the GLIBC or the
+application.
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 4/5] x86/fpu/xstate: Add XSAVES system states for shadow stack
From: Yu-cheng Yu @ 2018-06-07 14:35 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143544.3477-1-yu-cheng.yu@intel.com>
Intel Control-flow Enforcement Technology (CET) introduces the
following MSRs into the XSAVES system states.
IA32_U_CET (user-mode CET settings),
IA32_PL3_SSP (user-mode shadow stack),
IA32_PL0_SSP (kernel-mode shadow stack),
IA32_PL1_SSP (ring-1 shadow stack),
IA32_PL2_SSP (ring-2 shadow stack).
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/fpu/types.h | 22 ++++++++++++++++++++++
arch/x86/include/asm/fpu/xstate.h | 4 +++-
arch/x86/include/uapi/asm/processor-flags.h | 2 ++
arch/x86/kernel/fpu/xstate.c | 10 ++++++++++
4 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 202c53918ecf..e55d51d172f1 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -114,6 +114,9 @@ enum xfeature {
XFEATURE_Hi16_ZMM,
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
+ XFEATURE_RESERVED,
+ XFEATURE_SHSTK_USER,
+ XFEATURE_SHSTK_KERNEL,
XFEATURE_MAX,
};
@@ -128,6 +131,8 @@ enum xfeature {
#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_SHSTK_USER (1 << XFEATURE_SHSTK_USER)
+#define XFEATURE_MASK_SHSTK_KERNEL (1 << XFEATURE_SHSTK_KERNEL)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
@@ -229,6 +234,23 @@ struct pkru_state {
u32 pad;
} __packed;
+/*
+ * State component 11 is Control flow Enforcement user states
+ */
+struct cet_user_state {
+ u64 u_cet; /* user control flow settings */
+ u64 user_ssp; /* user shadow stack pointer */
+} __packed;
+
+/*
+ * State component 12 is Control flow Enforcement kernel states
+ */
+struct cet_kernel_state {
+ u64 kernel_ssp; /* kernel shadow stack */
+ u64 pl1_ssp; /* ring-1 shadow stack */
+ u64 pl2_ssp; /* ring-2 shadow stack */
+} __packed;
+
struct xstate_header {
u64 xfeatures;
u64 xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index a32dc5f8c963..662562cbafe9 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -31,7 +31,9 @@
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_SHSTK_USER | \
+ XFEATURE_MASK_SHSTK_KERNEL)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..25311ec4b731 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT 23 /* enable Control flow Enforcement */
+#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index dd2c561c4544..91c0f665567b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -35,6 +35,9 @@ static const char *xfeature_names[] =
"Processor Trace (unused)" ,
"Protection Keys User registers",
"unknown xstate feature" ,
+ "Control flow User registers" ,
+ "Control flow Kernel registers" ,
+ "unknown xstate feature" ,
};
static short xsave_cpuid_features[] __initdata = {
@@ -48,6 +51,9 @@ static short xsave_cpuid_features[] __initdata = {
X86_FEATURE_AVX512F,
X86_FEATURE_INTEL_PT,
X86_FEATURE_PKU,
+ 0, /* Unused */
+ X86_FEATURE_SHSTK, /* XFEATURE_SHSTK_USER */
+ X86_FEATURE_SHSTK, /* XFEATURE_SHSTK_KERNEL */
};
/*
@@ -316,6 +322,8 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
+ print_xstate_feature(XFEATURE_MASK_SHSTK_USER);
+ print_xstate_feature(XFEATURE_MASK_SHSTK_KERNEL);
}
/*
@@ -562,6 +570,8 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
+ XCHECK_SZ(sz, nr, XFEATURE_SHSTK_USER, struct cet_user_state);
+ XCHECK_SZ(sz, nr, XFEATURE_SHSTK_KERNEL, struct cet_kernel_state);
/*
* Make *SURE* to add any feature numbers in below if
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 1/5] x86/cpufeatures: Add CPUIDs for Control-flow Enforcement Technology (CET)
From: Yu-cheng Yu @ 2018-06-07 14:35 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143544.3477-1-yu-cheng.yu@intel.com>
Add CPUIDs for Control-flow Enforcement Technology (CET).
CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect branch tracking
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/cpufeatures.h | 2 ++
arch/x86/kernel/cpu/scattered.c | 1 +
2 files changed, 3 insertions(+)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index fb00a2fca990..244c2aa07f0c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,6 +219,7 @@
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_IBT ( 7*32+29) /* Indirect Branch Tracking */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -317,6 +318,7 @@
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow Stack */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 772c219b6889..63cbb4d9938e 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -21,6 +21,7 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_IBT, CPUID_EDX, 20, 0x00000007, 0},
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 2/5] x86/fpu/xstate: Change some names to separate XSAVES system and user states
From: Yu-cheng Yu @ 2018-06-07 14:35 UTC (permalink / raw)
To: linux-kernel, linux-doc, linux-mm, linux-arch, x86,
H. Peter Anvin, Thomas Gleixner, Ingo Molnar, H.J. Lu,
Vedvyas Shanbhogue, Ravi V. Shankar, Dave Hansen, Andy Lutomirski,
Jonathan Corbet, Oleg Nesterov, Arnd Bergmann, Mike Kravetz
Cc: Yu-cheng Yu
In-Reply-To: <20180607143544.3477-1-yu-cheng.yu@intel.com>
To support XSAVES system states, change some names to distinguish
user and system states.
Change:
supervisor to system
copy_init_fpstate_to_fpregs() to copy_init_fpstate_user_settings_to_fpregs()
xfeatures_mask to xfeatures_mask_user
XCNTXT_MASK to SUPPORTED_XFEATURES_MASK (states supported)
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
arch/x86/include/asm/fpu/internal.h | 5 ++-
arch/x86/include/asm/fpu/xstate.h | 24 +++++-----
arch/x86/kernel/fpu/core.c | 4 +-
arch/x86/kernel/fpu/init.c | 2 +-
arch/x86/kernel/fpu/signal.c | 6 +--
arch/x86/kernel/fpu/xstate.c | 88 +++++++++++++++++++------------------
6 files changed, 66 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a38bf5a1e37a..f1f9bf91a0ab 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -93,7 +93,8 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
+ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
+ xfeatures_mask_user;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
@@ -233,7 +234,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
/*
* If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
+ * format and system states in addition to modified optimization in
* XSAVEOPT.
*
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 48581988d78c..9b382e5157ed 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -19,19 +19,19 @@
#define XSAVE_YMM_SIZE 256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-/* Supervisor features */
-#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
+/* System features */
+#define XFEATURE_MASK_SYSTEM (XFEATURE_MASK_PT)
/* All currently supported features */
-#define XCNTXT_MASK (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
- XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
- XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU | \
- XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+#define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
@@ -39,7 +39,7 @@
#define REX_PREFIX
#endif
-extern u64 xfeatures_mask;
+extern u64 xfeatures_mask_user;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f92a6593de1e..d654b2f9a6c4 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -362,7 +362,7 @@ void fpu__drop(struct fpu *fpu)
* Clear FPU registers by setting them up from
* the init fpstate:
*/
-static inline void copy_init_fpstate_to_fpregs(void)
+static inline void copy_init_fpstate_user_settings_to_fpregs(void)
{
if (use_xsave())
copy_kernel_to_xregs(&init_fpstate.xsave, -1);
@@ -394,7 +394,7 @@ void fpu__clear(struct fpu *fpu)
preempt_disable();
fpu__initialize(fpu);
user_fpu_begin();
- copy_init_fpstate_to_fpregs();
+ copy_init_fpstate_user_settings_to_fpregs();
preempt_enable();
}
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6abd83572b01..761c3a5a9e07 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -229,7 +229,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- return XCNTXT_MASK;
+ return SUPPORTED_XFEATURES_MASK;
}
/* Legacy code to initialize eager fpu mode. */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 23f1691670b6..f77aa76ba675 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -249,11 +249,11 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
{
if (use_xsave()) {
if ((unsigned long)buf % 64 || fx_only) {
- u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+ u64 init_bv = xfeatures_mask_user & ~XFEATURE_MASK_FPSSE;
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_fxregs(buf);
} else {
- u64 init_bv = xfeatures_mask & ~xbv;
+ u64 init_bv = xfeatures_mask_user & ~xbv;
if (unlikely(init_bv))
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_xregs(buf, xbv);
@@ -417,7 +417,7 @@ void fpu__init_prepare_fx_sw_frame(void)
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask;
+ fx_sw_reserved.xfeatures = xfeatures_mask_user;
fx_sw_reserved.xstate_size = fpu_user_xstate_size;
if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 87a57b7642d3..19f8df54c72a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -53,11 +53,11 @@ static short xsave_cpuid_features[] __initdata = {
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
-u64 xfeatures_mask __read_mostly;
+u64 xfeatures_mask_user __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -82,7 +82,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+ u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -113,14 +113,14 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
-static int xfeature_is_supervisor(int xfeature_nr)
+static int xfeature_is_system(int xfeature_nr)
{
/*
- * We currently do not support supervisor states, but if
+ * We currently do not support system states, but if
* we did, we could find out like this.
*
* SDM says: If state component 'i' is a user state component,
- * ECX[0] return 0; if state component i is a supervisor
+ * ECX[0] return 0; if state component i is a system
* state component, ECX[0] returns 1.
*/
u32 eax, ebx, ecx, edx;
@@ -131,7 +131,7 @@ static int xfeature_is_supervisor(int xfeature_nr)
static int xfeature_is_user(int xfeature_nr)
{
- return !xfeature_is_supervisor(xfeature_nr);
+ return !xfeature_is_system(xfeature_nr);
}
/*
@@ -164,7 +164,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* None of the feature bits are in init state. So nothing else
* to do for us, as the memory layout is up to date.
*/
- if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+ if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
return;
/*
@@ -191,7 +191,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* in a special way already:
*/
feature_bit = 0x2;
- xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+ xfeatures = (xfeatures_mask_user & ~xfeatures) >> 2;
/*
* Update all the remaining memory layouts according to their
@@ -219,20 +219,20 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
return;
/*
- * Make it clear that XSAVES supervisor states are not yet
+ * Make it clear that XSAVES system states are not yet
* implemented should anyone expect it to work by changing
* bits in XFEATURE_MASK_* macros and XCR0.
*/
- WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
- "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
+ WARN_ONCE((xfeatures_mask_user & XFEATURE_MASK_SYSTEM),
+ "x86/fpu: XSAVES system states are not yet implemented.\n");
- xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+ xfeatures_mask_user &= ~XFEATURE_MASK_SYSTEM;
cr4_set_bits(X86_CR4_OSXSAVE);
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
}
/*
@@ -242,7 +242,7 @@ void fpu__init_cpu_xstate(void)
*/
static int xfeature_enabled(enum xfeature xfeature)
{
- return !!(xfeatures_mask & (1UL << xfeature));
+ return !!(xfeatures_mask_user & BIT_ULL(xfeature));
}
/*
@@ -272,7 +272,7 @@ static void __init setup_xstate_features(void)
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
/*
- * If an xfeature is supervisor state, the offset
+ * If an xfeature is system state, the offset
* in EBX is invalid. We leave it to -1.
*/
if (xfeature_is_user(i))
@@ -348,7 +348,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+ unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
int i;
/*
@@ -421,7 +421,8 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+ init_fpstate.xsave.header.xcomp_bv =
+ BIT_ULL(63) | xfeatures_mask_user;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -440,11 +441,11 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
u32 eax, ebx, ecx, edx;
/*
- * Only XSAVES supports supervisor states and it uses compacted
- * format. Checking a supervisor state's uncompacted offset is
+ * Only XSAVES supports system states and it uses compacted
+ * format. Checking a system state's uncompacted offset is
* an error.
*/
- if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
+ if (XFEATURE_MASK_SYSTEM & (1 << xfeature_nr)) {
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
return -1;
}
@@ -465,7 +466,7 @@ static int xfeature_size(int xfeature_nr)
/*
* 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
+ * 1. saving of system state
* 2. using the compacted format
*
* Use this function when dealing with the compacted format so
@@ -480,8 +481,8 @@ int using_compacted_format(void)
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
int validate_xstate_header(const struct xstate_header *hdr)
{
- /* No unknown or supervisor features may be set */
- if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+ /* No unknown or system features may be set */
+ if (hdr->xfeatures & (~xfeatures_mask_user | XFEATURE_MASK_SYSTEM))
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -588,11 +589,11 @@ static void do_extra_xstate_size_checks(void)
check_xstate_against_struct(i);
/*
- * Supervisor state components can be managed only by
+ * System state components can be managed only by
* XSAVES, which is compacted-format only.
*/
if (!using_compacted_format())
- XSTATE_WARN_ON(xfeature_is_supervisor(i));
+ XSTATE_WARN_ON(xfeature_is_system(i));
/* Align from the end of the previous feature */
if (xfeature_is_aligned(i))
@@ -616,7 +617,7 @@ static void do_extra_xstate_size_checks(void)
/*
- * Get total size of enabled xstates in XCR0/xfeatures_mask.
+ * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
@@ -706,7 +707,7 @@ static int init_xstate_size(void)
*/
static void fpu__init_disable_system_xstate(void)
{
- xfeatures_mask = 0;
+ xfeatures_mask_user = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
fpu__xstate_clear_all_cpu_caps();
}
@@ -742,15 +743,15 @@ void __init fpu__init_system_xstate(void)
}
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask = eax + ((u64)edx << 32);
+ xfeatures_mask_user = eax + ((u64)edx << 32);
- if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
- pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask_user);
goto out_disable;
}
@@ -759,10 +760,10 @@ void __init fpu__init_system_xstate(void)
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask &= ~BIT(i);
+ xfeatures_mask_user &= ~BIT_ULL(i);
}
- xfeatures_mask &= fpu__get_supported_xfeatures_mask();
+ xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -772,9 +773,10 @@ void __init fpu__init_system_xstate(void)
/*
* Update info used for ptrace frames; use standard-format size and no
- * supervisor xstates:
+ * system xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
+ update_regset_xstate_info(fpu_user_xstate_size,
+ xfeatures_mask_user & ~XFEATURE_MASK_SYSTEM);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
@@ -782,7 +784,7 @@ void __init fpu__init_system_xstate(void)
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask,
+ xfeatures_mask_user,
fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
@@ -801,7 +803,7 @@ void fpu__resume_cpu(void)
* Restore XCR0 on xsave capable CPUs:
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
}
/*
@@ -853,7 +855,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
* have not enabled. Remember that pcntxt_mask is
* what we write to the XCR0 register.
*/
- WARN_ONCE(!(xfeatures_mask & xstate_feature),
+ WARN_ONCE(!(xfeatures_mask_user & xstate_feature),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -1003,7 +1005,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
/*
* Copy xregs_state->header:
@@ -1087,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= ~XFEATURE_MASK_SYSTEM;
/*
* Copy xregs_state->header:
@@ -1180,7 +1182,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
/*
* Add back in the features that came in from userspace:
@@ -1236,7 +1238,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= XFEATURE_MASK_SYSTEM;
/*
* Add back in the features that came in from userspace:
--
2.15.1
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [PATCH 2/5] x86/fpu/xstate: Change some names to separate XSAVES system and user states
From: Yu-cheng Yu @ 2018-06-07 15:47 UTC (permalink / raw)
To: Andy Lutomirski
Cc: LKML, linux-doc, Linux-MM, linux-arch, X86 ML, H. Peter Anvin,
Thomas Gleixner, Ingo Molnar, H. J. Lu, Shanbhogue, Vedvyas,
Ravi V. Shankar, Dave Hansen, Jonathan Corbet, Oleg Nesterov,
Arnd Bergmann, mike.kravetz
In-Reply-To: <CALCETrXAx4vBUxf3VaePNm3HHLZkdTFAR9TV0T+A-jb2QL7Uag@mail.gmail.com>
On Thu, 2018-06-07 at 08:38 -0700, Andy Lutomirski wrote:
> On Thu, Jun 7, 2018 at 7:40 AM Yu-cheng Yu <yu-cheng.yu@intel.com> wrote:
> >
> > To support XSAVES system states, change some names to distinguish
> > user and system states.
> >
> > Change:
> > supervisor to system
> > copy_init_fpstate_to_fpregs() to copy_init_fpstate_user_settings_to_fpregs()
> > xfeatures_mask to xfeatures_mask_user
> > XCNTXT_MASK to SUPPORTED_XFEATURES_MASK (states supported)
>
> How about copy_init_user_fpstate_to_fpregs()? It's shorter and more
> to the point.
>
> --Andy
I will change that.
Yu-cheng
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.