linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 00/37] x86: Memory Protection Keys
@ 2015-11-17  3:35 Dave Hansen
  2015-11-17  3:35 ` [PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits() Dave Hansen
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: x86, Dave Hansen, linux-api, linux-arch, aarcange, akpm, jack,
	kirill.shutemov, n-horiguchi

Memory Protection Keys for User pages is a CPU feature which will
first appear on Skylake Servers, but will also be supported on
future non-server parts.  It provides a mechanism for enforcing
page-based protections, but without requiring modification of the
page tables when an application changes protection domains.  See
the Documentation/ patch for more details.

Changes from RFCv3:

 * Added 'current' and 'foreign' variants of get_user_pages() to
   help indicate whether protection keys should be enforced.
   Thanks to Jerome Glisse for pointing out this issue.
 * Added "allocation" and set/get system calls so that we can do
   management of proection keys in the kernel.  This opens the
   door to use of specific protection keys for kernel use in the
   future, such as for execute-only memory.
 * Removed the kselftest code for the moment.  It will be
   submitted separately.

Thanks Ingo and Thomas for most of these):
Changes from RFCv2 (Thanks Ingo and Thomas for most of these):

 * few minor compile warnings
 * changed 'nopku' interaction with cpuid bits.  Now, we do not
   clear the PKU cpuid bit, we just skip enabling it.
 * changed __pkru_allows_write() to also check access disable bit
 * removed the unused write_pkru()
 * made si_pkey a u64 and added some patch description details.
   Also made it share space in siginfo with MPX and clarified
   comments.
 * give some real text for the Processor Trace xsave state
 * made vma_pkey() less ugly (and much more optimized actually)
 * added SEGV_PKUERR to copy_siginfo_to_user()
 * remove page table walk when filling in si_pkey, added some
   big fat comments about it being inherently racy.
 * added self test code

This code is not runnable to anyone outside of Intel unless they
have some special hardware or a fancy simulator.  If you are
interested in running this for real, please get in touch with me.
Hardware is available to a very small but nonzero number of
people.

This set is also available here (with the new syscall):

	git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git pkeys-v010

=== diffstat ===

 Documentation/kernel-parameters.txt         |   3 +
 Documentation/x86/protection-keys.txt       |  53 +++++
 arch/mips/mm/gup.c                          |   3 +-
 arch/powerpc/include/asm/mman.h             |   5 +-
 arch/powerpc/include/asm/mmu_context.h      |  12 +
 arch/s390/include/asm/mmu_context.h         |  12 +
 arch/s390/mm/gup.c                          |   3 +-
 arch/sh/mm/gup.c                            |   2 +-
 arch/sparc/mm/gup.c                         |   2 +-
 arch/unicore32/include/asm/mmu_context.h    |  12 +
 arch/x86/Kconfig                            |  16 ++
 arch/x86/entry/syscalls/syscall_32.tbl      |   5 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   5 +
 arch/x86/include/asm/cpufeature.h           |  54 +++--
 arch/x86/include/asm/disabled-features.h    |  12 +
 arch/x86/include/asm/fpu/types.h            |  12 +
 arch/x86/include/asm/fpu/xstate.h           |   4 +-
 arch/x86/include/asm/mmu.h                  |   7 +
 arch/x86/include/asm/mmu_context.h          | 110 ++++++++-
 arch/x86/include/asm/pgtable.h              |  38 +++
 arch/x86/include/asm/pgtable_types.h        |  34 ++-
 arch/x86/include/asm/pkeys.h                |  68 ++++++
 arch/x86/include/asm/required-features.h    |   4 +
 arch/x86/include/asm/special_insns.h        |  20 ++
 arch/x86/include/uapi/asm/mman.h            |  22 ++
 arch/x86/include/uapi/asm/processor-flags.h |   2 +
 arch/x86/kernel/cpu/common.c                |  42 ++++
 arch/x86/kernel/fpu/xstate.c                | 250 +++++++++++++++++++-
 arch/x86/kernel/ldt.c                       |   4 +-
 arch/x86/kernel/process_64.c                |   2 +
 arch/x86/kernel/setup.c                     |   9 +
 arch/x86/mm/fault.c                         | 153 ++++++++++--
 arch/x86/mm/gup.c                           |  52 ++--
 arch/x86/mm/mpx.c                           |   4 +-
 drivers/char/agp/frontend.c                 |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |   4 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c     |   2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c         |   4 +-
 drivers/gpu/drm/via/via_dmablit.c           |   3 +-
 drivers/infiniband/core/umem.c              |   2 +-
 drivers/infiniband/core/umem_odp.c          |   8 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |   3 +-
 drivers/infiniband/hw/qib/qib_user_pages.c  |   3 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c    |   2 +-
 drivers/iommu/amd_iommu_v2.c                |   8 +-
 drivers/media/pci/ivtv/ivtv-udma.c          |   4 +-
 drivers/media/pci/ivtv/ivtv-yuv.c           |  10 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c   |   3 +-
 drivers/misc/sgi-gru/grufault.c             |   3 +-
 drivers/scsi/st.c                           |   2 -
 drivers/staging/android/ashmem.c            |   4 +-
 drivers/video/fbdev/pvr2fb.c                |   4 +-
 drivers/virt/fsl_hypervisor.c               |   5 +-
 fs/exec.c                                   |   8 +-
 fs/proc/task_mmu.c                          |   5 +
 include/asm-generic/mm_hooks.h              |  13 +
 include/linux/mm.h                          |  52 +++-
 include/linux/mman.h                        |   6 +-
 include/linux/pkeys.h                       |  59 +++++
 include/uapi/asm-generic/mman-common.h      |   5 +
 include/uapi/asm-generic/siginfo.h          |  17 +-
 kernel/events/uprobes.c                     |   8 +-
 kernel/signal.c                             |   4 +
 mm/Kconfig                                  |  13 +
 mm/frame_vector.c                           |   9 +-
 mm/gup.c                                    | 153 ++++++------
 mm/ksm.c                                    |  10 +-
 mm/memory.c                                 |   8 +-
 mm/mempolicy.c                              |   6 +-
 mm/mmap.c                                   |   2 +-
 mm/mprotect.c                               | 134 ++++++++++-
 mm/nommu.c                                  |  35 ++-
 mm/process_vm_access.c                      |   6 +-
 mm/util.c                                   |   4 +-
 net/ceph/pagevec.c                          |   2 +-
 security/tomoyo/domain.c                    |   9 +-
 virt/kvm/async_pf.c                         |   2 +-
 virt/kvm/kvm_main.c                         |  13 +-
 78 files changed, 1410 insertions(+), 285 deletions(-)


Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: aarcange@redhat.com
Cc: akpm@linux-foundation.org
Cc: jack@suse.cz
Cc: kirill.shutemov@linux.intel.com
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: n-horiguchi@ah.jp.nec.com
Cc: x86@kernel.org

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()
  2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
@ 2015-11-17  3:35 ` Dave Hansen
  2015-11-17  3:35 ` [PATCH 29/37] mm: implement new mprotect_key() system call Dave Hansen
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Dave Hansen, dave.hansen, linux-api, linux-arch


From: Dave Hansen <dave.hansen@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().  We
could have done this in calc_vm_prot_bits(), but I did not feel
super strongly which way to go.  It was pretty arbitrary which
one to use.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |    5 +++--
 b/drivers/char/agp/frontend.c      |    2 +-
 b/drivers/staging/android/ashmem.c |    4 ++--
 b/include/linux/mman.h             |    6 +++---
 b/mm/mmap.c                        |    2 +-
 b/mm/mprotect.c                    |    2 +-
 b/mm/nommu.c                       |    2 +-
 7 files changed, 12 insertions(+), 11 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.421490430 -0800
+++ b/arch/powerpc/include/asm/mman.h	2015-11-16 12:36:26.434491019 -0800
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+		unsigned long pkey)
 {
 	return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.423490520 -0800
+++ b/drivers/char/agp/frontend.c	2015-11-16 12:36:26.435491065 -0800
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
 	unsigned long prot_bits;
 
-	prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+	prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
 	return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.424490566 -0800
+++ b/drivers/staging/android/ashmem.c	2015-11-16 12:36:26.435491065 -0800
@@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file
 	}
 
 	/* requested protection bits must match our allowed protection mask */
-	if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-		     calc_vm_prot_bits(PROT_MASK))) {
+	if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+		     calc_vm_prot_bits(PROT_MASK, 0))) {
 		ret = -EPERM;
 		goto out;
 	}
diff -puN include/linux/mman.h~pkeys-84-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.426490656 -0800
+++ b/include/linux/mman.h	2015-11-16 12:36:26.436491110 -0800
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
 	return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
 	       _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
 	       _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-	       arch_calc_vm_prot_bits(prot);
+	       arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-84-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.428490747 -0800
+++ b/mm/mmap.c	2015-11-16 12:36:26.437491156 -0800
@@ -1309,7 +1309,7 @@ unsigned long do_mmap(struct file *file,
 	 * to. we assume access permissions have been handled by the open
 	 * of the memory object, so we don't do any here.
 	 */
-	vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+	vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
 	if (flags & MAP_LOCKED)
diff -puN mm/mprotect.c~pkeys-84-calc_vm_prot_bits mm/mprotect.c
--- a/mm/mprotect.c~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.429490793 -0800
+++ b/mm/mprotect.c	2015-11-16 19:22:07.781660707 -0800
@@ -373,7 +373,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
 		prot |= PROT_EXEC;
 
-	vm_flags = calc_vm_prot_bits(prot);
+	vm_flags = calc_vm_prot_bits(prot, 0);
 
 	down_write(&current->mm->mmap_sem);
 
diff -puN mm/nommu.c~pkeys-84-calc_vm_prot_bits mm/nommu.c
--- a/mm/nommu.c~pkeys-84-calc_vm_prot_bits	2015-11-16 12:36:26.431490883 -0800
+++ b/mm/nommu.c	2015-11-16 12:36:26.438491201 -0800
@@ -1081,7 +1081,7 @@ static unsigned long determine_vm_flags(
 {
 	unsigned long vm_flags;
 
-	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
 	/* vm_flags |= mm->def_flags; */
 
 	if (!(capabilities & NOMMU_MAP_DIRECT)) {
_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 29/37] mm: implement new mprotect_key() system call
  2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
  2015-11-17  3:35 ` [PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits() Dave Hansen
@ 2015-11-17  3:35 ` Dave Hansen
  2015-11-17  3:35 ` [PATCH 31/37] x86: wire up " Dave Hansen
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Dave Hansen, dave.hansen, linux-api


From: Dave Hansen <dave.hansen@linux.intel.com>

mprotect_key() is just like mprotect, except it also takes a
protection key as an argument.  On systems that do not support
protection keys, it still works, but requires that key=0.
Otherwise it does exactly what mprotect does.

I expect it to get used like this, if you want to guarantee that
any mapping you create can *never* be accessed without the right
protection keys set up.

	pkey_deny_access(11); // random pkey
	int real_prot = PROT_READ|PROT_WRITE;
	ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	ret = mprotect_key(ptr, PAGE_SIZE, real_prot, 11);

This way, there is *no* window where the mapping is accessible
since it was always either PROT_NONE or had a protection key set.

We settled on 'unsigned long' for the type of the key here.  We
only need 4 bits on x86 today, but I figured that other
architectures might need some more space.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/include/asm/mmu_context.h |   10 +++++++--
 b/include/linux/pkeys.h              |    7 +++++-
 b/mm/Kconfig                         |    7 ++++++
 b/mm/mprotect.c                      |   36 +++++++++++++++++++++++++++++------
 4 files changed, 51 insertions(+), 9 deletions(-)

diff -puN arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey	2015-11-16 12:36:28.811598913 -0800
+++ b/arch/x86/include/asm/mmu_context.h	2015-11-16 19:22:01.920396860 -0800
@@ -4,6 +4,7 @@
 #include <asm/desc.h>
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
+#include <linux/pkeys.h>
 
 #include <trace/events/tlb.h>
 
@@ -243,10 +244,14 @@ static inline void arch_unmap(struct mm_
 		mpx_notify_unmap(mm, vma, start, end);
 }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * If the config option is off, we get the generic version from
+ * include/linux/pkeys.h.
+ */
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
 	u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
 				      VM_PKEY_BIT2 | VM_PKEY_BIT3;
 	/*
@@ -259,9 +264,10 @@ static inline int vma_pkey(struct vm_are
 	 */
 	pkey = (vma->vm_flags >> vm_pkey_shift) &
 	       (vma_pkey_mask >> vm_pkey_shift);
-#endif
+
 	return pkey;
 }
+#endif
 
 static inline bool __pkru_allows_pkey(u16 pkey, bool write)
 {
diff -puN include/linux/pkeys.h~pkeys-85-mprotect_pkey include/linux/pkeys.h
--- a/include/linux/pkeys.h~pkeys-85-mprotect_pkey	2015-11-16 12:36:28.812598958 -0800
+++ b/include/linux/pkeys.h	2015-11-16 19:22:05.259547173 -0800
@@ -2,10 +2,10 @@
 #define _LINUX_PKEYS_H
 
 #include <linux/mm_types.h>
-#include <asm/mmu_context.h>
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
 #include <asm/pkeys.h>
+#include <asm/mmu_context.h>
 #else /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
@@ -17,6 +17,11 @@ static inline bool arch_validate_pkey(in
 {
 	return true;
 }
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	return 0;
+}
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */
diff -puN mm/Kconfig~pkeys-85-mprotect_pkey mm/Kconfig
--- a/mm/Kconfig~pkeys-85-mprotect_pkey	2015-11-16 12:36:28.814599049 -0800
+++ b/mm/Kconfig	2015-11-16 19:22:03.862484284 -0800
@@ -673,3 +673,10 @@ config ARCH_USES_HIGH_VMA_FLAGS
 	bool
 config ARCH_HAS_PKEYS
 	bool
+
+config NR_PROTECTION_KEYS
+	int
+	# Everything supports a _single_ key, so allow folks to
+	# at least call APIs that take keys, but require that the
+	# key be 0.
+	default 1
diff -puN mm/mprotect.c~pkeys-85-mprotect_pkey mm/mprotect.c
--- a/mm/mprotect.c~pkeys-85-mprotect_pkey	2015-11-16 12:36:28.816599140 -0800
+++ b/mm/mprotect.c	2015-11-16 19:22:05.259547173 -0800
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
 #include <linux/ksm.h>
+#include <linux/pkeys.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -344,10 +345,13 @@ fail:
 	return error;
 }
 
-SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-		unsigned long, prot)
+/*
+ * pkey=-1 when doing a legacy mprotect()
+ */
+static int do_mprotect_pkey(unsigned long start, size_t len,
+		unsigned long prot, int pkey)
 {
-	unsigned long vm_flags, nstart, end, tmp, reqprot;
+	unsigned long nstart, end, tmp, reqprot;
 	struct vm_area_struct *vma, *prev;
 	int error = -EINVAL;
 	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@@ -373,8 +377,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
 		prot |= PROT_EXEC;
 
-	vm_flags = calc_vm_prot_bits(prot, 0);
-
 	down_write(&current->mm->mmap_sem);
 
 	vma = find_vma(current->mm, start);
@@ -407,7 +409,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
 
 		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
 
-		newflags = vm_flags;
+		/*
+		 * If this is a vanilla, non-pkey mprotect, inherit the
+		 * pkey from the VMA we are working on.
+		 */
+		if (pkey == -1)
+			newflags = calc_vm_prot_bits(prot, vma_pkey(vma));
+		else
+			newflags = calc_vm_prot_bits(prot, pkey);
 		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
@@ -443,3 +452,18 @@ out:
 	up_write(&current->mm->mmap_sem);
 	return error;
 }
+
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+		unsigned long, prot)
+{
+	return do_mprotect_pkey(start, len, prot, -1);
+}
+
+SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
+		unsigned long, prot, int, pkey)
+{
+	if (!arch_validate_pkey(pkey))
+		return -EINVAL;
+
+	return do_mprotect_pkey(start, len, prot, pkey);
+}
_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 31/37] x86: wire up mprotect_key() system call
  2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
  2015-11-17  3:35 ` [PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits() Dave Hansen
  2015-11-17  3:35 ` [PATCH 29/37] mm: implement new mprotect_key() system call Dave Hansen
@ 2015-11-17  3:35 ` Dave Hansen
  2015-11-17  3:35 ` [PATCH 34/37] x86, pkeys: allocation/free syscalls Dave Hansen
  2015-11-17  3:36 ` [PATCH 35/37] x86, pkeys: add pkey set/get syscalls Dave Hansen
  4 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Dave Hansen, dave.hansen, linux-api


From: Dave Hansen <dave.hansen@linux.intel.com>

This is all that we need to get the new system call itself
working on x86.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 b/arch/x86/include/uapi/asm/mman.h       |    6 ++++++
 b/mm/Kconfig                             |    1 +
 4 files changed, 9 insertions(+)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key	2015-11-16 12:36:30.650682386 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl	2015-11-16 19:21:56.328145123 -0800
@@ -383,3 +383,4 @@
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
 376	i386	mlock2			sys_mlock2
+377	i386	pkey_mprotect		sys_pkey_mprotect
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key	2015-11-16 12:36:30.652682477 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl	2015-11-16 19:21:56.328145123 -0800
@@ -332,6 +332,7 @@
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
+326	common	pkey_mprotect		sys_pkey_mprotect
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key arch/x86/include/uapi/asm/mman.h
--- a/arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key	2015-11-16 12:36:30.653682522 -0800
+++ b/arch/x86/include/uapi/asm/mman.h	2015-11-16 12:36:30.659682794 -0800
@@ -20,6 +20,12 @@
 		((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) |	\
 		((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) |	\
 		((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (		\
+		((key) & 0x1 ? VM_PKEY_BIT0 : 0) |      \
+		((key) & 0x2 ? VM_PKEY_BIT1 : 0) |      \
+		((key) & 0x4 ? VM_PKEY_BIT2 : 0) |      \
+		((key) & 0x8 ? VM_PKEY_BIT3 : 0))
 #endif
 
 #include <asm-generic/mman.h>
diff -puN mm/Kconfig~pkeys-16-x86-mprotect_key mm/Kconfig
--- a/mm/Kconfig~pkeys-16-x86-mprotect_key	2015-11-16 12:36:30.655682613 -0800
+++ b/mm/Kconfig	2015-11-16 12:36:30.659682794 -0800
@@ -679,4 +679,5 @@ config NR_PROTECTION_KEYS
 	# Everything supports a _single_ key, so allow folks to
 	# at least call APIs that take keys, but require that the
 	# key be 0.
+	default 16 if X86_INTEL_MEMORY_PROTECTION_KEYS
 	default 1
_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 34/37] x86, pkeys: allocation/free syscalls
  2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
                   ` (2 preceding siblings ...)
  2015-11-17  3:35 ` [PATCH 31/37] x86: wire up " Dave Hansen
@ 2015-11-17  3:35 ` Dave Hansen
  2015-11-17  3:36 ` [PATCH 35/37] x86, pkeys: add pkey set/get syscalls Dave Hansen
  4 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Dave Hansen, dave.hansen, linux-api


From: Dave Hansen <dave.hansen@linux.intel.com>

This patch adds two new system calls:

	int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
	int pkey_free(int pkey);

These establish which protection keys are valid for use by
userspace.  A key which was not obtained by pkey_alloc() may not
be passed to pkey_mprotect().

In addition, the 'init_access_rights' argument to pkey_alloc() specifies
the rights that will be established for the returned pkey.  For instance

	pkey = pkey_alloc(flags, PKEY_DENY_WRITE);

will return with the bits set in PKRU such that writing to 'pkey' is
already denied.  This keeps userspace from needing to have knowledge
about manipulating PKRU.  It is still free to do so if it wishes, but
it is no longer required.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it does not control.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |    2 
 b/arch/x86/entry/syscalls/syscall_64.tbl |    2 
 b/arch/x86/include/asm/mmu.h             |    7 ++
 b/arch/x86/include/asm/mmu_context.h     |    8 +++
 b/arch/x86/include/asm/pgtable.h         |    5 +-
 b/arch/x86/include/asm/pkeys.h           |   55 ++++++++++++++++++++++
 b/arch/x86/kernel/fpu/xstate.c           |   75 +++++++++++++++++++++++++++++++
 b/include/asm-generic/mm_hooks.h         |    1 
 b/include/linux/pkeys.h                  |   23 +++++++++
 b/include/uapi/asm-generic/mman-common.h |    5 ++
 b/mm/mprotect.c                          |   55 ++++++++++++++++++++++
 11 files changed, 236 insertions(+), 2 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls	2015-11-16 12:36:32.972787782 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl	2015-11-16 19:21:52.519973697 -0800
@@ -384,3 +384,5 @@
 375	i386	membarrier		sys_membarrier
 376	i386	mlock2			sys_mlock2
 377	i386	pkey_mprotect		sys_pkey_mprotect
+378	i386	pkey_alloc		sys_pkey_alloc
+379	i386	pkey_free		sys_pkey_free
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls	2015-11-16 12:36:32.973787828 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl	2015-11-16 19:21:52.520973742 -0800
@@ -333,6 +333,8 @@
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
 326	common	pkey_mprotect		sys_pkey_mprotect
+327	common	pkey_alloc		sys_pkey_alloc
+328	common	pkey_free		sys_pkey_free
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls	2015-11-16 12:36:32.975787919 -0800
+++ b/arch/x86/include/asm/mmu_context.h	2015-11-16 19:21:52.520973742 -0800
@@ -108,7 +108,12 @@ static inline void enter_lazy_tlb(struct
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	/* pkey 0 is the default and always allocated */
+	mm->context.pkey_allocation_map = 0x1;
+#endif
 	init_new_context_ldt(tsk, mm);
+
 	return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
@@ -333,4 +338,7 @@ static inline bool arch_pte_access_permi
 	return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
 }
 
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/mmu.h~pkey-allocation-syscalls arch/x86/include/asm/mmu.h
--- a/arch/x86/include/asm/mmu.h~pkey-allocation-syscalls	2015-11-16 12:36:32.976787964 -0800
+++ b/arch/x86/include/asm/mmu.h	2015-11-16 12:36:32.992788690 -0800
@@ -22,6 +22,13 @@ typedef struct {
 	void __user *vdso;
 
 	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+	/*
+	 * One bit per protection key says whether userspace can
+	 * use it or not.  protected by mmap_sem.
+	 */
+	u16 pkey_allocation_map;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff -puN arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls arch/x86/include/asm/pgtable.h
--- a/arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls	2015-11-16 12:36:32.978788055 -0800
+++ b/arch/x86/include/asm/pgtable.h	2015-11-16 12:36:32.992788690 -0800
@@ -912,16 +912,17 @@ static inline pte_t pte_swp_clear_soft_d
 
 #define PKRU_AD_BIT 0x1
 #define PKRU_WD_BIT 0x2
+#define PKRU_BITS_PER_PKEY 2
 
 static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
 {
-	int pkru_pkey_bits = pkey * 2;
+	int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
 	return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
 }
 
 static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
 {
-	int pkru_pkey_bits = pkey * 2;
+	int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
 	/*
 	 * Access-disable disables writes too so we need to check
 	 * both bits here.
diff -puN arch/x86/include/asm/pkeys.h~pkey-allocation-syscalls arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~pkey-allocation-syscalls	2015-11-16 12:36:32.979788100 -0800
+++ b/arch/x86/include/asm/pkeys.h	2015-11-16 19:21:52.520973742 -0800
@@ -7,6 +7,61 @@
 
 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
 
+#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do {		\
+	mm_pkey_allocation_map(mm) |= (1 << pkey);	\
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do {			\
+	mm_pkey_allocation_map(mm) &= ~(1 << pkey);	\
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, unsigned long pkey)
+{
+	if (!arch_validate_pkey(pkey))
+		return true;
+
+	return mm_pkey_allocation_map(mm) & (1 << pkey);
+}
+
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+	int all_pkeys_mask = ((1 << arch_max_pkey()) - 1);
+	int ret;
+
+	/*
+	 * Are we out of pkeys?  We must handle this specially
+	 * because ffz() behavior is undefined if there are no
+	 * zeros.
+	 */
+	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+		return -1;
+
+	ret = ffz(mm_pkey_allocation_map(mm));
+
+	mm_set_pkey_allocated(mm, ret);
+
+	return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+	/*
+	 * pkey 0 is special, always allocated and can never
+	 * be freed.
+	 */
+	if (!pkey || !arch_validate_pkey(pkey))
+		return -EINVAL;
+	if (!mm_pkey_is_allocated(mm, pkey))
+		return -EINVAL;
+
+	mm_set_pkey_free(mm, pkey);
+
+	return 0;
+}
+
 #endif /*_ASM_X86_PKEYS_H */
 
 
diff -puN arch/x86/kernel/fpu/xstate.c~pkey-allocation-syscalls arch/x86/kernel/fpu/xstate.c
--- a/arch/x86/kernel/fpu/xstate.c~pkey-allocation-syscalls	2015-11-16 12:36:32.981788191 -0800
+++ b/arch/x86/kernel/fpu/xstate.c	2015-11-16 19:21:52.521973787 -0800
@@ -5,6 +5,8 @@
  */
 #include <linux/compat.h>
 #include <linux/cpu.h>
+#include <linux/mman.h>
+#include <linux/pkeys.h>
 
 #include <asm/fpu/api.h>
 #include <asm/fpu/internal.h>
@@ -775,6 +777,7 @@ const void *get_xsave_field_ptr(int xsav
 	return get_xsave_addr(&fpu->state.xsave, xsave_state);
 }
 
+#ifdef CONFIG_ARCH_HAS_PKEYS
 
 /*
  * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
@@ -864,6 +867,78 @@ static void fpu__xfeature_set_state(int
 	preempt_disable();
 	fpu__restore(fpu);
 	preempt_enable();
+}
+
+#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
+#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+
+/*
+ * This will go out and modify the XSAVE buffer so that PKRU is
+ * set to a particular state for access to 'pkey'.
+ *
+ * PKRU state does affect kernel access to user memory.  We do
+ * not modfiy PKRU *itself* here, only the XSAVE state that will
+ * be restored in to PKRU when we return back to userspace.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val)
+{
+	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+	struct pkru_state *old_pkru_state;
+	struct pkru_state new_pkru_state;
+	int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+	u32 new_pkru_bits = 0;
+
+	if (!arch_validate_pkey(pkey))
+		return -EINVAL;
+	/*
+	 * This check implies XSAVE support.  OSPKE only gets
+	 * set if we enable XSAVE and we enable PKU in XCR0.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_OSPKE))
+		return -EINVAL;
+
+	/* Set the bits we need in PKRU  */
+	if (init_val & PKEY_DISABLE_ACCESS)
+		new_pkru_bits |= PKRU_AD_BIT;
+	if (init_val & PKEY_DISABLE_WRITE)
+		new_pkru_bits |= PKRU_WD_BIT;
+
+	/* Shift the bits in to the correct place in PKRU for pkey. */
+	new_pkru_bits <<= pkey_shift;
+
+	/* Locate old copy of the state in the xsave buffer */
+	old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
+
+	/*
+	 * When state is not in the buffer, it is in the init
+	 * state, set it manually.  Otherwise, copy out the old
+	 * state.
+	 */
+	if (!old_pkru_state)
+		new_pkru_state.pkru = 0;
+	else
+		new_pkru_state.pkru = old_pkru_state->pkru;
+
+	/* mask off any old bits in place */
+	new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+	/* Set the newly-requested bits */
+	new_pkru_state.pkru |= new_pkru_bits;
+
+	/*
+	 * We could theoretically live without zeroing pkru.pad.
+	 * The current XSAVE feature state definition says that
+	 * only bytes 0->3 are used.  But we do not want to
+	 * chance leaking kernel stack out to userspace in case a
+	 * memcpy() of the whole xsave buffer was done.
+	 *
+	 * They're in the same cacheline anyway.
+	 */
+	new_pkru_state.pad = 0;
+
+	fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
+			sizeof(new_pkru_state));
 
 	return 0;
 }
+#endif /* CONFIG_ARCH_HAS_PKEYS */
diff -puN include/asm-generic/mm_hooks.h~pkey-allocation-syscalls include/asm-generic/mm_hooks.h
--- a/include/asm-generic/mm_hooks.h~pkey-allocation-syscalls	2015-11-16 12:36:32.983788282 -0800
+++ b/include/asm-generic/mm_hooks.h	2015-11-16 12:36:32.993788735 -0800
@@ -38,4 +38,5 @@ static inline bool arch_pte_access_permi
 	/* by default, allow everything */
 	return true;
 }
+
 #endif	/* _ASM_GENERIC_MM_HOOKS_H */
diff -puN include/linux/pkeys.h~pkey-allocation-syscalls include/linux/pkeys.h
--- a/include/linux/pkeys.h~pkey-allocation-syscalls	2015-11-16 12:36:32.984788327 -0800
+++ b/include/linux/pkeys.h	2015-11-16 19:21:52.521973787 -0800
@@ -23,6 +23,29 @@ static inline int vma_pkey(struct vm_are
 {
 	return 0;
 }
+
+static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+	return (pkey == 0);
+}
+
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+	return -1;
+}
+
+static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+	WARN_ONCE(1, "free of protection key when disabled");
+	return -EINVAL;
+}
+
+static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+			unsigned long init_val)
+{
+	return 0;
+}
+
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */
diff -puN include/uapi/asm-generic/mman-common.h~pkey-allocation-syscalls include/uapi/asm-generic/mman-common.h
--- a/include/uapi/asm-generic/mman-common.h~pkey-allocation-syscalls	2015-11-16 12:36:32.986788418 -0800
+++ b/include/uapi/asm-generic/mman-common.h	2015-11-16 12:36:32.994788781 -0800
@@ -71,4 +71,9 @@
 #define MAP_HUGE_SHIFT	26
 #define MAP_HUGE_MASK	0x3f
 
+#define PKEY_DISABLE_ACCESS	0x1
+#define PKEY_DISABLE_WRITE	0x2
+#define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
+				 PKEY_DISABLE_WRITE)
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff -puN mm/mprotect.c~pkey-allocation-syscalls mm/mprotect.c
--- a/mm/mprotect.c~pkey-allocation-syscalls	2015-11-16 12:36:32.988788509 -0800
+++ b/mm/mprotect.c	2015-11-16 19:21:52.522973832 -0800
@@ -23,11 +23,13 @@
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
+#include <linux/pkeys.h>
 #include <linux/ksm.h>
 #include <linux/pkeys.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
+#include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
 #include "internal.h"
@@ -379,6 +381,14 @@ static int do_mprotect_pkey(unsigned lon
 
 	down_write(&current->mm->mmap_sem);
 
+	/*
+	 * If userspace did not allocate the pkey, do not let
+	 * them use it here.
+	 */
+	error = -EINVAL;
+	if (!mm_pkey_is_allocated(current->mm, pkey))
+		goto out;
+
 	vma = find_vma(current->mm, start);
 	error = -ENOMEM;
 	if (!vma)
@@ -474,3 +484,48 @@ SYSCALL_DEFINE4(pkey_mprotect, unsigned
 
 	return do_mprotect_pkey(start, len, prot, pkey);
 }
+
+SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
+{
+	int pkey;
+	int ret;
+
+	/* No flags supported yet. */
+	if (flags)
+		return -EINVAL;
+	/* check for unsupported init values */
+	if (init_val & ~PKEY_ACCESS_MASK)
+		return -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+	pkey = mm_pkey_alloc(current->mm);
+
+	ret = -ENOSPC;
+	if (pkey == -1)
+		goto out;
+
+	ret = arch_set_user_pkey_access(current, pkey, init_val);
+	if (ret) {
+		mm_pkey_free(current->mm, pkey);
+		goto out;
+	}
+	ret = pkey;
+out:
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+
+SYSCALL_DEFINE1(pkey_free, int, pkey)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+	ret = mm_pkey_free(current->mm, pkey);
+	up_write(&current->mm->mmap_sem);
+
+	/*
+	 * We could provie warnings or errors if any VMA still
+	 * has the pkey set here.
+	 */
+	return ret;
+}
_

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 35/37] x86, pkeys: add pkey set/get syscalls
  2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
                   ` (3 preceding siblings ...)
  2015-11-17  3:35 ` [PATCH 34/37] x86, pkeys: allocation/free syscalls Dave Hansen
@ 2015-11-17  3:36 ` Dave Hansen
  4 siblings, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2015-11-17  3:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, Dave Hansen, dave.hansen, linux-api


From: Dave Hansen <dave.hansen@linux.intel.com>

This establishes two more system calls for protection key management:

	unsigned long pkey_get(int pkey);
	int pkey_set(int pkey, unsigned long access_rights);

The return value from pkey_get() and the 'access_rights' passed
to pkey_set() are the same format: a bitmask containing
PKEY_DENY_WRITE and/or PKEY_DENY_ACCESS, or nothing set at all.

These replace userspace's direct use of rdpkru/wrpkru.

With current hardware, the kernel can not enforce that it has
control over a given key.  But, this at least allows the kernel
to indicate to userspace that userspace does not control a given
protection key.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it has not "allocated".

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |    2 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |    2 +
 b/arch/x86/include/asm/mmu_context.h     |    2 +
 b/arch/x86/include/asm/pkeys.h           |    2 -
 b/arch/x86/kernel/fpu/xstate.c           |   55 +++++++++++++++++++++++++++++--
 b/include/linux/pkeys.h                  |    8 ++++
 b/mm/mprotect.c                          |   34 +++++++++++++++++++
 7 files changed, 102 insertions(+), 3 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get	2015-11-16 12:36:34.851873071 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl	2015-11-16 12:36:34.863873616 -0800
@@ -386,3 +386,5 @@
 377	i386	pkey_mprotect		sys_pkey_mprotect
 378	i386	pkey_alloc		sys_pkey_alloc
 379	i386	pkey_free		sys_pkey_free
+380	i386	pkey_get		sys_pkey_get
+381	i386	pkey_set		sys_pkey_set
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get	2015-11-16 12:36:34.852873116 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl	2015-11-16 12:36:34.864873661 -0800
@@ -335,6 +335,8 @@
 326	common	pkey_mprotect		sys_pkey_mprotect
 327	common	pkey_alloc		sys_pkey_alloc
 328	common	pkey_free		sys_pkey_free
+329	common	pkey_get		sys_pkey_get
+330	common	pkey_set		sys_pkey_set
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get	2015-11-16 12:36:34.854873207 -0800
+++ b/arch/x86/include/asm/mmu_context.h	2015-11-16 12:36:34.864873661 -0800
@@ -340,5 +340,7 @@ static inline bool arch_pte_access_permi
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
+extern unsigned long arch_get_user_pkey_access(struct task_struct *tsk,
+		int pkey);
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get	2015-11-16 12:36:34.855873253 -0800
+++ b/arch/x86/include/asm/pkeys.h	2015-11-16 19:14:09.231117816 -0800
@@ -16,7 +16,7 @@
 } while (0)
 
 static inline
-bool mm_pkey_is_allocated(struct mm_struct *mm, unsigned long pkey)
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
 	if (!arch_validate_pkey(pkey))
 		return true;
diff -puN arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get arch/x86/kernel/fpu/xstate.c
--- a/arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get	2015-11-16 12:36:34.857873343 -0800
+++ b/arch/x86/kernel/fpu/xstate.c	2015-11-16 12:36:34.865873706 -0800
@@ -687,7 +687,7 @@ void fpu__resume_cpu(void)
  *
  * Note: does not work for compacted buffers.
  */
-void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
 {
 	int feature_nr = fls64(xstate_feature_mask) - 1;
 
@@ -871,6 +871,7 @@ static void fpu__xfeature_set_state(int
 
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+#define PKRU_INIT_STATE	0
 
 /*
  * This will go out and modify the XSAVE buffer so that PKRU is
@@ -889,6 +890,9 @@ int arch_set_user_pkey_access(struct tas
 	int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
 	u32 new_pkru_bits = 0;
 
+	/* Only support manipulating current task for now */
+	if (tsk != current)
+		return -EINVAL;
 	if (!arch_validate_pkey(pkey))
 		return -EINVAL;
 	/*
@@ -916,7 +920,7 @@ int arch_set_user_pkey_access(struct tas
 	 * state.
 	 */
 	if (!old_pkru_state)
-		new_pkru_state.pkru = 0;
+		new_pkru_state.pkru = PKRU_INIT_STATE;
 	else
 		new_pkru_state.pkru = old_pkru_state->pkru;
 
@@ -941,4 +945,51 @@ int arch_set_user_pkey_access(struct tas
 
 	return 0;
 }
+
+/*
+ * Figures out what the rights are currently for 'pkey'.
+ * Converts from PKRU's format to the user-visible PKEY_DISABLE_*
+ * format.
+ */
+unsigned long arch_get_user_pkey_access(struct task_struct *tsk, int pkey)
+{
+	struct fpu *fpu = &current->thread.fpu;
+	u32 pkru_reg;
+	int ret = 0;
+
+	/* Only support manipulating current task for now */
+	if (tsk != current)
+		return -1;
+	if (!boot_cpu_has(X86_FEATURE_OSPKE))
+		return -1;
+	/*
+	 * The contents of PKRU itself are invalid.  Consult the
+	 * task's XSAVE buffer for PKRU contents.  This is much
+	 * more expensive than reading PKRU directly, but should
+	 * be rare or impossible with eagerfpu mode.
+	 */
+	if (!fpu->fpregs_active) {
+		struct xregs_state *xsave = &fpu->state.xsave;
+		struct pkru_state *pkru_state =
+			get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
+		/*
+		 * PKRU is in its init state and not present in
+		 * the buffer in a saved form.
+		 */
+		if (!pkru_state)
+			return PKRU_INIT_STATE;
+
+		return pkru_state->pkru;
+	}
+	/*
+	 * Consult the user register directly.
+	 */
+	pkru_reg = read_pkru();
+	if (!__pkru_allows_read(pkru_reg, pkey))
+		ret |= PKEY_DISABLE_ACCESS;
+	if (!__pkru_allows_write(pkru_reg, pkey))
+		ret |= PKEY_DISABLE_WRITE;
+
+	return ret;
+}
 #endif /* CONFIG_ARCH_HAS_PKEYS */
diff -puN include/linux/pkeys.h~pkey-syscalls-set-get include/linux/pkeys.h
--- a/include/linux/pkeys.h~pkey-syscalls-set-get	2015-11-16 12:36:34.858873389 -0800
+++ b/include/linux/pkeys.h	2015-11-16 12:36:34.865873706 -0800
@@ -43,6 +43,14 @@ static inline int mm_pkey_free(struct mm
 static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 			unsigned long init_val)
 {
+	return -EINVAL;
+}
+
+static inline
+unsigned long arch_get_user_pkey_access(struct task_struct *tsk, int pkey)
+{
+	if (pkey)
+		return -1;
 	return 0;
 }
 
diff -puN mm/mprotect.c~pkey-syscalls-set-get mm/mprotect.c
--- a/mm/mprotect.c~pkey-syscalls-set-get	2015-11-16 12:36:34.860873479 -0800
+++ b/mm/mprotect.c	2015-11-16 12:36:34.865873706 -0800
@@ -529,3 +529,37 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
 	 */
 	return ret;
 }
+
+SYSCALL_DEFINE1(pkey_get, int, pkey)
+{
+	unsigned long ret = 0;
+
+	down_write(&current->mm->mmap_sem);
+	if (!mm_pkey_is_allocated(current->mm, pkey))
+		ret = -EBADF;
+	up_write(&current->mm->mmap_sem);
+
+	if (ret)
+		return ret;
+
+	ret = arch_get_user_pkey_access(current, pkey);
+
+	return ret;
+}
+
+SYSCALL_DEFINE2(pkey_set, int, pkey, unsigned long, access_rights)
+{
+	unsigned long ret = 0;
+
+	down_write(&current->mm->mmap_sem);
+	if (!mm_pkey_is_allocated(current->mm, pkey))
+		ret = -EBADF;
+	up_write(&current->mm->mmap_sem);
+
+	if (ret)
+		return ret;
+
+	ret = arch_set_user_pkey_access(current, pkey, access_rights);
+
+	return ret;
+}
_

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-11-17  3:36 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-11-17  3:35 [PATCH 00/37] x86: Memory Protection Keys Dave Hansen
2015-11-17  3:35 ` [PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits() Dave Hansen
2015-11-17  3:35 ` [PATCH 29/37] mm: implement new mprotect_key() system call Dave Hansen
2015-11-17  3:35 ` [PATCH 31/37] x86: wire up " Dave Hansen
2015-11-17  3:35 ` [PATCH 34/37] x86, pkeys: allocation/free syscalls Dave Hansen
2015-11-17  3:36 ` [PATCH 35/37] x86, pkeys: add pkey set/get syscalls Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).