Linux Security Modules development

Linux Security Modules development
 help / color / mirror / Atom feed

* [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

A Landlock domain is a set of eBPF programs.  There is a list for each
different program types that can be run on a specific Landlock hook
(e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
Landlock program should not try (nor be able) to infer which subject is
currently enforced, but to have a unique security policy for all
subjects tied to the same domain.  This make the reasoning much easier
and help avoid pitfalls.

The next commits tie a domain to a task's credentials thanks to
seccomp(2), but we could use cgroups or a security file-system to
enforce a sysadmin-defined policy .

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* rename files and names to clearly define a domain
* create a standalone patch to ease review
---
 security/landlock/Makefile        |   3 +-
 security/landlock/common.h        |  38 +++++
 security/landlock/domain_manage.c | 265 ++++++++++++++++++++++++++++++
 security/landlock/domain_manage.h |  23 +++
 4 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 security/landlock/domain_manage.c
 create mode 100644 security/landlock/domain_manage.h

diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 682b798c6b76..dd5f70185778 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
 landlock-y := \
-	bpf_verify.o bpf_ptrace.o
+	bpf_verify.o bpf_ptrace.o \
+	domain_manage.o
diff --git a/security/landlock/common.h b/security/landlock/common.h
index 0234c4bc4acd..fb2990eb5fb4 100644
--- a/security/landlock/common.h
+++ b/security/landlock/common.h
@@ -11,11 +11,49 @@
 
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/refcount.h>
 
 enum landlock_hook_type {
 	LANDLOCK_HOOK_PTRACE = 1,
 };
 
+#define _LANDLOCK_HOOK_LAST LANDLOCK_HOOK_PTRACE
+
+struct landlock_prog_list {
+	struct landlock_prog_list *prev;
+	struct bpf_prog *prog;
+	refcount_t usage;
+};
+
+/**
+ * struct landlock_domain - Landlock programs enforced on a set of tasks
+ *
+ * When prepending a new program, if &struct landlock_domain is shared with
+ * other tasks, then duplicate it and prepend the program to this new &struct
+ * landlock_domain.
+ *
+ * @usage: reference count to manage the object lifetime. When a task needs to
+ *	   add Landlock programs and if @usage is greater than 1, then the
+ *	   task must duplicate &struct landlock_domain to not change the
+ *	   children's programs as well.
+ * @programs: array of non-NULL &struct landlock_prog_list pointers
+ */
+struct landlock_domain {
+	struct landlock_prog_list *programs[_LANDLOCK_HOOK_LAST];
+	refcount_t usage;
+};
+
+/**
+ * get_hook_index - get an index for the programs of struct landlock_prog_set
+ *
+ * @type: a Landlock hook type
+ */
+static inline size_t get_hook_index(enum landlock_hook_type type)
+{
+	/* type ID > 0 for loaded programs */
+	return type - 1;
+}
+
 static inline enum landlock_hook_type get_hook_type(const struct bpf_prog *prog)
 {
 	switch (prog->expected_attach_type) {
diff --git a/security/landlock/domain_manage.c b/security/landlock/domain_manage.c
new file mode 100644
index 000000000000..c955b9c95c84
--- /dev/null
+++ b/security/landlock/domain_manage.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - domain management
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "common.h"
+#include "domain_manage.h"
+
+void landlock_get_domain(struct landlock_domain *dom)
+{
+	if (!dom)
+		return;
+	refcount_inc(&dom->usage);
+}
+
+static void put_landlock_prog_list(struct landlock_prog_list *prog_list)
+{
+	struct landlock_prog_list *orig = prog_list;
+
+	/* clean up single-reference branches iteratively */
+	while (orig && refcount_dec_and_test(&orig->usage)) {
+		struct landlock_prog_list *freeme = orig;
+
+		if (orig->prog)
+			bpf_prog_put(orig->prog);
+		orig = orig->prev;
+		kfree(freeme);
+	}
+}
+
+void landlock_put_domain(struct landlock_domain *domain)
+{
+	if (domain && refcount_dec_and_test(&domain->usage)) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(domain->programs); i++)
+			put_landlock_prog_list(domain->programs[i]);
+		kfree(domain);
+	}
+}
+
+static struct landlock_domain *new_landlock_domain(void)
+{
+	struct landlock_domain *domain;
+
+	/* array filled with NULL values */
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&domain->usage, 1);
+	return domain;
+}
+
+/**
+ * store_landlock_prog - prepend and deduplicate a Landlock prog_list
+ *
+ * Prepend @prog to @init_domain while ignoring @prog if they are already in
+ * @ref_domain.  Whatever is the result of this function call, you can call
+ * bpf_prog_put(@prog) after.
+ *
+ * @init_domain: empty domain to prepend to
+ * @ref_domain: domain to check for duplicate programs
+ * @prog: program to prepend
+ *
+ * Return -errno on error or 0 if @prog was successfully stored.
+ */
+static int store_landlock_prog(struct landlock_domain *init_domain,
+		const struct landlock_domain *ref_domain,
+		struct bpf_prog *prog)
+{
+	struct landlock_prog_list *tmp_list = NULL;
+	int err;
+	size_t hook;
+	enum landlock_hook_type last_type;
+	struct bpf_prog *new = prog;
+
+	/* allocate all the memory we need */
+	struct landlock_prog_list *new_list;
+
+	last_type = get_hook_type(new);
+
+	/* ignore duplicate programs */
+	if (ref_domain) {
+		struct landlock_prog_list *ref;
+
+		hook = get_hook_index(get_hook_type(new));
+		for (ref = ref_domain->programs[hook]; ref;
+				ref = ref->prev) {
+			if (ref->prog == new)
+				return -EINVAL;
+		}
+	}
+
+	new = bpf_prog_inc(new);
+	if (IS_ERR(new)) {
+		err = PTR_ERR(new);
+		goto put_tmp_list;
+	}
+	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
+	if (!new_list) {
+		bpf_prog_put(new);
+		err = -ENOMEM;
+		goto put_tmp_list;
+	}
+	/* ignore Landlock types in this tmp_list */
+	new_list->prog = new;
+	new_list->prev = tmp_list;
+	refcount_set(&new_list->usage, 1);
+	tmp_list = new_list;
+
+	if (!tmp_list)
+		/* inform user space that this program was already added */
+		return -EEXIST;
+
+	/* properly store the list (without error cases) */
+	while (tmp_list) {
+		struct landlock_prog_list *new_list;
+
+		new_list = tmp_list;
+		tmp_list = tmp_list->prev;
+		/* do not increment the previous prog list usage */
+		hook = get_hook_index(get_hook_type(new_list->prog));
+		new_list->prev = init_domain->programs[hook];
+		/* no need to add from the last program to the first because
+		 * each of them are a different Landlock type */
+		smp_store_release(&init_domain->programs[hook], new_list);
+	}
+	return 0;
+
+put_tmp_list:
+	put_landlock_prog_list(tmp_list);
+	return err;
+}
+
+/* limit Landlock programs set to 256KB */
+#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
+
+/**
+ * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
+ *
+ * Whatever is the result of this function call, you can call
+ * bpf_prog_put(@prog) after.
+ *
+ * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
+ *                  to prevent a concurrent put/free. This pointer must not be
+ *                  freed after the call.
+ * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
+ *        be owned by landlock_prepend_prog() and freed if an error happened.
+ *
+ * Return @current_domain or a new pointer when OK. Return a pointer error
+ * otherwise.
+ */
+struct landlock_domain *landlock_prepend_prog(
+		struct landlock_domain *current_domain,
+		struct bpf_prog *prog)
+{
+	struct landlock_domain *new_domain = current_domain;
+	unsigned long pages;
+	int err;
+	size_t i;
+	struct landlock_domain tmp_domain = {};
+
+	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
+		return ERR_PTR(-EINVAL);
+
+	/* validate memory size allocation */
+	pages = prog->pages;
+	if (current_domain) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
+			struct landlock_prog_list *walker_p;
+
+			for (walker_p = current_domain->programs[i];
+					walker_p; walker_p = walker_p->prev)
+				pages += walker_p->prog->pages;
+		}
+		/* count a struct landlock_domain if we need to allocate one */
+		if (refcount_read(&current_domain->usage) != 1)
+			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
+				/ PAGE_SIZE;
+	}
+	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
+		return ERR_PTR(-E2BIG);
+
+	/* ensure early that we can allocate enough memory for the new
+	 * prog_lists */
+	err = store_landlock_prog(&tmp_domain, current_domain, prog);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * Each task_struct points to an array of prog list pointers.  These
+	 * tables are duplicated when additions are made (which means each
+	 * table needs to be refcounted for the processes using it). When a new
+	 * table is created, all the refcounters on the prog_list are bumped
+	 * (to track each table that references the prog). When a new prog is
+	 * added, it's just prepended to the list for the new table to point
+	 * at.
+	 *
+	 * Manage all the possible errors before this step to not uselessly
+	 * duplicate current_domain and avoid a rollback.
+	 */
+	if (!new_domain) {
+		/*
+		 * If there is no Landlock domain used by the current task,
+		 * then create a new one.
+		 */
+		new_domain = new_landlock_domain();
+		if (IS_ERR(new_domain))
+			goto put_tmp_lists;
+	} else if (refcount_read(&current_domain->usage) > 1) {
+		/*
+		 * If the current task is not the sole user of its Landlock
+		 * domain, then duplicate it.
+		 */
+		new_domain = new_landlock_domain();
+		if (IS_ERR(new_domain))
+			goto put_tmp_lists;
+		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
+			new_domain->programs[i] =
+				READ_ONCE(current_domain->programs[i]);
+			if (new_domain->programs[i])
+				refcount_inc(&new_domain->programs[i]->usage);
+		}
+
+		/*
+		 * Landlock domain from the current task will not be freed here
+		 * because the usage is strictly greater than 1. It is only
+		 * prevented to be freed by another task thanks to the caller
+		 * of landlock_prepend_prog() which should be locked if needed.
+		 */
+		landlock_put_domain(current_domain);
+	}
+
+	/* prepend tmp_domain to new_domain */
+	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
+		/* get the last new list */
+		struct landlock_prog_list *last_list =
+			tmp_domain.programs[i];
+
+		if (last_list) {
+			while (last_list->prev)
+				last_list = last_list->prev;
+			/* no need to increment usage (pointer replacement) */
+			last_list->prev = new_domain->programs[i];
+			new_domain->programs[i] = tmp_domain.programs[i];
+		}
+	}
+	return new_domain;
+
+put_tmp_lists:
+	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
+		put_landlock_prog_list(tmp_domain.programs[i]);
+	return new_domain;
+}
diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
new file mode 100644
index 000000000000..5b5b49f6e3e8
--- /dev/null
+++ b/security/landlock/domain_manage.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - domain management headers
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
+#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
+
+#include <linux/filter.h>
+
+#include "common.h"
+
+void landlock_get_domain(struct landlock_domain *dom);
+void landlock_put_domain(struct landlock_domain *dom);
+
+struct landlock_domain *landlock_prepend_prog(
+		struct landlock_domain *current_domain,
+		struct bpf_prog *prog);
+
+#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 1/7] bpf,landlock: Define an eBPF program type for Landlock hooks
From: Mickaël Salaün @ 2019-10-29 17:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

Add a new type of eBPF program used by Landlock hooks.  The goal of this
type of program is to accept or deny a requested access from userspace
to a kernel object (e.g. process).  This will be more useful with the
next commit adding a new eBPF helper.

This new BPF program type will be registered with the Landlock LSM
initialization.

Add an initial Landlock Kconfig and update the MAINTAINERS file.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* replace file system program types with a (simpler) ptrace program type
* add an eBPF task pointer type
* split files

Changes since v9:
* handle inode put and map put, which fix unmount (reported by Al Viro)
* replace subtype with expected_attach_type and expected_attach_triggers
* check eBPF program return code

Changes since v8:
* Remove the chaining concept from the eBPF program contexts (chain and
  cookie). We need to keep these subtypes this way to be able to make
  them evolve, though.
* remove bpf_landlock_put_extra() because there is no more a "previous"
  field to free (for now)

Changes since v7:
* cosmetic fixes
* rename LANDLOCK_SUBTYPE_* to LANDLOCK_*
* cleanup UAPI definitions and move them from bpf.h to landlock.h
  (suggested by Alexei Starovoitov)
* disable Landlock by default (suggested by Alexei Starovoitov)
* rename BPF_PROG_TYPE_LANDLOCK_{RULE,HOOK}
* update the Kconfig
* update the MAINTAINERS file
* replace the IOCTL, LOCK and FCNTL events with FS_PICK, FS_WALK and
  FS_GET hook types
* add the ability to chain programs with an eBPF program file descriptor
  (i.e. the "previous" field in a Landlock subtype) and keep a state
  with a "cookie" value available from the context
* add a "triggers" subtype bitfield to match specific actions (e.g.
  append, chdir, read...)

Changes since v6:
* add 3 more sub-events: IOCTL, LOCK, FCNTL
  https://lkml.kernel.org/r/2fbc99a6-f190-f335-bd14-04bdeed35571@digikod.net
* rename LANDLOCK_VERSION to LANDLOCK_ABI to better reflect its purpose,
  and move it from landlock.h to common.h
* rename BPF_PROG_TYPE_LANDLOCK to BPF_PROG_TYPE_LANDLOCK_RULE: an eBPF
  program could be used for something else than a rule
* simplify struct landlock_context by removing the arch and syscall_nr fields
* remove all eBPF map functions call, remove ABILITY_WRITE
* refactor bpf_landlock_func_proto() (suggested by Kees Cook)
* constify pointers
* fix doc inclusion

Changes since v5:
* rename file hooks.c to init.c
* fix spelling

Changes since v4:
* merge a minimal (not enabled) LSM code and Kconfig in this commit

Changes since v3:
* split commit
* revamp the landlock_context:
  * add arch, syscall_nr and syscall_cmd (ioctl, fcntl…) to be able to
    cross-check action with the event type
  * replace args array with dedicated fields to ease the addition of new
    fields
---
 MAINTAINERS                    |  8 ++++
 include/linux/bpf.h            |  1 +
 include/linux/bpf_types.h      |  3 ++
 include/uapi/linux/bpf.h       |  2 +
 include/uapi/linux/landlock.h  | 39 ++++++++++++++++
 kernel/bpf/syscall.c           |  9 ++++
 kernel/bpf/verifier.c          |  7 +++
 security/Kconfig               |  1 +
 security/Makefile              |  2 +
 security/landlock/Kconfig      | 19 ++++++++
 security/landlock/Makefile     |  4 ++
 security/landlock/bpf_ptrace.c | 30 ++++++++++++
 security/landlock/bpf_ptrace.h | 17 +++++++
 security/landlock/bpf_verify.c | 83 ++++++++++++++++++++++++++++++++++
 security/landlock/common.h     | 30 ++++++++++++
 15 files changed, 255 insertions(+)
 create mode 100644 include/uapi/linux/landlock.h
 create mode 100644 security/landlock/Kconfig
 create mode 100644 security/landlock/Makefile
 create mode 100644 security/landlock/bpf_ptrace.c
 create mode 100644 security/landlock/bpf_ptrace.h
 create mode 100644 security/landlock/bpf_verify.c
 create mode 100644 security/landlock/common.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 7fc074632eac..4cabb85ea52d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9146,6 +9146,14 @@ F:	net/core/skmsg.c
 F:	net/core/sock_map.c
 F:	net/ipv4/tcp_bpf.c
 
+LANDLOCK SECURITY MODULE
+M:	Mickaël Salaün <mic@digikod.net>
+S:	Supported
+F:	include/uapi/linux/landlock.h
+F:	security/landlock/
+K:	landlock
+K:	LANDLOCK
+
 LANTIQ / INTEL Ethernet drivers
 M:	Hauke Mehrtens <hauke@hauke-m.de>
 L:	netdev@vger.kernel.org
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 171be30fe0ae..819a3e207438 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -291,6 +291,7 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_TASK,		 /* reg points to struct task_struct */
 };
 
 /* The information passed from prog-specific *_is_valid_access
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 36a9c2325176..bddabc961a3b 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -38,6 +38,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+#ifdef CONFIG_SECURITY_LANDLOCK
+BPF_PROG_TYPE(BPF_PROG_TYPE_LANDLOCK_HOOK, landlock)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4af8b0819a32..6e4147790f96 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -173,6 +173,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_LANDLOCK_HOOK,
 };
 
 enum bpf_attach_type {
@@ -199,6 +200,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_LANDLOCK_PTRACE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
new file mode 100644
index 000000000000..3ffe3cbdbad6
--- /dev/null
+++ b/include/uapi/linux/landlock.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Landlock - UAPI headers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _UAPI__LINUX_LANDLOCK_H__
+#define _UAPI__LINUX_LANDLOCK_H__
+
+#include <linux/types.h>
+
+/**
+ * DOC: landlock_ret
+ *
+ * The return value of a landlock program is a bitmask that can allow or deny
+ * the action for which the program is run.
+ *
+ * In the future, this could be used to trigger an audit event as well.
+ *
+ * - %LANDLOCK_RET_ALLOW
+ * - %LANDLOCK_RET_DENY
+ */
+#define LANDLOCK_RET_ALLOW	0
+#define LANDLOCK_RET_DENY	1
+
+/**
+ * struct landlock_context_ptrace - context accessible to BPF_LANDLOCK_PTRACE
+ *
+ * @tracer: pointer to the task requesting to debug @tracee
+ * @tracee: pointer to the task being debugged
+ */
+struct landlock_context_ptrace {
+	__u64 tracer;
+	__u64 tracee;
+};
+
+#endif /* _UAPI__LINUX_LANDLOCK_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ff5225759553..5159e582a0d8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1621,6 +1621,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+#ifdef CONFIG_SECURITY_LANDLOCK
+	case BPF_PROG_TYPE_LANDLOCK_HOOK:
+		switch (expected_attach_type) {
+		case BPF_LANDLOCK_PTRACE:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+#endif
 	default:
 		return 0;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c59778c0fc4d..ebf1991906b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -421,6 +421,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 	[PTR_TO_BTF_ID]		= "ptr_",
+	[PTR_TO_TASK]		= "task",
 };
 
 static char slot_type_char[] = {
@@ -1878,6 +1879,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_TASK:
 		return true;
 	default:
 		return false;
@@ -2600,6 +2602,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_XDP_SOCK:
 		pointer_desc = "xdp_sock ";
 		break;
+	case PTR_TO_TASK:
+		pointer_desc = "task ";
+		break;
 	default:
 		break;
 	}
@@ -4527,6 +4532,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_TASK:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -6278,6 +6284,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_LANDLOCK_HOOK:
 		break;
 	default:
 		return 0;
diff --git a/security/Kconfig b/security/Kconfig
index 2a1a2d396228..9d9981394fb0 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -238,6 +238,7 @@ source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
 source "security/safesetid/Kconfig"
 source "security/lockdown/Kconfig"
+source "security/landlock/Kconfig"
 
 source "security/integrity/Kconfig"
 
diff --git a/security/Makefile b/security/Makefile
index be1dd9d2cb2f..60b7f6f2fd30 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -12,6 +12,7 @@ subdir-$(CONFIG_SECURITY_YAMA)		+= yama
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
 subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
 subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown
+subdir-$(CONFIG_SECURITY_LANDLOCK)		+= landlock
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -29,6 +30,7 @@ obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
 obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
+obj-$(CONFIG_SECURITY_LANDLOCK)	+= landlock/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
 
 # Object integrity file lists
diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig
new file mode 100644
index 000000000000..44921bd72380
--- /dev/null
+++ b/security/landlock/Kconfig
@@ -0,0 +1,19 @@
+config SECURITY_LANDLOCK
+	bool "Landlock support"
+	depends on SECURITY
+	depends on BPF_SYSCALL
+	depends on SECCOMP_FILTER
+	default n
+	help
+	  This selects Landlock, a programmatic access control.  It enables to
+	  restrict processes on the fly (i.e. create a sandbox) or log some
+	  actions.  The security policy is a set of eBPF programs, dedicated to
+	  allow or deny a list of actions on specific kernel objects (e.g.
+	  process).
+
+	  You need to enable seccomp filter to apply a security policy to a
+	  process hierarchy (e.g. application with built-in sandboxing).
+
+	  See Documentation/security/landlock/ for further information.
+
+	  If you are unsure how to answer this question, answer N.
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
new file mode 100644
index 000000000000..682b798c6b76
--- /dev/null
+++ b/security/landlock/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
+
+landlock-y := \
+	bpf_verify.o bpf_ptrace.o
diff --git a/security/landlock/bpf_ptrace.c b/security/landlock/bpf_ptrace.c
new file mode 100644
index 000000000000..2ec73078ad01
--- /dev/null
+++ b/security/landlock/bpf_ptrace.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - eBPF ptrace
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#include <linux/bpf.h>
+#include <uapi/linux/landlock.h>
+
+#include "bpf_ptrace.h"
+
+bool landlock_is_valid_access_ptrace(int off, enum bpf_access_type type,
+		enum bpf_reg_type *reg_type, int *max_size)
+{
+	if (type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case offsetof(struct landlock_context_ptrace, tracer):
+		/* fall through */
+	case offsetof(struct landlock_context_ptrace, tracee):
+		*reg_type = PTR_TO_TASK;
+		*max_size = sizeof(u64);
+		return true;
+	default:
+		return false;
+	}
+}
diff --git a/security/landlock/bpf_ptrace.h b/security/landlock/bpf_ptrace.h
new file mode 100644
index 000000000000..85edce37b70a
--- /dev/null
+++ b/security/landlock/bpf_ptrace.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - eBPF ptrace headers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_BPF_PTRACE_H
+#define _SECURITY_LANDLOCK_BPF_PTRACE_H
+
+#include <linux/bpf.h>
+
+bool landlock_is_valid_access_ptrace(int off, enum bpf_access_type type,
+		enum bpf_reg_type *reg_type, int *max_size);
+
+#endif /* _SECURITY_LANDLOCK_BPF_PTRACE_H */
diff --git a/security/landlock/bpf_verify.c b/security/landlock/bpf_verify.c
new file mode 100644
index 000000000000..6ed921588178
--- /dev/null
+++ b/security/landlock/bpf_verify.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - eBPF program verifications
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#include "common.h"
+#include "bpf_ptrace.h"
+
+static bool bpf_landlock_is_valid_access(int off, int size,
+		enum bpf_access_type type, const struct bpf_prog *prog,
+		struct bpf_insn_access_aux *info)
+{
+	enum bpf_reg_type reg_type = NOT_INIT;
+	int max_size = 0;
+
+	if (WARN_ON(!prog->expected_attach_type))
+		return false;
+
+	if (off < 0)
+		return false;
+	if (size <= 0 || size > sizeof(__u64))
+		return false;
+
+	/* set register type and max size */
+	switch (get_hook_type(prog)) {
+	case LANDLOCK_HOOK_PTRACE:
+		if (!landlock_is_valid_access_ptrace(off, type, &reg_type,
+					&max_size))
+			return false;
+		break;
+	}
+
+	/* check memory range access */
+	switch (reg_type) {
+	case NOT_INIT:
+		return false;
+	case SCALAR_VALUE:
+		/* allow partial raw value */
+		if (size > max_size)
+			return false;
+		info->ctx_field_size = max_size;
+		break;
+	default:
+		/* deny partial pointer */
+		if (size != max_size)
+			return false;
+	}
+
+	info->reg_type = reg_type;
+	return true;
+}
+
+static const struct bpf_func_proto *bpf_landlock_func_proto(
+		enum bpf_func_id func_id,
+		const struct bpf_prog *prog)
+{
+	if (WARN_ON(!prog->expected_attach_type))
+		return NULL;
+
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		return NULL;
+	}
+}
+
+const struct bpf_verifier_ops landlock_verifier_ops = {
+	.get_func_proto	= bpf_landlock_func_proto,
+	.is_valid_access = bpf_landlock_is_valid_access,
+};
+
+const struct bpf_prog_ops landlock_prog_ops = {};
diff --git a/security/landlock/common.h b/security/landlock/common.h
new file mode 100644
index 000000000000..0234c4bc4acd
--- /dev/null
+++ b/security/landlock/common.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - private headers
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_COMMON_H
+#define _SECURITY_LANDLOCK_COMMON_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+enum landlock_hook_type {
+	LANDLOCK_HOOK_PTRACE = 1,
+};
+
+static inline enum landlock_hook_type get_hook_type(const struct bpf_prog *prog)
+{
+	switch (prog->expected_attach_type) {
+	case BPF_LANDLOCK_PTRACE:
+		return LANDLOCK_HOOK_PTRACE;
+	default:
+		WARN_ON(1);
+		return BPF_LANDLOCK_PTRACE;
+	}
+}
+
+#endif /* _SECURITY_LANDLOCK_COMMON_H */
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 3/7] landlock,seccomp: Load Landlock programs per process hierarchy
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

The seccomp(2) syscall can be used by a task to apply a Landlock program
to itself. As a seccomp filter, a Landlock program is enforced for the
current task and all its future children. A program is immutable and a
task can only add new restricting programs to itself, forming a list of
programs.

A Landlock program is tied to a Landlock hook. If the action on a kernel
object is allowed by the other Linux security mechanisms (e.g. DAC,
capabilities, other LSM), then a Landlock hook related to this kind of
object is triggered. The list of programs for this hook is then
evaluated. Each program return a binary value which can deny the action
on a kernel object with a non-zero value. If every programs of the list
return zero, then the action on the object is allowed.

The next commit adds the LSM hooks to enforce the memory protection
programs on the appropriate process hierarchies.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
Link: https://lore.kernel.org/lkml/c10a503d-5e35-7785-2f3d-25ed8dd63fab@digikod.net/
---

Changes since v10:
* rewrite the Landlock program attaching mechanisme to not rely on
  internal seccomp structures but only on the (LSM-stacked) task's
  credentials:
  * make the use of seccomp (and task's credentials) optional if not
    relying on its syscall, which may be useful for domains defined by
    other means (e.g. cgroups or system-wide thanks to a dedicated
    securityfs)

Changes since v9:
* replace subtype with expected_attach_type and expected_attach_triggers

Changes since v8:
* Remove the chaining concept from the eBPF program contexts (chain and
  cookie). We need to keep these subtypes this way to be able to make
  them evolve, though.

Changes since v7:
* handle and verify program chains
* split and rename providers.c to enforce.c and enforce_seccomp.c
* rename LANDLOCK_SUBTYPE_* to LANDLOCK_*

Changes since v6:
* rename some functions with more accurate names to reflect that an eBPF
  program for Landlock could be used for something else than a rule
* reword rule "appending" to "prepending" and explain it
* remove the superfluous no_new_privs check, only check global
  CAP_SYS_ADMIN when prepending a Landlock rule (needed for containers)
* create and use {get,put}_seccomp_landlock() (suggested by Kees Cook)
* replace ifdef with static inlined function (suggested by Kees Cook)
* use get_user() (suggested by Kees Cook)
* replace atomic_t with refcount_t (requested by Kees Cook)
* move struct landlock_{rule,events} from landlock.h to common.h
* cleanup headers

Changes since v5:
* remove struct landlock_node and use a similar inheritance mechanisme
  as seccomp-bpf (requested by Andy Lutomirski)
* rename SECCOMP_ADD_LANDLOCK_RULE to SECCOMP_APPEND_LANDLOCK_RULE
* rename file manager.c to providers.c
* add comments
* typo and cosmetic fixes

Changes since v4:
* merge manager and seccomp patches
* return -EFAULT in seccomp(2) when user_bpf_fd is null to easely check
  if Landlock is supported
* only allow a process with the global CAP_SYS_ADMIN to use Landlock
  (will be lifted in the future)
* add an early check to exit as soon as possible if the current process
  does not have Landlock rules

Changes since v3:
* remove the hard link with seccomp (suggested by Andy Lutomirski and
  Kees Cook):
  * remove the cookie which could imply multiple evaluation of Landlock
    rules
  * remove the origin field in struct landlock_data
* remove documentation fix (merged upstream)
* rename the new seccomp command to SECCOMP_ADD_LANDLOCK_RULE
* internal renaming
* split commit
* new design to be able to inherit on the fly the parent rules

Changes since v2:
* Landlock programs can now be run without seccomp filter but for any
  syscall (from the process) or interruption
* move Landlock related functions and structs into security/landlock/*
  (to manage cgroups as well)
* fix seccomp filter handling: run Landlock programs for each of their
  legitimate seccomp filter
* properly clean up all seccomp results
* cosmetic changes to ease the understanding
* fix some ifdef
---
 MAINTAINERS                        |  1 +
 include/linux/landlock.h           | 25 +++++++++
 include/linux/lsm_hooks.h          |  1 +
 include/uapi/linux/seccomp.h       |  1 +
 kernel/seccomp.c                   |  4 ++
 security/landlock/Makefile         |  5 +-
 security/landlock/common.h         | 16 ++++++
 security/landlock/domain_syscall.c | 87 ++++++++++++++++++++++++++++++
 security/landlock/hooks_cred.c     | 47 ++++++++++++++++
 security/landlock/hooks_cred.h     | 14 +++++
 security/landlock/init.c           | 30 +++++++++++
 security/security.c                | 15 ++++++
 12 files changed, 244 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/landlock.h
 create mode 100644 security/landlock/domain_syscall.c
 create mode 100644 security/landlock/hooks_cred.c
 create mode 100644 security/landlock/hooks_cred.h
 create mode 100644 security/landlock/init.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 4cabb85ea52d..32bfd88159b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9149,6 +9149,7 @@ F:	net/ipv4/tcp_bpf.c
 LANDLOCK SECURITY MODULE
 M:	Mickaël Salaün <mic@digikod.net>
 S:	Supported
+F:	include/linux/landlock.h
 F:	include/uapi/linux/landlock.h
 F:	security/landlock/
 K:	landlock
diff --git a/include/linux/landlock.h b/include/linux/landlock.h
new file mode 100644
index 000000000000..ffbf2397c459
--- /dev/null
+++ b/include/linux/landlock.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Landlock LSM - public kernel headers
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _LINUX_LANDLOCK_H
+#define _LINUX_LANDLOCK_H
+
+#include <linux/errno.h>
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_SECURITY_LANDLOCK)
+extern int landlock_seccomp_prepend_prog(unsigned int flags,
+		const int __user *user_bpf_fd);
+#else /* CONFIG_SECCOMP_FILTER && CONFIG_SECURITY_LANDLOCK */
+static inline int landlock_seccomp_prepend_prog(unsigned int flags,
+		const int __user *user_bpf_fd)
+{
+		return -EINVAL;
+}
+#endif /* CONFIG_SECCOMP_FILTER && CONFIG_SECURITY_LANDLOCK */
+
+#endif /* _LINUX_LANDLOCK_H */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a3763247547c..a8ba679b388a 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -2106,6 +2106,7 @@ extern void security_add_hooks(struct security_hook_list *hooks, int count,
 enum lsm_order {
 	LSM_ORDER_FIRST = -1,	/* This is only for capabilities. */
 	LSM_ORDER_MUTABLE = 0,
+	LSM_ORDER_LAST = 1,	/* potentially-unprivileged LSM */
 };
 
 struct lsm_info {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 90734aa5aa36..bce6534e7feb 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
 #define SECCOMP_SET_MODE_FILTER		1
 #define SECCOMP_GET_ACTION_AVAIL	2
 #define SECCOMP_GET_NOTIF_SIZES		3
+#define SECCOMP_PREPEND_LANDLOCK_PROG	4
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..af542a2d21e7 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,6 +41,7 @@
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
+#include <linux/landlock.h>
 
 enum notify_state {
 	SECCOMP_NOTIFY_INIT,
@@ -1397,6 +1398,9 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 			return -EINVAL;
 
 		return seccomp_get_notif_sizes(uargs);
+	case SECCOMP_PREPEND_LANDLOCK_PROG:
+		return landlock_seccomp_prepend_prog(flags,
+				(const int __user *)uargs);
 	default:
 		return -EINVAL;
 	}
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index dd5f70185778..0b291f2c027c 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
-landlock-y := \
+landlock-y := init.o \
 	bpf_verify.o bpf_ptrace.o \
-	domain_manage.o
+	domain_manage.o domain_syscall.o \
+	hooks_cred.o
diff --git a/security/landlock/common.h b/security/landlock/common.h
index fb2990eb5fb4..3ae8340a5b3d 100644
--- a/security/landlock/common.h
+++ b/security/landlock/common.h
@@ -10,9 +10,15 @@
 #define _SECURITY_LANDLOCK_COMMON_H
 
 #include <linux/bpf.h>
+#include <linux/cred.h>
 #include <linux/filter.h>
+#include <linux/lsm_hooks.h>
 #include <linux/refcount.h>
 
+#define LANDLOCK_NAME "landlock"
+
+extern struct lsm_blob_sizes landlock_blob_sizes;
+
 enum landlock_hook_type {
 	LANDLOCK_HOOK_PTRACE = 1,
 };
@@ -43,6 +49,16 @@ struct landlock_domain {
 	refcount_t usage;
 };
 
+struct landlock_cred_security {
+	struct landlock_domain *domain;
+};
+
+static inline struct landlock_cred_security *landlock_cred(
+		const struct cred *cred)
+{
+	return cred->security + landlock_blob_sizes.lbs_cred;
+}
+
 /**
  * get_hook_index - get an index for the programs of struct landlock_prog_set
  *
diff --git a/security/landlock/domain_syscall.c b/security/landlock/domain_syscall.c
new file mode 100644
index 000000000000..62daa630353e
--- /dev/null
+++ b/security/landlock/domain_syscall.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - seccomp syscall
+ *
+ * Copyright © 2016-2018 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifdef CONFIG_SECCOMP_FILTER
+
+#include <asm/current.h>
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/filter.h>
+#include <linux/landlock.h>
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
+#include <linux/uaccess.h>
+
+#include "common.h"
+#include "domain_manage.h"
+
+/**
+ * landlock_seccomp_prepend_prog - attach a Landlock program to the current
+ *                                 task
+ *
+ * current->cred->security[landlock]->domain is lazily allocated. When a new
+ * credential is created, only a pointer is copied.  When a new Landlock
+ * program is added by a task, if there is other references to this task's
+ * domain, then a new allocation is made to contain an array pointing to
+ * Landlock program lists.  This design enable low-performance impact and is
+ * memory efficient while keeping the property of prepend-only programs.
+ *
+ * For now, installing a Landlock program requires that the requesting task has
+ * the global CAP_SYS_ADMIN. We cannot force the use of no_new_privs to not
+ * exclude containers where a process may legitimately acquire more privileges
+ * thanks to an SUID binary.
+ *
+ * @flags: not used, must be 0
+ * @user_bpf_fd: file descriptor pointing to a loaded Landlock prog
+ */
+int landlock_seccomp_prepend_prog(unsigned int flags,
+		const int __user *user_bpf_fd)
+{
+	struct landlock_domain *new_domain;
+	struct cred *cred_new;
+	struct landlock_cred_security *llcred_new;
+	struct bpf_prog *prog;
+	int bpf_fd, err;
+
+	/* planned to be replaced with a no_new_privs check to allow
+	 * unprivileged tasks */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* enable to check if Landlock is supported with early EFAULT */
+	if (!user_bpf_fd)
+		return -EFAULT;
+	if (flags)
+		return -EINVAL;
+	err = get_user(bpf_fd, user_bpf_fd);
+	if (err)
+		return err;
+	prog = bpf_prog_get(bpf_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	cred_new = prepare_creds();
+	if (!cred_new) {
+		bpf_prog_put(prog);
+		return -ENOMEM;
+	}
+	llcred_new = landlock_cred(cred_new);
+	/* the new creds are an atomic copy of the current creds */
+	new_domain = landlock_prepend_prog(llcred_new->domain, prog);
+	bpf_prog_put(prog);
+	if (IS_ERR(new_domain)) {
+		abort_creds(cred_new);
+		return PTR_ERR(new_domain);
+	}
+	llcred_new->domain = new_domain;
+	return commit_creds(cred_new);
+}
+
+#endif /* CONFIG_SECCOMP_FILTER */
diff --git a/security/landlock/hooks_cred.c b/security/landlock/hooks_cred.c
new file mode 100644
index 000000000000..def8678019a0
--- /dev/null
+++ b/security/landlock/hooks_cred.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - credential hooks
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <linux/cred.h>
+#include <linux/lsm_hooks.h>
+
+#include "common.h"
+#include "domain_manage.h"
+#include "hooks_cred.h"
+
+static int hook_cred_prepare(struct cred *new, const struct cred *old,
+		gfp_t gfp)
+{
+	const struct landlock_cred_security *cred_old = landlock_cred(old);
+	struct landlock_cred_security *cred_new = landlock_cred(new);
+	struct landlock_domain *dom_old;
+
+	dom_old = cred_old->domain;
+	if (dom_old) {
+		landlock_get_domain(dom_old);
+		cred_new->domain = dom_old;
+	} else {
+		cred_new->domain = NULL;
+	}
+	return 0;
+}
+
+static void hook_cred_free(struct cred *cred)
+{
+	landlock_put_domain(landlock_cred(cred)->domain);
+}
+
+static struct security_hook_list landlock_hooks[] = {
+	LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
+	LSM_HOOK_INIT(cred_free, hook_cred_free),
+};
+
+__init void landlock_add_hooks_cred(void)
+{
+	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+			LANDLOCK_NAME);
+}
diff --git a/security/landlock/hooks_cred.h b/security/landlock/hooks_cred.h
new file mode 100644
index 000000000000..641d66f6bf9a
--- /dev/null
+++ b/security/landlock/hooks_cred.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - credential hooks headers
+ *
+ * Copyright © 2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_HOOKS_CRED_H
+#define _SECURITY_LANDLOCK_HOOKS_CRED_H
+
+__init void landlock_add_hooks_cred(void);
+
+#endif /* _SECURITY_LANDLOCK_HOOKS_CRED_H */
diff --git a/security/landlock/init.c b/security/landlock/init.c
new file mode 100644
index 000000000000..8836ec4defd3
--- /dev/null
+++ b/security/landlock/init.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - initialization
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <linux/lsm_hooks.h>
+
+#include "common.h"
+#include "hooks_cred.h"
+
+static int __init landlock_init(void)
+{
+	pr_info(LANDLOCK_NAME ": Registering hooks\n");
+	landlock_add_hooks_cred();
+	return 0;
+}
+
+struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
+	.lbs_cred = sizeof(struct landlock_cred_security),
+};
+
+DEFINE_LSM(LANDLOCK_NAME) = {
+	.name = LANDLOCK_NAME,
+	.order = LSM_ORDER_LAST,
+	.blobs = &landlock_blob_sizes,
+	.init = landlock_init,
+};
diff --git a/security/security.c b/security/security.c
index 1bc000f834e2..03c7dce9e014 100644
--- a/security/security.c
+++ b/security/security.c
@@ -264,6 +264,21 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 		}
 	}
 
+	/*
+	 * In case of an unprivileged access-control, we don't want to give the
+	 * ability to any process to do some checks (e.g. through an eBPF
+	 * program) on kernel objects (e.g. files) if a privileged security
+	 * policy forbid their access.  We must then load
+	 * potentially-unprivileged security modules after all other LSMs.
+	 *
+	 * LSM_ORDER_LAST is always last and does not appear in the modifiable
+	 * ordered list of enabled LSMs.
+	 */
+	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+		if (lsm->order == LSM_ORDER_LAST)
+			append_ordered_lsm(lsm, "last");
+	}
+
 	/* Disable all LSMs not in the ordered list. */
 	for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
 		if (exists_ordered_lsm(lsm))
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 7/7] landlock: Add user and kernel documentation for Landlock
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

This documentation can be built with the Sphinx framework.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* replace the filesystem hooks with the ptrace one
* remove the triggers
* update example
* add documenation for Landlock domains and seccomp interaction
* reference more kernel documenation (e.g. LSM hooks)

Changes since v9:
* update with expected attach type and expected attach triggers

Changes since v8:
* remove documentation related to chaining and tagging according to this
  patch series

Changes since v7:
* update documentation according to the Landlock revamp

Changes since v6:
* add a check for ctx->event
* rename BPF_PROG_TYPE_LANDLOCK to BPF_PROG_TYPE_LANDLOCK_RULE
* rename Landlock version to ABI to better reflect its purpose and add a
  dedicated changelog section
* update tables
* relax no_new_privs recommendations
* remove ABILITY_WRITE related functions
* reword rule "appending" to "prepending" and explain it
* cosmetic fixes

Changes since v5:
* update the rule hierarchy inheritance explanation
* briefly explain ctx->arg2
* add ptrace restrictions
* explain EPERM
* update example (subtype)
* use ":manpage:"
---
 Documentation/security/index.rst           |   1 +
 Documentation/security/landlock/index.rst  |  22 ++++
 Documentation/security/landlock/kernel.rst | 139 ++++++++++++++++++++
 Documentation/security/landlock/user.rst   | 142 +++++++++++++++++++++
 4 files changed, 304 insertions(+)
 create mode 100644 Documentation/security/landlock/index.rst
 create mode 100644 Documentation/security/landlock/kernel.rst
 create mode 100644 Documentation/security/landlock/user.rst

diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index fc503dd689a7..4d213e76ddf4 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -15,3 +15,4 @@ Security Documentation
    self-protection
    siphash
    tpm/index
+   landlock/index
diff --git a/Documentation/security/landlock/index.rst b/Documentation/security/landlock/index.rst
new file mode 100644
index 000000000000..1eced757b05d
--- /dev/null
+++ b/Documentation/security/landlock/index.rst
@@ -0,0 +1,22 @@
+=========================================
+Landlock LSM: programmatic access control
+=========================================
+
+:Author: Mickaël Salaün
+
+Landlock is a stackable Linux Security Module (LSM) that makes it possible to
+create security sandboxes, programmable access-controls or safe endpoint
+security agents.  This kind of sandbox is expected to help mitigate the
+security impact of bugs or unexpected/malicious behaviors in user-space
+applications.  The current version allows only a process with the global
+CAP_SYS_ADMIN capability to create such sandboxes but the ultimate goal of
+Landlock is to empower any process, including unprivileged ones, to securely
+restrict themselves.  Landlock is inspired by seccomp-bpf but instead of
+filtering syscalls and their raw arguments, a Landlock rule can inspect the use
+of kernel objects like processes and hence make a decision according to the
+kernel semantic.
+
+.. toctree::
+
+    user
+    kernel
diff --git a/Documentation/security/landlock/kernel.rst b/Documentation/security/landlock/kernel.rst
new file mode 100644
index 000000000000..0be906f92c3e
--- /dev/null
+++ b/Documentation/security/landlock/kernel.rst
@@ -0,0 +1,139 @@
+==============================
+Landlock: kernel documentation
+==============================
+
+eBPF properties
+===============
+
+To get an expressive language while still being safe and small, Landlock is
+based on eBPF. Landlock should be usable by untrusted processes and must
+therefore expose a minimal attack surface. The eBPF bytecode is minimal,
+powerful, widely used and designed to be used by untrusted applications. Thus,
+reusing the eBPF support in the kernel enables a generic approach while
+minimizing new code.
+
+An eBPF program has access to an eBPF context containing some fields used to
+inspect the current object. These arguments may be used directly (e.g. raw
+value) or passed to helper functions according to their types (e.g. pointer).
+It is then possible to do complex access checks without race conditions or
+inconsistent evaluation (i.e.  `incorrect mirroring of the OS code and state
+<https://www.ndss-symposium.org/ndss2003/traps-and-pitfalls-practical-problems-system-call-interposition-based-security-tools/>`_).
+
+A Landlock hook describes a particular access type.  For now, there is one hook
+dedicated to ptrace related operations: BPF_LANDLOCK_PTRACE.  A Landlock
+program is tied to one hook.  This makes it possible to statically check
+context accesses, potentially performed by such program, and hence prevents
+kernel address leaks and ensure the right use of hook arguments with eBPF
+functions.  Any user can add multiple Landlock programs per Landlock hook.
+They are stacked and evaluated one after the other, starting from the most
+recent program, as seccomp-bpf does with its filters.  Underneath, a hook is an
+abstraction over a set of LSM hooks.
+
+
+Guiding principles
+==================
+
+Unprivileged use
+----------------
+
+* Landlock helpers and context should be usable by any unprivileged and
+  untrusted program while following the system security policy enforced by
+  other access control mechanisms (e.g. DAC, LSM), even if a global
+  CAP_SYS_ADMIN is currently required.
+
+
+Landlock hook and context
+-------------------------
+
+* A Landlock hook shall be focused on access control on kernel objects instead
+  of syscall filtering (i.e. syscall arguments), which is the purpose of
+  seccomp-bpf.
+* A Landlock context provided by a hook shall express the minimal and more
+  generic interface to control an access for a kernel object.
+* A hook shall guaranty that all the BPF function calls from a program are
+  safe.  Thus, the related Landlock context arguments shall always be of the
+  same type for a particular hook.  For example, a network hook could share
+  helpers with a file hook because of UNIX socket.  However, the same helpers
+  may not be compatible for a file system handle and a net handle.
+* Multiple hooks may use the same context interface.
+
+
+Landlock helpers
+----------------
+
+* Landlock helpers shall be as generic as possible while at the same time being
+  as simple as possible and following the syscall creation principles (cf.
+  *Documentation/adding-syscalls.txt*).
+* The only behavior change allowed on a helper is to fix a (logical) bug to
+  match the initial semantic.
+* Helpers shall be reentrant, i.e. only take inputs from arguments (e.g. from
+  the BPF context), to enable a hook to use a cache.  Future program options
+  might change this cache behavior.
+* It is quite easy to add new helpers to extend Landlock.  The main concern
+  should be about the possibility to leak information from the kernel that may
+  not be accessible otherwise (i.e. side-channel attack).
+
+
+Landlock domain
+===============
+
+A Landlock domain is a set of eBPF programs.  There is a list for each
+different program types that can be run on a specific Landlock hook (e.g.
+ptrace).  A domain is tied to a set of subjects (i.e. tasks).
+
+A Landlock program should not try (nor be able) to infer which subject is
+currently enforced, but to have a unique security policy for all subjects tied
+to the same domain.  This make the reasoning much easier and help avoid
+pitfalls.
+
+.. kernel-doc:: security/landlock/common.h
+    :functions: landlock_domain
+
+.. kernel-doc:: security/landlock/domain_manage.c
+    :functions: landlock_prepend_prog
+
+
+Adding a Landlock program with seccomp
+--------------------------------------
+
+The :manpage:`seccomp(2)` syscall can be used with the
+`SECCOMP_PREPEND_LANDLOCK_PROG` operation to prepend a Landlock program to the
+current task's domain.
+
+.. kernel-doc:: security/landlock/domain_syscall.c
+    :functions: landlock_seccomp_prepend_prog
+
+
+Running a list of Landlock programs
+-----------------------------------
+
+.. kernel-doc:: security/landlock/bpf_run.c
+    :functions: landlock_access_denied
+
+
+LSM hooks
+=========
+
+.. kernel-doc:: security/landlock/hooks_ptrace.c
+    :functions: hook_ptrace_access_check
+
+.. kernel-doc:: security/landlock/hooks_ptrace.c
+    :functions: hook_ptrace_traceme
+
+
+Questions and answers
+=====================
+
+Why a program does not return an errno or a kill code?
+------------------------------------------------------
+
+seccomp filters can return multiple kind of code, including an errno value or a
+kill signal, which may be convenient for access control.  Those return codes
+are hardwired in the userland ABI.  Instead, Landlock's approach is to return a
+bitmask to allow or deny an action, which is much simpler and more generic.
+Moreover, we do not really have a choice because, unlike to seccomp, Landlock
+programs are not enforced at the syscall entry point but may be executed at any
+point in the kernel (through LSM hooks) where an errno return code may not make
+sense.  However, with this simple ABI and with the ability to call helpers,
+Landlock may gain features similar to seccomp-bpf in the future while being
+compatible with previous programs.
diff --git a/Documentation/security/landlock/user.rst b/Documentation/security/landlock/user.rst
new file mode 100644
index 000000000000..e7aa9a260a86
--- /dev/null
+++ b/Documentation/security/landlock/user.rst
@@ -0,0 +1,142 @@
+================================
+Landlock: userland documentation
+================================
+
+Landlock programs
+=================
+
+eBPF programs are used to create security programs.  They are contained and can
+call only a whitelist of dedicated functions. Moreover, they can only loop
+under strict conditions, which protects from denial of service.  More
+information on BPF can be found in *Documentation/networking/filter.txt*.
+
+
+Writing a program
+-----------------
+
+To enforce a security policy, a thread first needs to create a Landlock
+program.  The easiest way to write an eBPF program depicting a security program
+is to write it in the C language.  As described in *samples/bpf/README.rst*,
+LLVM can compile such programs.  A simple eBPF program can also be written by
+hand has done in *tools/testing/selftests/landlock/*.
+
+Once the eBPF program is created, the next step is to create the metadata
+describing the Landlock program.  This metadata includes an expected attach
+type which contains the hook type to which the program is tied.
+
+A hook is a policy decision point which exposes the same context type for
+each program evaluation.
+
+A Landlock hook describes the kind of kernel object for which a program will be
+triggered to allow or deny an action.  For example, the hook
+BPF_LANDLOCK_PTRACE can be triggered every time a landlocked thread performs a
+set of action related to debugging (cf. :manpage:`ptrace(2)`) or if the kernel
+needs to know if a process manipulation requested by something else is
+legitimate.
+
+The next step is to fill a :c:type:`struct bpf_load_program_attr
+<bpf_load_program_attr>` with BPF_PROG_TYPE_LANDLOCK_HOOK, the expected attach
+type and other BPF program metadata.  This bpf_attr must then be passed to the
+:manpage:`bpf(2)` syscall alongside the BPF_PROG_LOAD command.  If everything
+is deemed correct by the kernel, the thread gets a file descriptor referring to
+this program.
+
+In the following code, the *insn* variable is an array of BPF instructions
+which can be extracted from an ELF file as is done in bpf_load_file() from
+*samples/bpf/bpf_load.c*.
+
+.. code-block:: c
+
+    int prog_fd;
+    struct bpf_load_program_attr load_attr;
+
+    memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
+    load_attr.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK;
+    load_attr.expected_attach_type = BPF_LANDLOCK_PTRACE;
+    load_attr.insns = insns;
+    load_attr.insns_cnt = sizeof(insn) / sizeof(struct bpf_insn);
+    load_attr.license = "GPL";
+
+    prog_fd = bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
+    if (prog_fd == -1)
+        exit(1);
+
+
+Enforcing a program
+-------------------
+
+Once the Landlock program has been created or received (e.g. through a UNIX
+socket), the thread willing to sandbox itself (and its future children) should
+perform the following two steps.
+
+The thread should first request to never be allowed to get new privileges with a
+call to :manpage:`prctl(2)` and the PR_SET_NO_NEW_PRIVS option.  More
+information can be found in *Documentation/prctl/no_new_privs.txt*.
+
+.. code-block:: c
+
+    if (prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0))
+        exit(1);
+
+A thread can apply a program to itself by using the :manpage:`seccomp(2)` syscall.
+The operation is SECCOMP_PREPEND_LANDLOCK_PROG, the flags must be empty and the
+*args* argument must point to a valid Landlock program file descriptor.
+
+.. code-block:: c
+
+    if (seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, &fd))
+        exit(1);
+
+If the syscall succeeds, the program is now enforced on the calling thread and
+will be enforced on all its subsequently created children of the thread as
+well.  Once a thread is landlocked, there is no way to remove this security
+policy, only stacking more restrictions is allowed.  The program evaluation is
+performed from the newest to the oldest.
+
+When a syscall ask for an action on a kernel object, if this action is denied,
+then an EACCES errno code is returned through the syscall.
+
+
+.. _inherited_programs:
+
+Inherited programs
+------------------
+
+Every new thread resulting from a :manpage:`clone(2)` inherits Landlock program
+restrictions from its parent.  This is similar to the seccomp inheritance as
+described in *Documentation/prctl/seccomp_filter.txt* or any other LSM dealing
+with task's :manpage:`credentials(7)`.
+
+
+Ptrace restrictions
+-------------------
+
+A sandboxed process has less privileges than a non-sandboxed process and must
+then be subject to additional restrictions when manipulating another process.
+To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
+process, a sandboxed process should have a subset of the target process
+programs.  This security policy can easily be implemented like in
+*tools/testing/selftests/landlock/test_ptrace.c*.
+
+
+Landlock structures and constants
+=================================
+
+Contexts
+--------
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :functions: landlock_context_ptrace
+
+
+Return types
+------------
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :functions: landlock_ret
+
+
+Additional documentation
+========================
+
+See https://landlock.io
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 0/7] Landlock LSM
From: Mickaël Salaün @ 2019-10-29 17:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module

Hi,

This eleventh series is a major rework of the previous series [1] to
make the patches smaller while still putting in place the foundations of
Landlock and implementing a useful feature.  The whole file-system
support (i.e. inode map and program triggers) has been removed (but is
still planed in the future).  This series rewrite the previous static
ptrace restrictions with a programmatic one thanks to eBPF.  To be more
independent from seccomp, Landlock is now a full LSM using task's
credentials thanks to the LSM stacking infrastructure.  The only part of
seccomp still used is the syscall, which makes sense and which is a
quite simple interface.  The clear definition of Landlock domains (sets
of eBPF programs) can be used for other use-cases than sandboxing e.g.,
to implement a system-wide security signaling framework as described by
KRSI [2].  In addition to improvements and bug fixes, I split the
patches as much as possible to ease the review process.

As discussed at LSS NA [3] (with Kees Cook, James Morris, KP Singh and
Florent Revest) and at Kernel Recipes (with Alexei Starovoitov), I
planned to shrink the code of Landlock to a bare minimum to enable
incremental feature integration.  The idea was to create a "memory
protection" hook and the appropriate eBPF program type.  After some
experimentations, I concluded that it is not easy to implement a simple
interface to control actions such as mmap(2) or mprotect(2).  I then
focused on an old stable feature of Landlock: ptrace protection.  But
instead of keeping the static security policy, which make sense in a
sandboxing scenario, I extended Landlock with a ptrace hook and the
appropriate eBPF program type.  The ptrace enforcement is not mandatory
anymore but this new hook could be used (and extended with new helpers)
for a security signaling mechanism such as KRSI.  I hope the KRSI
developments could take advantage of this new Landlock version.

This is the first step of the roadmap discussed at LPC [4].  While the
intended final goal is to allow unprivileged users to use Landlock, this
series allows only a process with global CAP_SYS_ADMIN to load and
enforce a rule.  This may help to get feedback and avoid unexpected
behaviors.

This series can be applied on top of bpf-next, commit e93d99180abd
("selftests/bpf: Restore $(OUTPUT)/test_stub.o rule").  This can be
tested with CONFIG_BPF_SYSCALL, CONFIG_SECCOMP_FILTER and
CONFIG_SECURITY_LANDLOCK.  This patch series can be found in a Git
repository here:
https://github.com/landlock-lsm/linux/commits/landlock-v11
I would really appreciate constructive comments on the design and the
code.

# Landlock LSM

Landlock is a stackable LSM [5] intended to be used as a low-level
framework to build custom access-control/audit systems or safe endpoint
security agents.  There is currently one Landlock hook dedicated to
check ptrace(2).  This hook accepts a dedicated eBPF program, called a
Landlock program, which can currently compare its position in the
hierarchy of similar programs tied to other processes.  This enables to
enforce programmatic scoped ptrace restrictions.

The final goal of this new Linux Security Module (LSM) called Landlock
is to allow any process, including unprivileged ones, to create powerful
security sandboxes comparable to XNU Sandbox, FreeBSD Capsicum or
OpenBSD Pledge (which could be implemented with Landlock).  This kind of
sandbox is expected to help mitigate the security impact of bugs or
unexpected/malicious behaviors in user-space applications.

The use of seccomp and Landlock is more suitable with the help of a
user-space library (e.g.  libseccomp) that could help to specify a
high-level language to express a security policy instead of raw eBPF
programs.  Moreover, thanks to the LLVM front-end, it is quite easy to
write an eBPF program with a subset of the C language.

The documentation patch contains some kernel documentation, explanations
on how to use Landlock and a FAQ.  The compiled documentation and some
talks can be found here: https://landlock.io

# Frequently asked questions

## Why is seccomp-bpf not enough?

A seccomp filter can access only raw syscall arguments (i.e. the
register values) which means that it is not possible to filter according
to the value pointed to by an argument, such as a file pathname. As an
embryonic Landlock version demonstrated (i.e. seccomp-object), filtering
at the syscall level is complicated (e.g. need to take care of race
conditions). This is mainly because the access control checkpoints of
the kernel are not at this high-level but more underneath, at the
LSM-hook level. The LSM hooks are designed to handle this kind of
checks.  Landlock abstracts this approach to leverage the ability of
unprivileged users to limit themselves.

Cf. section "What it isn't?" in
Documentation/userspace-api/seccomp_filter.rst

## Why use the seccomp(2) syscall?

Landlock use the same semantic as seccomp to apply access rule
restrictions. It add a new layer of security for the current process
which is inherited by its children. It makes sense to use an unique
access-restricting syscall (that should be allowed by seccomp filters)
which can only drop privileges. Moreover, a Landlock rule could come
from outside a process (e.g.  passed through a UNIX socket). It is then
useful to differentiate the creation/load of Landlock eBPF programs via
bpf(2), from rule enforcement via seccomp(2).

## Why a new LSM? Are SELinux, AppArmor, Smack and Tomoyo not good
   enough?

The current access control LSMs are fine for their purpose which is to
give the *root* the ability to enforce a security policy for the
*system*. What is missing is a way to enforce a security policy for any
application by its developer and *unprivileged user* as seccomp can do
for raw syscall filtering.

Differences from other (access control) LSMs:
* not only dedicated to administrators (i.e. no_new_priv);
* limited kernel attack surface (e.g. policy parsing);
* constrained policy rules (no DoS: deterministic execution time);
* do not leak more information than the loader process can legitimately
  have access to (minimize metadata inference).

# Changes since v10

* remove all the file system related features: program types, inode
  map and expected_attach_triggers
* replace the static ptrace security policy with a new and simpler
  ptrace program (attached) type and a task_landlock_ptrace_ancestor()
  eBPF helper
* do not rely on seccomp internal structure but use stacked credentials
  insdead
* extend ptrace tests
* add more documentation
* split and rename files/patches
* miscellaneous fixes

Previous changes can be found in the previous cover-letter [1].

[1] https://lore.kernel.org/lkml/20190721213116.23476-1-mic@digikod.net/
[2] https://lore.kernel.org/lkml/20190910115527.5235-1-kpsingh@chromium.org/
[3] https://lwn.net/Articles/798157/
[4] https://lore.kernel.org/lkml/5828776A.1010104@digikod.net/
[5] https://lore.kernel.org/lkml/50db058a-7dde-441b-a7f9-f6837fe8b69f@schaufler-ca.com/

Regards,

Mickaël Salaün (7):
  bpf,landlock: Define an eBPF program type for Landlock hooks
  landlock: Add the management of domains
  landlock,seccomp: Load Landlock programs per process hierarchy
  landlock: Add ptrace LSM hooks
  bpf,landlock: Add task_landlock_ptrace_ancestor() helper
  bpf,landlock: Add tests for the Landlock ptrace program type
  landlock: Add user and kernel documentation for Landlock

 Documentation/security/index.rst              |   1 +
 Documentation/security/landlock/index.rst     |  22 ++
 Documentation/security/landlock/kernel.rst    | 139 +++++++++
 Documentation/security/landlock/user.rst      | 142 ++++++++++
 MAINTAINERS                                   |   9 +
 include/linux/bpf.h                           |   3 +
 include/linux/bpf_types.h                     |   3 +
 include/linux/landlock.h                      |  25 ++
 include/linux/lsm_hooks.h                     |   1 +
 include/uapi/linux/bpf.h                      |  23 +-
 include/uapi/linux/landlock.h                 |  39 +++
 include/uapi/linux/seccomp.h                  |   1 +
 kernel/bpf/syscall.c                          |   9 +
 kernel/bpf/verifier.c                         |  11 +
 kernel/seccomp.c                              |   4 +
 scripts/bpf_helpers_doc.py                    |   1 +
 security/Kconfig                              |   1 +
 security/Makefile                             |   2 +
 security/landlock/Kconfig                     |  19 ++
 security/landlock/Makefile                    |   6 +
 security/landlock/bpf_ptrace.c                |  98 +++++++
 security/landlock/bpf_ptrace.h                |  17 ++
 security/landlock/bpf_run.c                   |  62 ++++
 security/landlock/bpf_run.h                   |  25 ++
 security/landlock/bpf_verify.c                |  87 ++++++
 security/landlock/common.h                    |  84 ++++++
 security/landlock/domain_manage.c             | 265 ++++++++++++++++++
 security/landlock/domain_manage.h             |  23 ++
 security/landlock/domain_syscall.c            |  87 ++++++
 security/landlock/hooks_cred.c                |  47 ++++
 security/landlock/hooks_cred.h                |  14 +
 security/landlock/hooks_ptrace.c              | 114 ++++++++
 security/landlock/hooks_ptrace.h              |  19 ++
 security/landlock/init.c                      |  32 +++
 security/security.c                           |  15 +
 tools/include/uapi/linux/bpf.h                |  23 +-
 tools/include/uapi/linux/landlock.h           |  22 ++
 tools/lib/bpf/libbpf_probes.c                 |   3 +
 tools/testing/selftests/bpf/config            |   3 +
 tools/testing/selftests/bpf/test_verifier.c   |   1 +
 .../testing/selftests/bpf/verifier/landlock.c |  56 ++++
 tools/testing/selftests/landlock/.gitignore   |   5 +
 tools/testing/selftests/landlock/Makefile     |  27 ++
 tools/testing/selftests/landlock/config       |   5 +
 tools/testing/selftests/landlock/test.h       |  48 ++++
 tools/testing/selftests/landlock/test_base.c  |  24 ++
 .../testing/selftests/landlock/test_ptrace.c  | 210 ++++++++++++++
 47 files changed, 1875 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/security/landlock/index.rst
 create mode 100644 Documentation/security/landlock/kernel.rst
 create mode 100644 Documentation/security/landlock/user.rst
 create mode 100644 include/linux/landlock.h
 create mode 100644 include/uapi/linux/landlock.h
 create mode 100644 security/landlock/Kconfig
 create mode 100644 security/landlock/Makefile
 create mode 100644 security/landlock/bpf_ptrace.c
 create mode 100644 security/landlock/bpf_ptrace.h
 create mode 100644 security/landlock/bpf_run.c
 create mode 100644 security/landlock/bpf_run.h
 create mode 100644 security/landlock/bpf_verify.c
 create mode 100644 security/landlock/common.h
 create mode 100644 security/landlock/domain_manage.c
 create mode 100644 security/landlock/domain_manage.h
 create mode 100644 security/landlock/domain_syscall.c
 create mode 100644 security/landlock/hooks_cred.c
 create mode 100644 security/landlock/hooks_cred.h
 create mode 100644 security/landlock/hooks_ptrace.c
 create mode 100644 security/landlock/hooks_ptrace.h
 create mode 100644 security/landlock/init.c
 create mode 100644 tools/include/uapi/linux/landlock.h
 create mode 100644 tools/testing/selftests/bpf/verifier/landlock.c
 create mode 100644 tools/testing/selftests/landlock/.gitignore
 create mode 100644 tools/testing/selftests/landlock/Makefile
 create mode 100644 tools/testing/selftests/landlock/config
 create mode 100644 tools/testing/selftests/landlock/test.h
 create mode 100644 tools/testing/selftests/landlock/test_base.c
 create mode 100644 tools/testing/selftests/landlock/test_ptrace.c

-- 
2.23.0

^ permalink raw reply

* [PATCH bpf-next v11 4/7] landlock: Add ptrace LSM hooks
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

Add a first Landlock hook that can be used to enforce a security policy
or to audit some process activities.  For a sandboxing use-case, it is
needed to inform the kernel if a task can legitimately debug another.
ptrace(2) can also be used by an attacker to impersonate another task
and remain undetected while performing malicious activities.

Using ptrace(2) and related features on a target process can lead to a
privilege escalation.  A sandboxed task must then be able to tell the
kernel if another task is more privileged, via ptrace_may_access().

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* revamp and replace the static policy with a Landlock hook which may be
  used by the corresponding BPF_LANDLOCK_PTRACE program (attach) type
  and a dedicated process_cmp_landlock_ptrace() BPF helper
* check prog return value against LANDLOCK_RET_DENY (ret is a bitmask)

Changes since v6:
* factor out ptrace check
* constify pointers
* cleanup headers
* use the new security_add_hooks()
---
 security/landlock/Makefile       |   4 +-
 security/landlock/bpf_run.c      |  62 +++++++++++++++++
 security/landlock/bpf_run.h      |  25 +++++++
 security/landlock/hooks_ptrace.c | 114 +++++++++++++++++++++++++++++++
 security/landlock/hooks_ptrace.h |  19 ++++++
 security/landlock/init.c         |   2 +
 6 files changed, 224 insertions(+), 2 deletions(-)
 create mode 100644 security/landlock/bpf_run.c
 create mode 100644 security/landlock/bpf_run.h
 create mode 100644 security/landlock/hooks_ptrace.c
 create mode 100644 security/landlock/hooks_ptrace.h

diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 0b291f2c027c..93e4c2f31c8a 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
 landlock-y := init.o \
-	bpf_verify.o bpf_ptrace.o \
+	bpf_verify.o bpf_run.o bpf_ptrace.o \
 	domain_manage.o domain_syscall.o \
-	hooks_cred.o
+	hooks_cred.o hooks_ptrace.o
diff --git a/security/landlock/bpf_run.c b/security/landlock/bpf_run.c
new file mode 100644
index 000000000000..8874958bdc30
--- /dev/null
+++ b/security/landlock/bpf_run.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - eBPF program evaluation
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/filter.h>
+#include <linux/rculist.h>
+#include <uapi/linux/landlock.h>
+
+#include "bpf_run.h"
+#include "common.h"
+#include "hooks_ptrace.h"
+
+static const void *get_prog_ctx(struct landlock_hook_ctx *hook_ctx)
+{
+	switch (hook_ctx->type) {
+	case LANDLOCK_HOOK_PTRACE:
+		return landlock_get_ctx_ptrace(hook_ctx->ctx_ptrace);
+	}
+	WARN_ON(1);
+	return NULL;
+}
+
+/**
+ * landlock_access_denied - run Landlock programs tied to a hook
+ *
+ * @domain: Landlock domain pointer
+ * @hook_ctx: non-NULL valid eBPF context pointer
+ *
+ * Return true if at least one program return deny, false otherwise.
+ */
+bool landlock_access_denied(struct landlock_domain *domain,
+		struct landlock_hook_ctx *hook_ctx)
+{
+	struct landlock_prog_list *prog_list;
+	const size_t hook = get_hook_index(hook_ctx->type);
+
+	if (!domain)
+		return false;
+
+	for (prog_list = domain->programs[hook]; prog_list;
+			prog_list = prog_list->prev) {
+		u32 ret;
+		const void *prog_ctx;
+
+		prog_ctx = get_prog_ctx(hook_ctx);
+		if (!prog_ctx || WARN_ON(IS_ERR(prog_ctx)))
+			return true;
+		rcu_read_lock();
+		ret = BPF_PROG_RUN(prog_list->prog, prog_ctx);
+		rcu_read_unlock();
+		if (ret & LANDLOCK_RET_DENY)
+			return true;
+	}
+	return false;
+}
diff --git a/security/landlock/bpf_run.h b/security/landlock/bpf_run.h
new file mode 100644
index 000000000000..3461cbb8ec12
--- /dev/null
+++ b/security/landlock/bpf_run.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - eBPF program evaluation headers
+ *
+ * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_BPF_RUN_H
+#define _SECURITY_LANDLOCK_BPF_RUN_H
+
+#include "common.h"
+#include "hooks_ptrace.h"
+
+struct landlock_hook_ctx {
+	enum landlock_hook_type type;
+	union {
+		struct landlock_hook_ctx_ptrace *ctx_ptrace;
+	};
+};
+
+bool landlock_access_denied(struct landlock_domain *domain,
+		struct landlock_hook_ctx *hook_ctx);
+
+#endif /* _SECURITY_LANDLOCK_BPF_RUN_H */
diff --git a/security/landlock/hooks_ptrace.c b/security/landlock/hooks_ptrace.c
new file mode 100644
index 000000000000..8e518a472d04
--- /dev/null
+++ b/security/landlock/hooks_ptrace.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - ptrace hooks
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/lsm_hooks.h>
+#include <linux/sched.h>
+#include <uapi/linux/landlock.h>
+
+#include "bpf_run.h"
+#include "common.h"
+#include "hooks_ptrace.h"
+
+struct landlock_hook_ctx_ptrace {
+	struct landlock_context_ptrace prog_ctx;
+};
+
+const struct landlock_context_ptrace *landlock_get_ctx_ptrace(
+		const struct landlock_hook_ctx_ptrace *hook_ctx)
+{
+	if (WARN_ON(!hook_ctx))
+		return NULL;
+
+	return &hook_ctx->prog_ctx;
+}
+
+static int check_ptrace(struct landlock_domain *domain,
+		struct task_struct *tracer, struct task_struct *tracee)
+{
+	struct landlock_hook_ctx_ptrace ctx_ptrace = {
+		.prog_ctx = {
+			.tracer = (uintptr_t)tracer,
+			.tracee = (uintptr_t)tracee,
+		},
+	};
+	struct landlock_hook_ctx hook_ctx = {
+		.type = LANDLOCK_HOOK_PTRACE,
+		.ctx_ptrace = &ctx_ptrace,
+	};
+
+	return landlock_access_denied(domain, &hook_ctx) ? -EPERM : 0;
+}
+
+/**
+ * hook_ptrace_access_check - determine whether the current process may access
+ *			      another
+ *
+ * @child: the process to be accessed
+ * @mode: the mode of attachment
+ *
+ * If the current task (i.e. tracer) has one or multiple BPF_LANDLOCK_PTRACE
+ * programs, then run them with the `struct landlock_context_ptrace` context.
+ * If one of these programs return LANDLOCK_RET_DENY, then deny access with
+ * -EPERM, else allow it by returning 0.
+ */
+static int hook_ptrace_access_check(struct task_struct *child,
+		unsigned int mode)
+{
+	struct landlock_domain *dom_current;
+	const size_t hook = get_hook_index(LANDLOCK_HOOK_PTRACE);
+
+	dom_current = landlock_cred(current_cred())->domain;
+	if (!(dom_current && dom_current->programs[hook]))
+		return 0;
+	return check_ptrace(dom_current, current, child);
+}
+
+/**
+ * hook_ptrace_traceme - determine whether another process may trace the
+ *			 current one
+ *
+ * @parent: the task proposed to be the tracer
+ *
+ * If the parent task (i.e. tracer) has one or multiple BPF_LANDLOCK_PTRACE
+ * programs, then run them with the `struct landlock_context_ptrace` context.
+ * If one of these programs return LANDLOCK_RET_DENY, then deny access with
+ * -EPERM, else allow it by returning 0.
+ */
+static int hook_ptrace_traceme(struct task_struct *parent)
+{
+	struct landlock_domain *dom_parent;
+	const size_t hook = get_hook_index(LANDLOCK_HOOK_PTRACE);
+	int ret;
+
+	rcu_read_lock();
+	dom_parent = landlock_cred(__task_cred(parent))->domain;
+	if (!(dom_parent && dom_parent->programs[hook])) {
+		ret = 0;
+		goto put_rcu;
+	}
+	ret = check_ptrace(dom_parent, parent, current);
+
+put_rcu:
+	rcu_read_unlock();
+	return ret;
+}
+
+static struct security_hook_list landlock_hooks[] = {
+	LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme),
+};
+
+__init void landlock_add_hooks_ptrace(void)
+{
+	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+			LANDLOCK_NAME);
+}
diff --git a/security/landlock/hooks_ptrace.h b/security/landlock/hooks_ptrace.h
new file mode 100644
index 000000000000..53fe651bdb3e
--- /dev/null
+++ b/security/landlock/hooks_ptrace.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - ptrace hooks headers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_HOOKS_PTRACE_H
+#define _SECURITY_LANDLOCK_HOOKS_PTRACE_H
+
+struct landlock_hook_ctx_ptrace;
+
+const struct landlock_context_ptrace *landlock_get_ctx_ptrace(
+		const struct landlock_hook_ctx_ptrace *hook_ctx);
+
+__init void landlock_add_hooks_ptrace(void);
+
+#endif /* _SECURITY_LANDLOCK_HOOKS_PTRACE_H */
diff --git a/security/landlock/init.c b/security/landlock/init.c
index 8836ec4defd3..541aad17418e 100644
--- a/security/landlock/init.c
+++ b/security/landlock/init.c
@@ -10,11 +10,13 @@
 
 #include "common.h"
 #include "hooks_cred.h"
+#include "hooks_ptrace.h"
 
 static int __init landlock_init(void)
 {
 	pr_info(LANDLOCK_NAME ": Registering hooks\n");
 	landlock_add_hooks_cred();
+	landlock_add_hooks_ptrace();
 	return 0;
 }
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH bpf-next v11 5/7] bpf,landlock: Add task_landlock_ptrace_ancestor() helper
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley, Tejun Heo, Tetsuo Handa, Tycho Andersen,
	Will Drewry, bpf, kernel-hardening, linux-api,
	linux-security-module
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

This new task_landlock_ptrace_ancestor() helper can be used to identify
if the Landlock domain tied to the current tracer is in the same
hierarchy as the domain of tracee.

Indeed, ptrace(2) can be used to impersonate an unsandboxed process and
lead to a privilege escalation.  A common use-case when sandboxing a
process is then to forbid it to debug a less-privileged process.  A
sandbox process (tracer) should only be allowed to trace another process
(tracee) if the tracee has fewer privileges than the tracer.  This
policy can be implemented with this helper.

More complex helpers could be added in the future to enable other ways
to check the relation between the tracer and the tracee.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* new patch taking inspiration from the previous static ptrace policy
---
 include/linux/bpf.h            |  2 +
 include/uapi/linux/bpf.h       | 21 ++++++++++-
 kernel/bpf/verifier.c          |  4 ++
 security/landlock/bpf_ptrace.c | 68 ++++++++++++++++++++++++++++++++++
 security/landlock/bpf_verify.c |  4 ++
 5 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819a3e207438..67ec198a90cb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -214,6 +214,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
+	ARG_PTR_TO_TASK,	/* pointer to task_struct */
 };
 
 /* type of values returned from helper functions */
@@ -1088,6 +1089,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
 extern const struct bpf_func_proto bpf_strtol_proto;
 extern const struct bpf_func_proto bpf_strtoul_proto;
 extern const struct bpf_func_proto bpf_tcp_sock_proto;
+extern const struct bpf_func_proto bpf_task_landlock_ptrace_ancestor_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6e4147790f96..c88436b97163 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2777,6 +2777,24 @@ union bpf_attr {
  * 		restricted to raw_tracepoint bpf programs.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_task_landlock_ptrace_ancestor(struct task_struct *parent, struct task_struct *child)
+ *	Description
+ *		Check the relation of a potentially parent task with a child
+ *		one, according to their Landlock ptrace hook programs.
+ *	Return
+ *		**-EINVAL** if the child's ptrace programs are not comparable
+ *		to the parent ones, i.e. one of them is an empty set.
+ *
+ *		**-ENOENT** if the parent's ptrace programs are either in a
+ *		separate hierarchy of the child ones, or if the parent's ptrace
+ *		programs are a superset of the child ones.
+ *
+ *		0 if the parent's ptrace programs are the same as the child
+ *		ones.
+ *
+ *		1 if the parent's ptrace programs are indeed a subset of the
+ *		child ones.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2890,7 +2908,8 @@ union bpf_attr {
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
 	FN(tcp_gen_syncookie),		\
-	FN(skb_output),
+	FN(skb_output),			\
+	FN(task_landlock_ptrace_ancestor),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ebf1991906b7..af8f1a777a2d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3492,6 +3492,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		    type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
 			goto err_type;
+	} else if (arg_type == ARG_PTR_TO_TASK) {
+		expected_type = PTR_TO_TASK;
+		if (type != expected_type)
+			goto err_type;
 	} else {
 		verbose(env, "unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
diff --git a/security/landlock/bpf_ptrace.c b/security/landlock/bpf_ptrace.c
index 2ec73078ad01..0e1362951463 100644
--- a/security/landlock/bpf_ptrace.c
+++ b/security/landlock/bpf_ptrace.c
@@ -7,9 +7,13 @@
  */
 
 #include <linux/bpf.h>
+#include <linux/cred.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
 #include <uapi/linux/landlock.h>
 
 #include "bpf_ptrace.h"
+#include "common.h"
 
 bool landlock_is_valid_access_ptrace(int off, enum bpf_access_type type,
 		enum bpf_reg_type *reg_type, int *max_size)
@@ -28,3 +32,67 @@ bool landlock_is_valid_access_ptrace(int off, enum bpf_access_type type,
 		return false;
 	}
 }
+
+/**
+ * domain_ptrace_ancestor - check domain ordering according to ptrace
+ *
+ * @parent: a parent domain
+ * @child: a potential child of @parent
+ *
+ * Check if the @parent domain is less or equal to (i.e. a subset of) the
+ * @child domain.
+ */
+static int domain_ptrace_ancestor(const struct landlock_domain *parent,
+		const struct landlock_domain *child)
+{
+	const struct landlock_prog_list *child_progs, *parent_progs;
+	const size_t hook = get_hook_index(LANDLOCK_HOOK_PTRACE);
+
+	if (!parent || !child)
+		/* @parent or @child has no ptrace restriction */
+		return -EINVAL;
+	parent_progs = parent->programs[hook];
+	child_progs = child->programs[hook];
+	if (!parent_progs || !child_progs)
+		/* @parent or @child has no ptrace restriction */
+		return -EINVAL;
+	if (child_progs == parent_progs)
+		/* @parent is at the same level as @child */
+		return 0;
+	for (child_progs = child_progs->prev; child_progs;
+			child_progs = child_progs->prev) {
+		if (child_progs == parent_progs)
+			/* @parent is one of the ancestors of @child */
+			return 1;
+	}
+	/*
+	 * Either there is no relationship between @parent and @child, or
+	 * @child is one of the ancestors of @parent.
+	 */
+	return -ENOENT;
+}
+
+/*
+ * Cf. include/uapi/linux/bpf.h - bpf_task_landlock_ptrace_ancestor
+ */
+BPF_CALL_2(bpf_task_landlock_ptrace_ancestor, const struct task_struct *,
+		parent, const struct task_struct *, child)
+{
+	const struct landlock_domain *dom_parent, *dom_child;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	if (WARN_ON(!parent || !child))
+		return -EFAULT;
+	dom_parent = landlock_cred(__task_cred(parent))->domain;
+	dom_child = landlock_cred(__task_cred(child))->domain;
+	return domain_ptrace_ancestor(dom_parent, dom_child);
+}
+
+const struct bpf_func_proto bpf_task_landlock_ptrace_ancestor_proto = {
+	.func		= bpf_task_landlock_ptrace_ancestor,
+	.gpl_only	= false,
+	.pkt_access	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TASK,
+	.arg2_type	= ARG_PTR_TO_TASK,
+};
diff --git a/security/landlock/bpf_verify.c b/security/landlock/bpf_verify.c
index 6ed921588178..a1d2db75d51d 100644
--- a/security/landlock/bpf_verify.c
+++ b/security/landlock/bpf_verify.c
@@ -70,6 +70,10 @@ static const struct bpf_func_proto *bpf_landlock_func_proto(
 		return &bpf_map_update_elem_proto;
 	case BPF_FUNC_map_delete_elem:
 		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_task_landlock_ptrace_ancestor:
+		if (get_hook_type(prog) == LANDLOCK_HOOK_PTRACE)
+			return &bpf_task_landlock_ptrace_ancestor_proto;
+		return NULL;
 	default:
 		return NULL;
 	}
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v10 00/25] LSM: Module stacking for AppArmor
From: Casey Schaufler @ 2019-10-29 15:51 UTC (permalink / raw)
  To: Stephen Smalley, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: keescook, john.johansen, penguin-kernel, paul, casey
In-Reply-To: <ce6c4861-2767-89ab-bad5-f633a67b3fc9@tycho.nsa.gov>

On 10/29/2019 7:53 AM, Stephen Smalley wrote:
> On 10/24/19 4:52 PM, Casey Schaufler wrote:
>> This patchset provides the changes required for
>> the AppArmor security module to stack safely with any other.
>>
>> v10: Ask the security modules if the display can be changed.
>>
>> v9: There is no version 9
>>
>> v8: Incorporate feedback from v7
>>      - Minor clean-up in display value management
>>      - refactor "compound" context creation to use a common
>>        append_ctx() function.
>>
>> v7: Incorporate feedback from v6
>>      - Make setting the display a privileged operation. The
>>        availability of compound contexts reduces the need for
>>        setting the display.
>>
>> v6: Incorporate feedback from v5
>>      - Add subj_<lsm>= and obj_<lsm>= fields to audit records
>>      - Add /proc/.../attr/context to get the full context in
>>        lsmname\0value\0... format as suggested by Simon McVittie
>>      - Add SO_PEERCONTEXT for getsockopt() to get the full context
>>        in the same format, also suggested by Simon McVittie.
>>      - Add /sys/kernel/security/lsm_display_default to provide
>>        the display default value.
>>
>> v5: Incorporate feedback from v4
>>      - Initialize the lsmcontext in security_secid_to_secctx()
>>      - Clear the lsmcontext in all security_release_secctx() cases
>>      - Don't use the "display" on strictly internal context
>>        interfaces.
>>      - The SELinux binder hooks check for cases where the context
>>        "display" isn't compatible with SELinux.
>>
>> v4: Incorporate feedback from v3
>>      - Mark new lsm_<blob>_alloc functions static
>>      - Replace the lsm and slot fields of the security_hook_list
>>        with a pointer to a LSM allocated lsm_id structure. The
>>        LSM identifies if it needs a slot explicitly. Use the
>>        lsm_id rather than make security_add_hooks return the
>>        slot value.
>>      - Validate slot values used in security.c
>>      - Reworked the "display" process attribute handling so that
>>        it works right and doesn't use goofy list processing.
>>      - fix display value check in dentry_init_security
>>      - Replace audit_log of secids with '?' instead of deleting
>>        the audit log
>>
>> v3: Incorporate feedback from v2
>>      - Make lsmblob parameter and variable names more
>>        meaningful, changing "le" and "l" to "blob".
>>      - Improve consistency of constant naming.
>>      - Do more sanity checking during LSM initialization.
>>      - Be a bit clearer about what is temporary scaffolding.
>>      - Rather than clutter security_getpeersec_dgram with
>>        otherwise unnecessary checks remove the apparmor
>>        stub, which does nothing useful.
>>
>> Patche 0001 moves management of the sock security blob from the individual
>> modules to the infrastructure.
>>
>> Patches 0002-0012 replace system use of a "secid" with
>> a structure "lsmblob" containing information from the
>> security modules to be held and reused later. At this
>> point lsmblob contains an array of u32 secids, one "slot"
>> for each of the security modules compiled into the
>> kernel that used secids. A "slot" is allocated when
>> a security module requests one.
>> The infrastructure is changed to use the slot number
>> to pass the correct secid to or from the security module
>> hooks.
>>
>> It is important that the lsmblob be a fixed size entity
>> that does not have to be allocated. Several of the places
>> where it is used would have performance and/or locking
>> issues with dynamic allocation.
>>
>> Patch 0013 provides a mechanism for a process to
>> identify which security module's hooks should be used
>> when displaying or converting a security context string.
>> A new interface /proc/.../attr/display contains the name
>> of the security module to show. Reading from this file
>> will present the name of the module, while writing to
>> it will set the value. Only names of active security
>> modules are accepted. Internally, the name is translated
>> to the appropriate "slot" number for the module which
>> is then stored in the task security blob. Setting the
>> display requires that all modules using the /proc interfaces
>> allow the transition.
>>
>> Patch 0014 Starts the process of changing how a security
>> context is represented. Since it is possible for a
>> security context to have been generated by more than one
>> security module it is now necessary to note which module
>> created a security context so that the correct "release"
>> hook can be called. There are several places where the
>> module that created a security context cannot be inferred.
>>
>> This is achieved by introducing a "lsmcontext" structure
>> which contains the context string, its length and the
>> "slot" number of the security module that created it.
>> The security_release_secctx() interface is changed,
>> replacing the (string,len) pointer pair with a lsmcontext
>> pointer.
>>
>> Patches 0015-0017 convert the security interfaces from
>> (string,len) pointer pairs to a lsmcontext pointer.
>> The slot number identifying the creating module is
>> added by the infrastructure. Where the security context
>> is stored for extended periods the data type is changed.
>>
>> The Netlabel code is converted to save lsmblob structures
>> instead of secids in Patches 0018-0019.
>>
>> Patch 0020 adds checks to the binder hooks which verify
>> that if both ends of a transaction use the same "display".
>>
>> Patches 0021-0022 add addition data to the audit records
>> to identify the LSM specific data for all active modules.
>>
>> Patches 0023-0024 add new interfaces for getting the
>> compound security contexts.
>>
>> Finally, with all interference on the AppArmor hooks
>> removed, Patch 0025 removes the exclusive bit from
>> AppArmor. An unnecessary stub hook was also removed.
>>
>> The Ubuntu project is using an earlier version of
>> this patchset in their distribution to enable stacking
>> for containers.
>>
>> Performance measurements to date have the change
>> within the "noise". The sockperf and dbench results
>> are on the order of 0.2% to 0.8% difference, with
>> better performance being as common as worse. The
>> benchmarks were run with AppArmor and Smack on Ubuntu.
>>
>> https://github.com/cschaufler/lsm-stacking.git#stack-5.2-v10-apparmor
>
> Can you re-base on something more recent than v5.1-rc2 (that's the base for that branch currently)?
> At present it won't even boot for me on modern Fedora.  Two key missing commits are:

Sigh. It's based on James' next-general. As it's going up through James,
and he hasn't updated that branch, I'm sort of stuck. BTW, I have a re-based
version, but don't see how to get it into my git tree without mucking up
the eventual merge.

>
> e33c1b9923775d17ad246946fe67fcb9be288677 ("apparmor: Restore Y/N in /sys for apparmor's "enabled") - without this, dbus falls over (or at least dbus-broker in Fedora)
>
> 169ce0c081cd85f78388bb6c1638c1ad7b81bde7 ("selinux: fix residual uses of current_security() for the SELinux blob") - without this, selinux ends up dereferencing something other than its own security blob after these patches
>
>>
>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>> ---
>>   arch/alpha/include/uapi/asm/socket.h    |   1 +
>>   arch/mips/include/uapi/asm/socket.h     |   1 +
>>   arch/parisc/include/uapi/asm/socket.h   |   1 +
>>   arch/sparc/include/uapi/asm/socket.h    |   1 +
>>   drivers/android/binder.c                |  24 +-
>>   fs/kernfs/dir.c                         |   5 +-
>>   fs/kernfs/inode.c                       |  35 +-
>>   fs/kernfs/kernfs-internal.h             |   3 +-
>>   fs/nfs/nfs4proc.c                       |  22 +-
>>   fs/nfsd/nfs4xdr.c                       |  20 +-
>>   fs/proc/base.c                          |   2 +
>>   include/linux/audit.h                   |   1 +
>>   include/linux/cred.h                    |   3 +-
>>   include/linux/lsm_hooks.h               |  37 +-
>>   include/linux/security.h                | 175 ++++++++--
>>   include/net/af_unix.h                   |   2 +-
>>   include/net/netlabel.h                  |   8 +-
>>   include/net/scm.h                       |  15 +-
>>   include/uapi/asm-generic/socket.h       |   1 +
>>   kernel/audit.c                          |  70 +++-
>>   kernel/audit.h                          |   9 +-
>>   kernel/audit_fsnotify.c                 |   1 +
>>   kernel/auditfilter.c                    |  10 +-
>>   kernel/auditsc.c                        | 129 ++++---
>>   kernel/cred.c                           |  12 +-
>>   net/core/sock.c                         |   7 +-
>>   net/ipv4/cipso_ipv4.c                   |   6 +-
>>   net/ipv4/ip_sockglue.c                  |  12 +-
>>   net/netfilter/nf_conntrack_netlink.c    |  20 +-
>>   net/netfilter/nf_conntrack_standalone.c |  11 +-
>>   net/netfilter/nfnetlink_queue.c         |  26 +-
>>   net/netfilter/nft_meta.c                |  13 +-
>>   net/netfilter/xt_SECMARK.c              |   5 +-
>>   net/netlabel/netlabel_kapi.c            |   6 +-
>>   net/netlabel/netlabel_unlabeled.c       |  97 +++---
>>   net/netlabel/netlabel_unlabeled.h       |   2 +-
>>   net/netlabel/netlabel_user.c            |  13 +-
>>   net/netlabel/netlabel_user.h            |   6 +-
>>   net/unix/af_unix.c                      |   6 +-
>>   net/xfrm/xfrm_policy.c                  |   2 +
>>   net/xfrm/xfrm_state.c                   |   2 +
>>   security/apparmor/include/apparmor.h    |   3 +-
>>   security/apparmor/include/net.h         |   6 +-
>>   security/apparmor/lsm.c                 | 121 ++++---
>>   security/commoncap.c                    |   7 +-
>>   security/integrity/ima/ima.h            |  14 +-
>>   security/integrity/ima/ima_api.c        |  10 +-
>>   security/integrity/ima/ima_appraise.c   |   6 +-
>>   security/integrity/ima/ima_main.c       |  36 +-
>>   security/integrity/ima/ima_policy.c     |  19 +-
>>   security/integrity/integrity_audit.c    |   1 +
>>   security/loadpin/loadpin.c              |   8 +-
>>   security/safesetid/lsm.c                |   8 +-
>>   security/security.c                     | 586 +++++++++++++++++++++++++++++---
>>   security/selinux/hooks.c                | 109 +++---
>>   security/selinux/include/classmap.h     |   2 +-
>>   security/selinux/include/objsec.h       |   5 +
>>   security/selinux/include/security.h     |   1 +
>>   security/selinux/netlabel.c             |  25 +-
>>   security/selinux/ss/services.c          |   4 +-
>>   security/smack/smack.h                  |   6 +
>>   security/smack/smack_lsm.c              | 124 ++++---
>>   security/smack/smack_netfilter.c        |   8 +-
>>   security/smack/smackfs.c                |  10 +-
>>   security/tomoyo/tomoyo.c                |   8 +-
>>   security/yama/yama_lsm.c                |   7 +-
>>   66 files changed, 1376 insertions(+), 580 deletions(-)
>>
>

^ permalink raw reply

* Re: [PATCH v10 13/25] LSM: Specify which LSM to display
From: Casey Schaufler @ 2019-10-29 15:44 UTC (permalink / raw)
  To: Simon McVittie
  Cc: casey.schaufler, jmorris, linux-security-module, selinux,
	keescook, john.johansen, penguin-kernel, paul, sds, casey
In-Reply-To: <20191029144408.GA26815@horizon>

On 10/29/2019 7:44 AM, Simon McVittie wrote:
> On Thu, 24 Oct 2019 at 13:52:16 -0700, Casey Schaufler wrote:
>> Create a new entry "display" in /proc/.../attr for controlling
>> which LSM security information is displayed for a process.
> It still isn't immediately obvious to me from the commit message whether
> the "..." stands for the pid of the process that will read LSM information,
> or the pid of the process whose LSM information will be read.

For all practical purposes "..." will be "self". You can read the
attr/display of another process, but I don't know where that would
be useful. You can't write to the attr/display of an different process.

>
> I believe the intended meaning was the former? So perhaps
>
>     Create a new entry "display" in /proc/$reader/attr that controls
>     which LSM security information will be displayed when the process
>     $reader reads LSM information.
>
>     (Note that when $reader reads /proc/$subject/attr/current for
>     $reader != $subject, it is /proc/$reader/attr/display that controls
>     what is displayed there, not /proc/$subject/attr/display.)
>
> The commit that introduces /proc/.../attr/context could probably
> benefit from similar treatment - maybe it could be referred to as
> /proc/$subject/attr/context?

Thanks. I'll work on making it clearer.

>
>     smcv


^ permalink raw reply

* Re: [PATCH v10 00/25] LSM: Module stacking for AppArmor
From: Stephen Smalley @ 2019-10-29 14:53 UTC (permalink / raw)
  To: Casey Schaufler, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: keescook, john.johansen, penguin-kernel, paul
In-Reply-To: <20191024205228.6922-1-casey@schaufler-ca.com>

On 10/24/19 4:52 PM, Casey Schaufler wrote:
> This patchset provides the changes required for
> the AppArmor security module to stack safely with any other.
> 
> v10: Ask the security modules if the display can be changed.
> 
> v9: There is no version 9
> 
> v8: Incorporate feedback from v7
>      - Minor clean-up in display value management
>      - refactor "compound" context creation to use a common
>        append_ctx() function.
> 
> v7: Incorporate feedback from v6
>      - Make setting the display a privileged operation. The
>        availability of compound contexts reduces the need for
>        setting the display.
> 
> v6: Incorporate feedback from v5
>      - Add subj_<lsm>= and obj_<lsm>= fields to audit records
>      - Add /proc/.../attr/context to get the full context in
>        lsmname\0value\0... format as suggested by Simon McVittie
>      - Add SO_PEERCONTEXT for getsockopt() to get the full context
>        in the same format, also suggested by Simon McVittie.
>      - Add /sys/kernel/security/lsm_display_default to provide
>        the display default value.
> 
> v5: Incorporate feedback from v4
>      - Initialize the lsmcontext in security_secid_to_secctx()
>      - Clear the lsmcontext in all security_release_secctx() cases
>      - Don't use the "display" on strictly internal context
>        interfaces.
>      - The SELinux binder hooks check for cases where the context
>        "display" isn't compatible with SELinux.
> 
> v4: Incorporate feedback from v3
>      - Mark new lsm_<blob>_alloc functions static
>      - Replace the lsm and slot fields of the security_hook_list
>        with a pointer to a LSM allocated lsm_id structure. The
>        LSM identifies if it needs a slot explicitly. Use the
>        lsm_id rather than make security_add_hooks return the
>        slot value.
>      - Validate slot values used in security.c
>      - Reworked the "display" process attribute handling so that
>        it works right and doesn't use goofy list processing.
>      - fix display value check in dentry_init_security
>      - Replace audit_log of secids with '?' instead of deleting
>        the audit log
> 
> v3: Incorporate feedback from v2
>      - Make lsmblob parameter and variable names more
>        meaningful, changing "le" and "l" to "blob".
>      - Improve consistency of constant naming.
>      - Do more sanity checking during LSM initialization.
>      - Be a bit clearer about what is temporary scaffolding.
>      - Rather than clutter security_getpeersec_dgram with
>        otherwise unnecessary checks remove the apparmor
>        stub, which does nothing useful.
> 
> Patche 0001 moves management of the sock security blob from the individual
> modules to the infrastructure.
> 
> Patches 0002-0012 replace system use of a "secid" with
> a structure "lsmblob" containing information from the
> security modules to be held and reused later. At this
> point lsmblob contains an array of u32 secids, one "slot"
> for each of the security modules compiled into the
> kernel that used secids. A "slot" is allocated when
> a security module requests one.
> The infrastructure is changed to use the slot number
> to pass the correct secid to or from the security module
> hooks.
> 
> It is important that the lsmblob be a fixed size entity
> that does not have to be allocated. Several of the places
> where it is used would have performance and/or locking
> issues with dynamic allocation.
> 
> Patch 0013 provides a mechanism for a process to
> identify which security module's hooks should be used
> when displaying or converting a security context string.
> A new interface /proc/.../attr/display contains the name
> of the security module to show. Reading from this file
> will present the name of the module, while writing to
> it will set the value. Only names of active security
> modules are accepted. Internally, the name is translated
> to the appropriate "slot" number for the module which
> is then stored in the task security blob. Setting the
> display requires that all modules using the /proc interfaces
> allow the transition.
> 
> Patch 0014 Starts the process of changing how a security
> context is represented. Since it is possible for a
> security context to have been generated by more than one
> security module it is now necessary to note which module
> created a security context so that the correct "release"
> hook can be called. There are several places where the
> module that created a security context cannot be inferred.
> 
> This is achieved by introducing a "lsmcontext" structure
> which contains the context string, its length and the
> "slot" number of the security module that created it.
> The security_release_secctx() interface is changed,
> replacing the (string,len) pointer pair with a lsmcontext
> pointer.
> 
> Patches 0015-0017 convert the security interfaces from
> (string,len) pointer pairs to a lsmcontext pointer.
> The slot number identifying the creating module is
> added by the infrastructure. Where the security context
> is stored for extended periods the data type is changed.
> 
> The Netlabel code is converted to save lsmblob structures
> instead of secids in Patches 0018-0019.
> 
> Patch 0020 adds checks to the binder hooks which verify
> that if both ends of a transaction use the same "display".
> 
> Patches 0021-0022 add addition data to the audit records
> to identify the LSM specific data for all active modules.
> 
> Patches 0023-0024 add new interfaces for getting the
> compound security contexts.
> 
> Finally, with all interference on the AppArmor hooks
> removed, Patch 0025 removes the exclusive bit from
> AppArmor. An unnecessary stub hook was also removed.
> 
> The Ubuntu project is using an earlier version of
> this patchset in their distribution to enable stacking
> for containers.
> 
> Performance measurements to date have the change
> within the "noise". The sockperf and dbench results
> are on the order of 0.2% to 0.8% difference, with
> better performance being as common as worse. The
> benchmarks were run with AppArmor and Smack on Ubuntu.
> 
> https://github.com/cschaufler/lsm-stacking.git#stack-5.2-v10-apparmor

Can you re-base on something more recent than v5.1-rc2 (that's the base 
for that branch currently)?
At present it won't even boot for me on modern Fedora.  Two key missing 
commits are:

e33c1b9923775d17ad246946fe67fcb9be288677 ("apparmor: Restore Y/N in /sys 
for apparmor's "enabled") - without this, dbus falls over (or at least 
dbus-broker in Fedora)

169ce0c081cd85f78388bb6c1638c1ad7b81bde7 ("selinux: fix residual uses of 
current_security() for the SELinux blob") - without this, selinux ends 
up dereferencing something other than its own security blob after these 
patches

> 
> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
> ---
>   arch/alpha/include/uapi/asm/socket.h    |   1 +
>   arch/mips/include/uapi/asm/socket.h     |   1 +
>   arch/parisc/include/uapi/asm/socket.h   |   1 +
>   arch/sparc/include/uapi/asm/socket.h    |   1 +
>   drivers/android/binder.c                |  24 +-
>   fs/kernfs/dir.c                         |   5 +-
>   fs/kernfs/inode.c                       |  35 +-
>   fs/kernfs/kernfs-internal.h             |   3 +-
>   fs/nfs/nfs4proc.c                       |  22 +-
>   fs/nfsd/nfs4xdr.c                       |  20 +-
>   fs/proc/base.c                          |   2 +
>   include/linux/audit.h                   |   1 +
>   include/linux/cred.h                    |   3 +-
>   include/linux/lsm_hooks.h               |  37 +-
>   include/linux/security.h                | 175 ++++++++--
>   include/net/af_unix.h                   |   2 +-
>   include/net/netlabel.h                  |   8 +-
>   include/net/scm.h                       |  15 +-
>   include/uapi/asm-generic/socket.h       |   1 +
>   kernel/audit.c                          |  70 +++-
>   kernel/audit.h                          |   9 +-
>   kernel/audit_fsnotify.c                 |   1 +
>   kernel/auditfilter.c                    |  10 +-
>   kernel/auditsc.c                        | 129 ++++---
>   kernel/cred.c                           |  12 +-
>   net/core/sock.c                         |   7 +-
>   net/ipv4/cipso_ipv4.c                   |   6 +-
>   net/ipv4/ip_sockglue.c                  |  12 +-
>   net/netfilter/nf_conntrack_netlink.c    |  20 +-
>   net/netfilter/nf_conntrack_standalone.c |  11 +-
>   net/netfilter/nfnetlink_queue.c         |  26 +-
>   net/netfilter/nft_meta.c                |  13 +-
>   net/netfilter/xt_SECMARK.c              |   5 +-
>   net/netlabel/netlabel_kapi.c            |   6 +-
>   net/netlabel/netlabel_unlabeled.c       |  97 +++---
>   net/netlabel/netlabel_unlabeled.h       |   2 +-
>   net/netlabel/netlabel_user.c            |  13 +-
>   net/netlabel/netlabel_user.h            |   6 +-
>   net/unix/af_unix.c                      |   6 +-
>   net/xfrm/xfrm_policy.c                  |   2 +
>   net/xfrm/xfrm_state.c                   |   2 +
>   security/apparmor/include/apparmor.h    |   3 +-
>   security/apparmor/include/net.h         |   6 +-
>   security/apparmor/lsm.c                 | 121 ++++---
>   security/commoncap.c                    |   7 +-
>   security/integrity/ima/ima.h            |  14 +-
>   security/integrity/ima/ima_api.c        |  10 +-
>   security/integrity/ima/ima_appraise.c   |   6 +-
>   security/integrity/ima/ima_main.c       |  36 +-
>   security/integrity/ima/ima_policy.c     |  19 +-
>   security/integrity/integrity_audit.c    |   1 +
>   security/loadpin/loadpin.c              |   8 +-
>   security/safesetid/lsm.c                |   8 +-
>   security/security.c                     | 586 +++++++++++++++++++++++++++++---
>   security/selinux/hooks.c                | 109 +++---
>   security/selinux/include/classmap.h     |   2 +-
>   security/selinux/include/objsec.h       |   5 +
>   security/selinux/include/security.h     |   1 +
>   security/selinux/netlabel.c             |  25 +-
>   security/selinux/ss/services.c          |   4 +-
>   security/smack/smack.h                  |   6 +
>   security/smack/smack_lsm.c              | 124 ++++---
>   security/smack/smack_netfilter.c        |   8 +-
>   security/smack/smackfs.c                |  10 +-
>   security/tomoyo/tomoyo.c                |   8 +-
>   security/yama/yama_lsm.c                |   7 +-
>   66 files changed, 1376 insertions(+), 580 deletions(-)
> 


^ permalink raw reply

* Re: [PATCH v10 13/25] LSM: Specify which LSM to display
From: Simon McVittie @ 2019-10-29 14:44 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: casey.schaufler, jmorris, linux-security-module, selinux,
	keescook, john.johansen, penguin-kernel, paul, sds
In-Reply-To: <20191024205228.6922-14-casey@schaufler-ca.com>

On Thu, 24 Oct 2019 at 13:52:16 -0700, Casey Schaufler wrote:
> Create a new entry "display" in /proc/.../attr for controlling
> which LSM security information is displayed for a process.

It still isn't immediately obvious to me from the commit message whether
the "..." stands for the pid of the process that will read LSM information,
or the pid of the process whose LSM information will be read.

I believe the intended meaning was the former? So perhaps

    Create a new entry "display" in /proc/$reader/attr that controls
    which LSM security information will be displayed when the process
    $reader reads LSM information.

    (Note that when $reader reads /proc/$subject/attr/current for
    $reader != $subject, it is /proc/$reader/attr/display that controls
    what is displayed there, not /proc/$subject/attr/display.)

The commit that introduces /proc/.../attr/context could probably
benefit from similar treatment - maybe it could be referred to as
/proc/$subject/attr/context?

    smcv

^ permalink raw reply

* Re: [PATCH] KEYS: trusted: Remove set but not used variable 'keyhndl'
From: Jarkko Sakkinen @ 2019-10-29 14:18 UTC (permalink / raw)
  To: zhengbin
  Cc: dhowells, jmorris, serge, sumit.garg, keyrings,
	linux-security-module, yi.zhang
In-Reply-To: <20191029141637.GA7415@linux.intel.com>

On Tue, Oct 29, 2019 at 04:16:37PM +0200, Jarkko Sakkinen wrote:
> On Tue, Oct 29, 2019 at 05:33:32PM +0800, zhengbin wrote:
> > Fixes gcc '-Wunused-but-set-variable' warning:
> > 
> > security/keys/trusted-keys/trusted_tpm1.c: In function tpm_unseal:
> > security/keys/trusted-keys/trusted_tpm1.c:588:11: warning: variable keyhndl set but not used [-Wunused-but-set-variable]
> > 
> > It is introduced by commit 00aa975bd031 ("KEYS: trusted:
> > Create trusted keys subsystem"), but never used, so remove it.
> > 
> > Reported-by: Hulk Robot <hulkci@huawei.com>
> > Signed-off-by: zhengbin <zhengbin13@huawei.com>
> 
> Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> 
> I'll pick this to my tree.

Please use fixes tag next time instead of ad hoc.

Anyway, applied, thanks.

/Jarkko

^ permalink raw reply

* Re: [PATCH] KEYS: trusted: Remove set but not used variable 'keyhndl'
From: Jarkko Sakkinen @ 2019-10-29 14:16 UTC (permalink / raw)
  To: zhengbin
  Cc: dhowells, jmorris, serge, sumit.garg, keyrings,
	linux-security-module, yi.zhang
In-Reply-To: <1572341612-31893-1-git-send-email-zhengbin13@huawei.com>

On Tue, Oct 29, 2019 at 05:33:32PM +0800, zhengbin wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> security/keys/trusted-keys/trusted_tpm1.c: In function tpm_unseal:
> security/keys/trusted-keys/trusted_tpm1.c:588:11: warning: variable keyhndl set but not used [-Wunused-but-set-variable]
> 
> It is introduced by commit 00aa975bd031 ("KEYS: trusted:
> Create trusted keys subsystem"), but never used, so remove it.
> 
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: zhengbin <zhengbin13@huawei.com>

Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>

I'll pick this to my tree.

/Jarkko

^ permalink raw reply

* Re: [PATCH v23 12/24] x86/sgx: Linux Enclave Driver
From: Jarkko Sakkinen @ 2019-10-29  9:29 UTC (permalink / raw)
  To: linux-kernel, x86, linux-sgx
  Cc: akpm, dave.hansen, sean.j.christopherson, nhorman, npmccallum,
	serge.ayoun, shay.katz-zamir, haitao.huang, andriy.shevchenko,
	tglx, kai.svahn, bp, josh, luto, kai.huang, rientjes, cedric.xing,
	puiterwijk, linux-security-module, Suresh Siddha
In-Reply-To: <20191028210324.12475-13-jarkko.sakkinen@linux.intel.com>

On Mon, Oct 28, 2019 at 11:03:12PM +0200, Jarkko Sakkinen wrote:
> +/**
> + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
> + * @encl:       pointer to an enclave instance (via ioctl() file pointer)
> + * @arg:	a user pointer to a struct sgx_enclave_add_pages instance
> + *
> + * Add (EADD) one or more pages to an uninitialized enclave, and optionally
> + * extend (EEXTEND) the measurement with the contents of the page. The range of
> + * pages must be virtually contiguous. The SECINFO and measurement mask are
> + * applied to all pages, i.e. pages with different properties must be added in
> + * separate calls.
> + *
> + * A SECINFO for a TCS is required to always contain zero permissions because
> + * CPU silently zeros them. Allowing anything else would cause a mismatch in
> + * the measurement.
> + *
> + * mmap()'s protection bits are capped by the page permissions. For each page
> + * address, the maximum protection bits are computed with the following
> + * heuristics:
> + *
> + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
> + * 2. A TCS page: PROT_R | PROT_W.
> + * 3. No page: PROT_NONE.
> + *
> + * mmap() is not allowed to surpass the minimum of the maximum protection bits
> + * within the given address range.
> + *
> + * As stated above, a non-existent page is interpreted as a page with no
> + * permissions. In effect, this allows mmap() with PROT_NONE to be used to seek
> + * an address range for the enclave that can be then populated into SECS.
> + *
> + * @arg->addr, @arg->src and @arg->length are adjusted to reflect the
> + * remaining pages that need to be added to the enclave, e.g. userspace can
> + * re-invoke SGX_IOC_ENCLAVE_ADD_PAGES using the same struct in response to an
> + * ERESTARTSYS error.
> + *
> + * Return:
> + *   0 on success,
> + *   -EINVAL if any input param or the SECINFO contains invalid data,
> + *   -EACCES if an executable source page is located in a noexec partition,
> + *   -ENOMEM if any memory allocation, including EPC, fails,
> + *   -ERESTARTSYS if a pending signal is recognized
> + */
> +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)

This should return the number of pages processed instead of zero on
success. Kernel needs to be able to cap the amount it will process.

/Jarkko

^ permalink raw reply

* [PATCH] KEYS: trusted: Remove set but not used variable 'keyhndl'
From: zhengbin @ 2019-10-29  9:33 UTC (permalink / raw)
  To: dhowells, jmorris, serge, sumit.garg, jarkko.sakkinen, keyrings,
	linux-security-module
  Cc: yi.zhang, zhengbin13

Fixes gcc '-Wunused-but-set-variable' warning:

security/keys/trusted-keys/trusted_tpm1.c: In function tpm_unseal:
security/keys/trusted-keys/trusted_tpm1.c:588:11: warning: variable keyhndl set but not used [-Wunused-but-set-variable]

It is introduced by commit 00aa975bd031 ("KEYS: trusted:
Create trusted keys subsystem"), but never used, so remove it.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: zhengbin <zhengbin13@huawei.com>
---
 security/keys/trusted-keys/trusted_tpm1.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c
index eb5074e..d2c5ec1 100644
--- a/security/keys/trusted-keys/trusted_tpm1.c
+++ b/security/keys/trusted-keys/trusted_tpm1.c
@@ -585,7 +585,6 @@ static int tpm_unseal(struct tpm_buf *tb,
 	uint32_t authhandle2 = 0;
 	unsigned char cont = 0;
 	uint32_t ordinal;
-	uint32_t keyhndl;
 	int ret;

 	/* sessions for unsealing key and data */
@@ -601,7 +600,6 @@ static int tpm_unseal(struct tpm_buf *tb,
 	}

 	ordinal = htonl(TPM_ORD_UNSEAL);
-	keyhndl = htonl(SRKHANDLE);
 	ret = tpm_get_random(chip, nonceodd, TPM_NONCE_SIZE);
 	if (ret != TPM_NONCE_SIZE) {
 		pr_info("trusted_key: tpm_get_random failed (%d)\n", ret);
--
2.7.4


^ permalink raw reply related

* [PATCH v23 15/24] x86/sgx: Add provisioning
From: Jarkko Sakkinen @ 2019-10-28 21:03 UTC (permalink / raw)
  To: linux-kernel, x86, linux-sgx
  Cc: akpm, dave.hansen, sean.j.christopherson, nhorman, npmccallum,
	serge.ayoun, shay.katz-zamir, haitao.huang, andriy.shevchenko,
	tglx, kai.svahn, bp, josh, luto, kai.huang, rientjes, cedric.xing,
	puiterwijk, Jarkko Sakkinen, linux-security-module
In-Reply-To: <20191028210324.12475-1-jarkko.sakkinen@linux.intel.com>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=a, Size: 6385 bytes --]

In order to provide a mechanism for devilering provisoning rights:

1. Add a new device file /dev/sgx/provision that works as a token for
   allowing an enclave to have the provisioning privileges.
2. Add a new ioctl called SGX_IOC_ENCLAVE_SET_ATTRIBUTE that accepts the
   following data structure:

   struct sgx_enclave_set_attribute {
           __u64 addr;
           __u64 attribute_fd;
   };

A daemon could sit on top of /dev/sgx/provision and send a file
descriptor of this file to a process that needs to be able to provision
enclaves.

The way this API is used is straight-forward. Lets assume that dev_fd is
a handle to /dev/sgx/enclave and prov_fd is a handle to
/dev/sgx/provision.  You would allow SGX_IOC_ENCLAVE_CREATE to
initialize an enclave with the PROVISIONKEY attribute by

params.addr = <enclave address>;
params.token_fd = prov_fd;

ioctl(dev_fd, SGX_IOC_ENCLAVE_SET_ATTRIBUTE, &params);

Cc: linux-security-module@vger.kernel.org
Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/include/uapi/asm/sgx.h  | 11 ++++++++
 arch/x86/kernel/cpu/sgx/driver.c | 23 +++++++++++++++-
 arch/x86/kernel/cpu/sgx/driver.h |  2 ++
 arch/x86/kernel/cpu/sgx/ioctl.c  | 47 ++++++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index e0f79ebdfd8a..e7b20f1e41b3 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -25,6 +25,8 @@ enum sgx_page_flags {
 	_IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
 #define SGX_IOC_ENCLAVE_INIT \
 	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
+#define SGX_IOC_ENCLAVE_SET_ATTRIBUTE \
+	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_set_attribute)
 
 /**
  * struct sgx_enclave_create - parameter structure for the
@@ -61,4 +63,13 @@ struct sgx_enclave_init {
 	__u64 sigstruct;
 };
 
+/**
+ * struct sgx_enclave_set_attribute - parameter structure for the
+ *				      %SGX_IOC_ENCLAVE_SET_ATTRIBUTE ioctl
+ * @attribute_fd:	file handle of the attribute file in the securityfs
+ */
+struct sgx_enclave_set_attribute {
+	__u64 attribute_fd;
+};
+
 #endif /* _UAPI_ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index c724dcccf2e2..4d996463b213 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -141,12 +141,18 @@ static const struct file_operations sgx_encl_fops = {
 	.get_unmapped_area	= sgx_get_unmapped_area,
 };
 
+const struct file_operations sgx_provision_fops = {
+	.owner			= THIS_MODULE,
+};
+
 static struct bus_type sgx_bus_type = {
 	.name	= "sgx",
 };
 
 static struct device sgx_encl_dev;
 static struct cdev sgx_encl_cdev;
+static struct device sgx_provision_dev;
+static struct cdev sgx_provision_cdev;
 static dev_t sgx_devt;
 
 static void sgx_dev_release(struct device *dev)
@@ -223,22 +229,37 @@ int __init sgx_drv_init(void)
 	if (ret)
 		goto err_chrdev_region;
 
+	ret = sgx_dev_init("sgx/provision", &sgx_provision_dev,
+			   &sgx_provision_cdev, &sgx_provision_fops, 1);
+	if (ret)
+		goto err_encl_dev;
+
 	sgx_encl_wq = alloc_workqueue("sgx-encl-wq",
 				      WQ_UNBOUND | WQ_FREEZABLE, 1);
 	if (!sgx_encl_wq) {
 		ret = -ENOMEM;
-		goto err_encl_dev;
+		goto err_provision_dev;
 	}
 
 	ret = cdev_device_add(&sgx_encl_cdev, &sgx_encl_dev);
 	if (ret)
 		goto err_encl_wq;
 
+	ret = cdev_device_add(&sgx_provision_cdev, &sgx_provision_dev);
+	if (ret)
+		goto err_encl_cdev;
+
 	return 0;
 
+err_encl_cdev:
+	cdev_device_del(&sgx_encl_cdev, &sgx_encl_dev);
+
 err_encl_wq:
 	destroy_workqueue(sgx_encl_wq);
 
+err_provision_dev:
+	put_device(&sgx_provision_dev);
+
 err_encl_dev:
 	put_device(&sgx_encl_dev);
 
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index e95c6e86c0c6..2f13886522a8 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -25,6 +25,8 @@ extern u64 sgx_attributes_reserved_mask;
 extern u64 sgx_xfrm_reserved_mask;
 extern u32 sgx_xsave_size_tbl[64];
 
+extern const struct file_operations sgx_provision_fops;
+
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 
 int sgx_drv_init(void);
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 691efac24ed7..e71618cef90c 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -622,6 +622,50 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
 	return ret;
 }
 
+/**
+ * sgx_ioc_enclave_set_attribute - handler for %SGX_IOC_ENCLAVE_SET_ATTRIBUTE
+ * @filep:	open file to /dev/sgx
+ * @arg:	userspace pointer to a struct sgx_enclave_set_attribute instance
+ *
+ * Mark the enclave as being allowed to access a restricted attribute bit.
+ * The requested attribute is specified via the attribute_fd field in the
+ * provided struct sgx_enclave_set_attribute.  The attribute_fd must be a
+ * handle to an SGX attribute file, e.g. “/dev/sgx/provision".
+ *
+ * Failure to explicitly request access to a restricted attribute will cause
+ * sgx_ioc_enclave_init() to fail.  Currently, the only restricted attribute
+ * is access to the PROVISION_KEY.
+ *
+ * Note, access to the EINITTOKEN_KEY is disallowed entirely.
+ *
+ * Return: 0 on success, -errno otherwise
+ */
+static long sgx_ioc_enclave_set_attribute(struct sgx_encl *encl,
+					  void __user *arg)
+{
+	struct sgx_enclave_set_attribute params;
+	struct file *attribute_file;
+	int ret;
+
+	if (copy_from_user(&params, arg, sizeof(params)))
+		return -EFAULT;
+
+	attribute_file = fget(params.attribute_fd);
+	if (!attribute_file)
+		return -EINVAL;
+
+	if (attribute_file->f_op != &sgx_provision_fops) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	encl->allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+	ret = 0;
+
+out:
+	fput(attribute_file);
+	return ret;
+}
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 {
@@ -645,6 +689,9 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	case SGX_IOC_ENCLAVE_INIT:
 		ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
 		break;
+	case SGX_IOC_ENCLAVE_SET_ATTRIBUTE:
+		ret = sgx_ioc_enclave_set_attribute(encl, (void __user *)arg);
+		break;
 	default:
 		ret = -ENOIOCTLCMD;
 		break;
-- 
2.20.1


^ permalink raw reply related

* [PATCH v23 12/24] x86/sgx: Linux Enclave Driver
From: Jarkko Sakkinen @ 2019-10-28 21:03 UTC (permalink / raw)
  To: linux-kernel, x86, linux-sgx
  Cc: akpm, dave.hansen, sean.j.christopherson, nhorman, npmccallum,
	serge.ayoun, shay.katz-zamir, haitao.huang, andriy.shevchenko,
	tglx, kai.svahn, bp, josh, luto, kai.huang, rientjes, cedric.xing,
	puiterwijk, Jarkko Sakkinen, linux-security-module, Suresh Siddha
In-Reply-To: <20191028210324.12475-1-jarkko.sakkinen@linux.intel.com>

Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
can be used by applications to set aside private regions of code and
data. The code outside the SGX hosted software entity is disallowed to
access the memory inside the enclave enforced by the CPU. We call these
entities as enclaves.

This commit implements a driver that provides an ioctl API to construct
and run enclaves. Enclaves are constructed from pages residing in
reserved physical memory areas. The contents of these pages can only be
accessed when they are mapped as part of an enclave, by a hardware
thread running inside the enclave.

The starting state of an enclave consists of a fixed measured set of
pages that are copied to the EPC during the construction process by
using ENCLS leaf functions and Software Enclave Control Structure (SECS)
that defines the enclave properties.

Enclave are constructed by using ENCLS leaf functions ECREATE, EADD and
EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
the EPC and EINIT check a given signed measurement and moves the enclave
into a state ready for execution.

An initialized enclave can only be accessed through special Thread Control
Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
function converts a thread into enclave mode and continues the execution in
the offset defined by the TCS provided to EENTER. An enclave is exited
through syscall, exception, interrupts or by explicitly calling another
ENCLU leaf EEXIT.

The permissions, which enclave page is added will set the limit for maximum
permissions that can be set for mmap() and mprotect(). This will
effectively allow to build different security schemes between producers and
consumers of enclaves. Later on we can increase granularity with LSM hooks
for page addition (i.e. for producers) and mapping of the enclave (i.e. for
consumers)

Cc: linux-security-module@vger.kernel.org
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 Documentation/ioctl/ioctl-number.rst |   1 +
 arch/x86/include/uapi/asm/sgx.h      |  64 +++
 arch/x86/kernel/cpu/sgx/Makefile     |   3 +
 arch/x86/kernel/cpu/sgx/driver.c     | 252 ++++++++++
 arch/x86/kernel/cpu/sgx/driver.h     |  32 ++
 arch/x86/kernel/cpu/sgx/encl.c       | 329 ++++++++++++++
 arch/x86/kernel/cpu/sgx/encl.h       |  87 ++++
 arch/x86/kernel/cpu/sgx/ioctl.c      | 656 +++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/main.c       |  10 +
 arch/x86/kernel/cpu/sgx/reclaim.c    |   1 +
 10 files changed, 1435 insertions(+)
 create mode 100644 arch/x86/include/uapi/asm/sgx.h
 create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
 create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
 create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
 create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
 create mode 100644 arch/x86/kernel/cpu/sgx/ioctl.c

diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
index bef79cd4c6b4..f9f3ea9606fc 100644
--- a/Documentation/ioctl/ioctl-number.rst
+++ b/Documentation/ioctl/ioctl-number.rst
@@ -321,6 +321,7 @@ Code  Seq#    Include File                                           Comments
                                                                      <mailto:tlewis@mindspring.com>
 0xA3  90-9F  linux/dtlk.h
 0xA4  00-1F  uapi/linux/tee.h                                        Generic TEE subsystem
+0xA4  00-1F  uapi/asm/sgx.h                                          Intel SGX subsystem (a legit conflict as TEE and SGX do not co-exist)
 0xAA  00-3F  linux/uapi/linux/userfaultfd.h
 0xAB  00-1F  linux/nbd.h
 0xAC  00-1F  linux/raw.h
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
new file mode 100644
index 000000000000..e0f79ebdfd8a
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) WITH Linux-syscall-note */
+/*
+ * Copyright(c) 2016-19 Intel Corporation.
+ */
+#ifndef _UAPI_ASM_X86_SGX_H
+#define _UAPI_ASM_X86_SGX_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/**
+ * enum sgx_epage_flags - page control flags
+ * %SGX_PAGE_MEASURE:	Measure the page contents with a sequence of
+ *			ENCLS[EEXTEND] operations.
+ */
+enum sgx_page_flags {
+	SGX_PAGE_MEASURE	= 0x01,
+};
+
+#define SGX_MAGIC 0xA4
+
+#define SGX_IOC_ENCLAVE_CREATE \
+	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
+#define SGX_IOC_ENCLAVE_ADD_PAGES \
+	_IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
+#define SGX_IOC_ENCLAVE_INIT \
+	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
+
+/**
+ * struct sgx_enclave_create - parameter structure for the
+ *                             %SGX_IOC_ENCLAVE_CREATE ioctl
+ * @src:	address for the SECS page data
+ */
+struct sgx_enclave_create  {
+	__u64	src;
+};
+
+/**
+ * struct sgx_enclave_add_pages - parameter structure for the
+ *                                %SGX_IOC_ENCLAVE_ADD_PAGE ioctl
+ * @src:	start address for the page data
+ * @offset:	starting page offset
+ * @length:	length of the data (multiple of the page size)
+ * @secinfo:	address for the SECINFO data
+ * @flags:	page control flags
+ */
+struct sgx_enclave_add_pages {
+	__u64	src;
+	__u64	offset;
+	__u64	length;
+	__u64	secinfo;
+	__u64	flags;
+};
+
+/**
+ * struct sgx_enclave_init - parameter structure for the
+ *                           %SGX_IOC_ENCLAVE_INIT ioctl
+ * @sigstruct:	address for the SIGSTRUCT data
+ */
+struct sgx_enclave_init {
+	__u64 sigstruct;
+};
+
+#endif /* _UAPI_ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 874492d9e3bd..3ddcdabab081 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -1,4 +1,7 @@
 obj-y += \
+	driver.o \
+	encl.o \
 	encls.o \
+	ioctl.o \
 	main.o \
 	reclaim.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..c724dcccf2e2
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/acpi.h>
+#include <linux/cdev.h>
+#include <linux/mman.h>
+#include <linux/platform_device.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+MODULE_DESCRIPTION("Intel SGX Enclave Driver");
+MODULE_AUTHOR("Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct workqueue_struct *sgx_encl_wq;
+u64 sgx_encl_size_max_32;
+u64 sgx_encl_size_max_64;
+u32 sgx_misc_reserved_mask;
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_xsave_size_tbl[64];
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+	struct sgx_encl *encl;
+	int ret;
+
+	encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+	if (!encl)
+		return -ENOMEM;
+
+	atomic_set(&encl->flags, 0);
+	kref_init(&encl->refcount);
+	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
+	mutex_init(&encl->lock);
+	INIT_LIST_HEAD(&encl->mm_list);
+	spin_lock_init(&encl->mm_lock);
+
+	ret = init_srcu_struct(&encl->srcu);
+	if (ret) {
+		kfree(encl);
+		return ret;
+	}
+
+	file->private_data = encl;
+
+	return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+	struct sgx_encl *encl = file->private_data;
+	struct sgx_encl_mm *encl_mm;
+
+	for ( ; ; )  {
+		spin_lock(&encl->mm_lock);
+
+		if (list_empty(&encl->mm_list)) {
+			encl_mm = NULL;
+		} else {
+			encl_mm = list_first_entry(&encl->mm_list,
+						   struct sgx_encl_mm, list);
+			list_del_rcu(&encl_mm->list);
+		}
+
+		spin_unlock(&encl->mm_lock);
+
+		/* The list is empty, ready to go. */
+		if (!encl_mm)
+			break;
+
+		synchronize_srcu(&encl->srcu);
+		mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+		kfree(encl_mm);
+	};
+
+	mutex_lock(&encl->lock);
+	atomic_or(SGX_ENCL_DEAD, &encl->flags);
+	mutex_unlock(&encl->lock);
+
+	kref_put(&encl->refcount, sgx_encl_release);
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+			      unsigned long arg)
+{
+	return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = file->private_data;
+	int ret;
+
+	ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end,
+			       vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC));
+	if (ret)
+		return ret;
+
+	ret = sgx_encl_mm_add(encl, vma->vm_mm);
+	if (ret)
+		return ret;
+
+	vma->vm_ops = &sgx_vm_ops;
+	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+	vma->vm_private_data = encl;
+
+	return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+					   unsigned long addr,
+					   unsigned long len,
+					   unsigned long pgoff,
+					   unsigned long flags)
+{
+	if (flags & MAP_PRIVATE)
+		return -EINVAL;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+static const struct file_operations sgx_encl_fops = {
+	.owner			= THIS_MODULE,
+	.open			= sgx_open,
+	.release		= sgx_release,
+	.unlocked_ioctl		= sgx_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= sgx_compat_ioctl,
+#endif
+	.mmap			= sgx_mmap,
+	.get_unmapped_area	= sgx_get_unmapped_area,
+};
+
+static struct bus_type sgx_bus_type = {
+	.name	= "sgx",
+};
+
+static struct device sgx_encl_dev;
+static struct cdev sgx_encl_cdev;
+static dev_t sgx_devt;
+
+static void sgx_dev_release(struct device *dev)
+{
+}
+
+static __init int sgx_dev_init(const char *name, struct device *dev,
+			       struct cdev *cdev,
+			       const struct file_operations *fops, int minor)
+{
+	int ret;
+
+	device_initialize(dev);
+
+	dev->bus = &sgx_bus_type;
+	dev->devt = MKDEV(MAJOR(sgx_devt), minor);
+	dev->release = sgx_dev_release;
+
+	ret = dev_set_name(dev, name);
+	if (ret) {
+		put_device(dev);
+		return ret;
+	}
+
+	cdev_init(cdev, fops);
+	cdev->owner = THIS_MODULE;
+	return 0;
+}
+
+int __init sgx_drv_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	u64 attr_mask, xfrm_mask;
+	int ret;
+	int i;
+
+	if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
+		pr_info("The public key MSRs are not writable\n");
+		return -ENODEV;
+	}
+
+	ret = bus_register(&sgx_bus_type);
+	if (ret)
+		return ret;
+
+	ret = alloc_chrdev_region(&sgx_devt, 0, SGX_DRV_NR_DEVICES, "sgx");
+	if (ret < 0)
+		goto err_bus;
+
+	cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+	sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
+	sgx_encl_size_max_32 = 1ULL << (edx & 0xFF);
+
+	cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+	attr_mask = (((u64)ebx) << 32) + (u64)eax;
+	sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+	if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+
+		for (i = 2; i < 64; i++) {
+			cpuid_count(0x0D, i, &eax, &ebx, &ecx, &edx);
+			if ((1 << i) & xfrm_mask)
+				sgx_xsave_size_tbl[i] = eax + ebx;
+		}
+
+		sgx_xfrm_reserved_mask = ~xfrm_mask;
+	}
+
+	ret = sgx_dev_init("sgx/enclave", &sgx_encl_dev, &sgx_encl_cdev,
+			   &sgx_encl_fops, 0);
+	if (ret)
+		goto err_chrdev_region;
+
+	sgx_encl_wq = alloc_workqueue("sgx-encl-wq",
+				      WQ_UNBOUND | WQ_FREEZABLE, 1);
+	if (!sgx_encl_wq) {
+		ret = -ENOMEM;
+		goto err_encl_dev;
+	}
+
+	ret = cdev_device_add(&sgx_encl_cdev, &sgx_encl_dev);
+	if (ret)
+		goto err_encl_wq;
+
+	return 0;
+
+err_encl_wq:
+	destroy_workqueue(sgx_encl_wq);
+
+err_encl_dev:
+	put_device(&sgx_encl_dev);
+
+err_chrdev_region:
+	unregister_chrdev_region(sgx_devt, SGX_DRV_NR_DEVICES);
+
+err_bus:
+	bus_unregister(&sgx_bus_type);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..e95c6e86c0c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <crypto/hash.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_DRV_NR_DEVICES	2
+#define SGX_EINIT_SPIN_COUNT	20
+#define SGX_EINIT_SLEEP_COUNT	50
+#define SGX_EINIT_SLEEP_TIME	20
+
+extern struct workqueue_struct *sgx_encl_wq;
+extern u64 sgx_encl_size_max_32;
+extern u64 sgx_encl_size_max_64;
+extern u32 sgx_misc_reserved_mask;
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_xsave_size_tbl[64];
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_X86_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..cd2b8dbb0eca
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include "arch.h"
+#include "encl.h"
+#include "sgx.h"
+
+static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+						unsigned long addr)
+{
+	struct sgx_encl_page *entry;
+	unsigned int flags;
+
+	/* If process was forked, VMA is still there but vm_private_data is set
+	 * to NULL.
+	 */
+	if (!encl)
+		return ERR_PTR(-EFAULT);
+
+	flags = atomic_read(&encl->flags);
+
+	if ((flags & SGX_ENCL_DEAD) || !(flags & SGX_ENCL_INITIALIZED))
+		return ERR_PTR(-EFAULT);
+
+	entry = radix_tree_lookup(&encl->page_tree, addr >> PAGE_SHIFT);
+	if (!entry)
+		return ERR_PTR(-EFAULT);
+
+	/* Page is already resident in the EPC. */
+	if (entry->epc_page)
+		return entry;
+
+	return ERR_PTR(-EFAULT);
+}
+
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+				     struct mm_struct *mm)
+{
+	struct sgx_encl_mm *encl_mm =
+		container_of(mn, struct sgx_encl_mm, mmu_notifier);
+	struct sgx_encl_mm *tmp = NULL;
+
+	/*
+	 * The enclave itself can remove encl_mm.  Note, objects can't be moved
+	 * off an RCU protected list, but deletion is ok.
+	 */
+	spin_lock(&encl_mm->encl->mm_lock);
+	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+		if (tmp == encl_mm) {
+			list_del_rcu(&encl_mm->list);
+			break;
+		}
+	}
+	spin_unlock(&encl_mm->encl->mm_lock);
+
+	if (tmp == encl_mm) {
+		synchronize_srcu(&encl_mm->encl->srcu);
+		mmu_notifier_put(mn);
+	}
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+	struct sgx_encl_mm *encl_mm =
+		container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+	kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+	.release		= sgx_mmu_notifier_release,
+	.free_notifier		= sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+					    struct mm_struct *mm)
+{
+	struct sgx_encl_mm *encl_mm = NULL;
+	struct sgx_encl_mm *tmp;
+	int idx;
+
+	idx = srcu_read_lock(&encl->srcu);
+
+	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+		if (tmp->mm == mm) {
+			encl_mm = tmp;
+			break;
+		}
+	}
+
+	srcu_read_unlock(&encl->srcu, idx);
+
+	return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+	struct sgx_encl_mm *encl_mm;
+	int ret;
+
+	if (atomic_read(&encl->flags) & SGX_ENCL_DEAD)
+		return -EINVAL;
+
+	/*
+	 * mm_structs are kept on mm_list until the mm or the enclave dies,
+	 * i.e. once an mm is off the list, it's gone for good, therefore it's
+	 * impossible to get a false positive on @mm due to a stale mm_list.
+	 */
+	if (sgx_encl_find_mm(encl, mm))
+		return 0;
+
+	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+	if (!encl_mm)
+		return -ENOMEM;
+
+	encl_mm->encl = encl;
+	encl_mm->mm = mm;
+	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+	if (ret) {
+		kfree(encl_mm);
+		return ret;
+	}
+
+	spin_lock(&encl->mm_lock);
+	list_add_rcu(&encl_mm->list, &encl->mm_list);
+	spin_unlock(&encl->mm_lock);
+
+	synchronize_srcu(&encl->srcu);
+
+	return 0;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = vma->vm_private_data;
+
+	if (!encl)
+		return;
+
+	if (sgx_encl_mm_add(encl, vma->vm_mm))
+		vma->vm_private_data = NULL;
+}
+
+static unsigned int sgx_vma_fault(struct vm_fault *vmf)
+{
+	unsigned long addr = (unsigned long)vmf->address;
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_encl *encl = vma->vm_private_data;
+	struct sgx_encl_page *entry;
+	int ret = VM_FAULT_NOPAGE;
+	unsigned long pfn;
+
+	if (!encl)
+		return VM_FAULT_SIGBUS;
+
+	mutex_lock(&encl->lock);
+
+	entry = sgx_encl_load_page(encl, addr);
+	if (IS_ERR(entry)) {
+		if (unlikely(PTR_ERR(entry) != -EBUSY))
+			ret = VM_FAULT_SIGBUS;
+
+		goto out;
+	}
+
+	if (!follow_pfn(vma, addr, &pfn))
+		goto out;
+
+	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(entry->epc_page->desc));
+	if (ret != VM_FAULT_NOPAGE) {
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
+out:
+	mutex_unlock(&encl->lock);
+	return ret;
+}
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl:		an enclave
+ * @start:		lower bound of the address range, inclusive
+ * @end:		upper bound of the address range, exclusive
+ * @vm_prot_bits:	requested protections of the address range
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * the permissions requested by @vm_prot_bits do not exceed that of any enclave
+ * page to be mapped.  Page addresses that do not have an associated enclave
+ * page are interpreted to zero permissions.
+ *
+ * Return:
+ *   0 on success,
+ *   -EACCES if VMA permissions exceed enclave page permissions
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+		     unsigned long end, unsigned long vm_prot_bits)
+{
+	unsigned long idx, idx_start, idx_end;
+	struct sgx_encl_page *page;
+
+	/* PROT_NONE always succeeds. */
+	if (!vm_prot_bits)
+		return 0;
+
+	idx_start = PFN_DOWN(start);
+	idx_end = PFN_DOWN(end - 1);
+
+	for (idx = idx_start; idx <= idx_end; ++idx) {
+		mutex_lock(&encl->lock);
+		page = radix_tree_lookup(&encl->page_tree, idx);
+		mutex_unlock(&encl->lock);
+
+		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
+			return -EACCES;
+	}
+
+	return 0;
+}
+
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end, unsigned long prot)
+{
+	return sgx_encl_may_map(vma->vm_private_data, start, end,
+				calc_vm_prot_bits(prot, 0));
+}
+
+const struct vm_operations_struct sgx_vm_ops = {
+	.open = sgx_vma_open,
+	.fault = sgx_vma_fault,
+	.may_mprotect = sgx_vma_mprotect,
+};
+
+/**
+ * sgx_encl_find - find an enclave
+ * @mm:		mm struct of the current process
+ * @addr:	address in the ELRANGE
+ * @vma:	the resulting VMA
+ *
+ * Find an enclave identified by the given address. Give back a VMA that is
+ * part of the enclave and located in that address. The VMA is given back if it
+ * is a proper enclave VMA even if an &sgx_encl instance does not exist yet
+ * (enclave creation has not been performed).
+ *
+ * Return:
+ *   0 on success,
+ *   -EINVAL if an enclave was not found,
+ *   -ENOENT if the enclave has not been created yet
+ */
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma)
+{
+	struct vm_area_struct *result;
+	struct sgx_encl *encl;
+
+	result = find_vma(mm, addr);
+	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+		return -EINVAL;
+
+	encl = result->vm_private_data;
+	*vma = result;
+
+	return encl ? 0 : -ENOENT;
+}
+
+/**
+ * sgx_encl_destroy() - destroy enclave resources
+ * @encl:	an &sgx_encl instance
+ */
+void sgx_encl_destroy(struct sgx_encl *encl)
+{
+	struct sgx_encl_page *entry;
+	struct radix_tree_iter iter;
+	void **slot;
+
+	atomic_or(SGX_ENCL_DEAD, &encl->flags);
+
+	radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
+		entry = *slot;
+
+		if (entry->epc_page) {
+			sgx_free_page(entry->epc_page);
+			encl->secs_child_cnt--;
+			entry->epc_page = NULL;
+		}
+
+		radix_tree_delete(&entry->encl->page_tree,
+				  PFN_DOWN(entry->desc));
+		kfree(entry);
+	}
+
+	if (!encl->secs_child_cnt && encl->secs.epc_page) {
+		sgx_free_page(encl->secs.epc_page);
+		encl->secs.epc_page = NULL;
+	}
+}
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @kref:	address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+
+	sgx_encl_destroy(encl);
+
+	if (encl->backing)
+		fput(encl->backing);
+
+	WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+	/* Detect EPC page leak's. */
+	WARN_ON_ONCE(encl->secs_child_cnt);
+	WARN_ON_ONCE(encl->secs.epc_page);
+
+	kfree(encl);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..1d1bc5d590ee
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/**
+ * Copyright(c) 2016-19 Intel Corporation.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include "sgx.h"
+
+/**
+ * enum sgx_encl_page_desc - defines bits for an enclave page's descriptor
+ * %SGX_ENCL_PAGE_ADDR_MASK:		Holds the virtual address of the page.
+ *
+ * The page address for SECS is zero and is used by the subsystem to recognize
+ * the SECS page.
+ */
+enum sgx_encl_page_desc {
+	/* Bits 11:3 are available when the page is not swapped. */
+	SGX_ENCL_PAGE_ADDR_MASK		= PAGE_MASK,
+};
+
+#define SGX_ENCL_PAGE_ADDR(page) \
+	((page)->desc & SGX_ENCL_PAGE_ADDR_MASK)
+
+struct sgx_encl_page {
+	unsigned long desc;
+	unsigned long vm_max_prot_bits;
+	struct sgx_epc_page *epc_page;
+	struct sgx_encl *encl;
+};
+
+enum sgx_encl_flags {
+	SGX_ENCL_CREATED	= BIT(0),
+	SGX_ENCL_INITIALIZED	= BIT(1),
+	SGX_ENCL_DEBUG		= BIT(2),
+	SGX_ENCL_DEAD		= BIT(3),
+	SGX_ENCL_IOCTL		= BIT(4),
+};
+
+struct sgx_encl_mm {
+	struct sgx_encl *encl;
+	struct mm_struct *mm;
+	struct list_head list;
+	struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+	atomic_t flags;
+	u64 secs_attributes;
+	u64 allowed_attributes;
+	unsigned int page_cnt;
+	unsigned int secs_child_cnt;
+	struct mutex lock;
+	struct list_head mm_list;
+	spinlock_t mm_lock;
+	struct file *backing;
+	struct kref refcount;
+	struct srcu_struct srcu;
+	unsigned long base;
+	unsigned long size;
+	unsigned long ssaframesize;
+	struct radix_tree_root page_tree;
+	struct sgx_encl_page secs;
+	cpumask_t cpumask;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma);
+void sgx_encl_destroy(struct sgx_encl *encl);
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+		     unsigned long end, unsigned long vm_prot_bits);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..691efac24ed7
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-19 Intel Corporation.
+
+#include <asm/mman.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+static u32 sgx_calc_ssaframesize(u32 miscselect, u64 xfrm)
+{
+	u32 size_max = PAGE_SIZE;
+	u32 size;
+	int i;
+
+	for (i = 2; i < 64; i++) {
+		if (!((1 << i) & xfrm))
+			continue;
+
+		size = SGX_SSA_GPRS_SIZE + sgx_xsave_size_tbl[i];
+		if (miscselect & SGX_MISC_EXINFO)
+			size += SGX_SSA_MISC_EXINFO_SIZE;
+
+		if (size > size_max)
+			size_max = size;
+	}
+
+	return PFN_UP(size_max);
+}
+
+static int sgx_validate_secs(const struct sgx_secs *secs,
+			     unsigned long ssaframesize)
+{
+	if (secs->size < (2 * PAGE_SIZE) || !is_power_of_2(secs->size))
+		return -EINVAL;
+
+	if (secs->base & (secs->size - 1))
+		return -EINVAL;
+
+	if (secs->miscselect & sgx_misc_reserved_mask ||
+	    secs->attributes & sgx_attributes_reserved_mask ||
+	    secs->xfrm & sgx_xfrm_reserved_mask)
+		return -EINVAL;
+
+	if (secs->attributes & SGX_ATTR_MODE64BIT) {
+		if (secs->size > sgx_encl_size_max_64)
+			return -EINVAL;
+	} else if (secs->size > sgx_encl_size_max_32)
+		return -EINVAL;
+
+	if (!(secs->xfrm & XFEATURE_MASK_FP) ||
+	    !(secs->xfrm & XFEATURE_MASK_SSE) ||
+	    (((secs->xfrm >> XFEATURE_BNDREGS) & 1) !=
+	     ((secs->xfrm >> XFEATURE_BNDCSR) & 1)))
+		return -EINVAL;
+
+	if (!secs->ssa_frame_size || ssaframesize > secs->ssa_frame_size)
+		return -EINVAL;
+
+	if (memchr_inv(secs->reserved1, 0, sizeof(secs->reserved1)) ||
+	    memchr_inv(secs->reserved2, 0, sizeof(secs->reserved2)) ||
+	    memchr_inv(secs->reserved3, 0, sizeof(secs->reserved3)) ||
+	    memchr_inv(secs->reserved4, 0, sizeof(secs->reserved4)))
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+						 unsigned long offset,
+						 u64 secinfo_flags)
+{
+	struct sgx_encl_page *encl_page;
+	unsigned long prot;
+
+	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+	if (!encl_page)
+		return ERR_PTR(-ENOMEM);
+
+	encl_page->desc = encl->base + offset;
+	encl_page->encl = encl;
+
+	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
+	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+	/*
+	 * TCS pages must always RW set for CPU access while the SECINFO
+	 * permissions are *always* zero - the CPU ignores the user provided
+	 * values and silently overwrites them with zero permissions.
+	 */
+	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+		prot |= PROT_READ | PROT_WRITE;
+
+	/* Calculate maximum of the VM flags for the page. */
+	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+	return encl_page;
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+	unsigned long encl_size = secs->size + PAGE_SIZE;
+	struct sgx_epc_page *secs_epc;
+	unsigned long ssaframesize;
+	struct sgx_pageinfo pginfo;
+	struct sgx_secinfo secinfo;
+	struct file *backing;
+	long ret;
+
+	if (atomic_read(&encl->flags) & SGX_ENCL_CREATED)
+		return -EINVAL;
+
+	ssaframesize = sgx_calc_ssaframesize(secs->miscselect, secs->xfrm);
+	if (sgx_validate_secs(secs, ssaframesize)) {
+		pr_debug("invalid SECS\n");
+		return -EINVAL;
+	}
+
+	backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+				   VM_NORESERVE);
+	if (IS_ERR(backing))
+		return PTR_ERR(backing);
+
+	encl->backing = backing;
+
+	secs_epc = sgx_try_alloc_page();
+	if (IS_ERR(secs_epc)) {
+		ret = PTR_ERR(secs_epc);
+		goto err_out_backing;
+	}
+
+	encl->secs.epc_page = secs_epc;
+
+	pginfo.addr = 0;
+	pginfo.contents = (unsigned long)secs;
+	pginfo.metadata = (unsigned long)&secinfo;
+	pginfo.secs = 0;
+	memset(&secinfo, 0, sizeof(secinfo));
+
+	ret = __ecreate((void *)&pginfo, sgx_epc_addr(secs_epc));
+	if (ret) {
+		pr_debug("ECREATE returned %ld\n", ret);
+		goto err_out;
+	}
+
+	if (secs->attributes & SGX_ATTR_DEBUG)
+		atomic_or(SGX_ENCL_DEBUG, &encl->flags);
+
+	encl->secs.encl = encl;
+	encl->secs_attributes = secs->attributes;
+	encl->allowed_attributes |= SGX_ATTR_ALLOWED_MASK;
+	encl->base = secs->base;
+	encl->size = secs->size;
+	encl->ssaframesize = secs->ssa_frame_size;
+
+	/*
+	 * Set SGX_ENCL_CREATED only after the enclave is fully prepped.  This
+	 * allows setting and checking enclave creation without having to take
+	 * encl->lock.
+	 */
+	atomic_or(SGX_ENCL_CREATED, &encl->flags);
+
+	return 0;
+
+err_out:
+	sgx_free_page(encl->secs.epc_page);
+	encl->secs.epc_page = NULL;
+
+err_out_backing:
+	fput(encl->backing);
+	encl->backing = NULL;
+
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @filep:	open file to /dev/sgx
+ * @arg:	userspace pointer to a struct sgx_enclave_create instance
+ *
+ * Allocate kernel data structures for a new enclave and execute ECREATE after
+ * verifying the correctness of the provided SECS.
+ *
+ * Note, enforcement of restricted and disallowed attributes is deferred until
+ * sgx_ioc_enclave_init(), only the architectural correctness of the SECS is
+ * checked by sgx_ioc_enclave_create().
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+	struct sgx_enclave_create ecreate;
+	struct page *secs_page;
+	struct sgx_secs *secs;
+	int ret;
+
+	if (copy_from_user(&ecreate, arg, sizeof(ecreate)))
+		return -EFAULT;
+
+	secs_page = alloc_page(GFP_HIGHUSER);
+	if (!secs_page)
+		return -ENOMEM;
+
+	secs = kmap(secs_page);
+	if (copy_from_user(secs, (void __user *)ecreate.src, sizeof(*secs))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = sgx_encl_create(encl, secs);
+
+out:
+	kunmap(secs_page);
+	__free_page(secs_page);
+	return ret;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+	u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+	u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+	if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+		return -EINVAL;
+
+	if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+		return -EINVAL;
+
+	/*
+	 * CPU will silently overwrite the permissions as zero, which means
+	 * that we need to validate it ourselves.
+	 */
+	if (pt == SGX_SECINFO_TCS && perm)
+		return -EINVAL;
+
+	if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+		return -EINVAL;
+
+	if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+			       struct sgx_encl_page *encl_page,
+			       struct sgx_epc_page *epc_page,
+			       struct sgx_secinfo *secinfo, unsigned long src)
+{
+	struct sgx_pageinfo pginfo;
+	struct vm_area_struct *vma;
+	struct page *src_page;
+	int ret;
+
+	/* Query vma's VM_MAYEXEC as an indirect path_noexec() check. */
+	if (encl_page->vm_max_prot_bits & VM_EXEC) {
+		vma = find_vma(current->mm, src);
+		if (!vma)
+			return -EFAULT;
+
+		if (!(vma->vm_flags & VM_MAYEXEC))
+			return -EACCES;
+	}
+
+	ret = get_user_pages(src, 1, 0, &src_page, NULL);
+	if (ret < 1)
+		return ret;
+
+	pginfo.secs = (unsigned long)sgx_epc_addr(encl->secs.epc_page);
+	pginfo.addr = SGX_ENCL_PAGE_ADDR(encl_page);
+	pginfo.metadata = (unsigned long)secinfo;
+	pginfo.contents = (unsigned long)kmap_atomic(src_page);
+
+	ret = __eadd(&pginfo, sgx_epc_addr(epc_page));
+
+	kunmap_atomic((void *)pginfo.contents);
+	put_page(src_page);
+
+	return ret ? -EFAULT : 0;
+}
+
+static int __sgx_encl_extend(struct sgx_encl *encl,
+			     struct sgx_epc_page *epc_page)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		ret = __eextend(sgx_epc_addr(encl->secs.epc_page),
+				sgx_epc_addr(epc_page) + (i * 0x100));
+		if (ret) {
+			if (encls_failed(ret))
+				ENCLS_WARN(ret, "EEXTEND");
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl,
+			     struct sgx_enclave_add_pages *addp,
+			     struct sgx_secinfo *secinfo)
+{
+	struct sgx_encl_page *encl_page;
+	struct sgx_epc_page *epc_page;
+	int ret;
+
+	encl_page = sgx_encl_page_alloc(encl, addp->offset, secinfo->flags);
+	if (IS_ERR(encl_page))
+		return PTR_ERR(encl_page);
+
+	epc_page = sgx_try_alloc_page();
+	if (IS_ERR(epc_page)) {
+		kfree(encl_page);
+		return PTR_ERR(epc_page);
+	}
+
+	if (atomic_read(&encl->flags) &
+	    (SGX_ENCL_INITIALIZED | SGX_ENCL_DEAD)) {
+		ret = -EFAULT;
+		goto err_out_free;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	mutex_lock(&encl->lock);
+
+	/*
+	 * Insert prior to EADD in case of OOM.  EADD modifies MRENCLAVE, i.e.
+	 * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+	 * to userspace errors (or kernel/hardware bugs).
+	 */
+	ret = radix_tree_insert(&encl->page_tree, PFN_DOWN(encl_page->desc),
+				encl_page);
+	if (ret)
+		goto err_out_unlock;
+
+	ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+				  addp->src);
+	if (ret)
+		goto err_out;
+
+	/*
+	 * Complete the "add" before doing the "extend" so that the "add"
+	 * isn't in a half-baked state in the extremely unlikely scenario the
+	 * the enclave will be destroyed in response to EEXTEND failure.
+	 */
+	encl_page->encl = encl;
+	encl_page->epc_page = epc_page;
+	encl->secs_child_cnt++;
+
+	if (addp->flags & SGX_PAGE_MEASURE) {
+		ret = __sgx_encl_extend(encl, epc_page);
+		if (ret)
+			sgx_encl_destroy(encl);
+	}
+
+	mutex_unlock(&encl->lock);
+	up_read(&current->mm->mmap_sem);
+	return ret;
+
+err_out:
+	radix_tree_delete(&encl_page->encl->page_tree,
+			  PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+	mutex_unlock(&encl->lock);
+	up_read(&current->mm->mmap_sem);
+
+err_out_free:
+	sgx_free_page(epc_page);
+	kfree(encl_page);
+
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl:       pointer to an enclave instance (via ioctl() file pointer)
+ * @arg:	a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add (EADD) one or more pages to an uninitialized enclave, and optionally
+ * extend (EEXTEND) the measurement with the contents of the page. The range of
+ * pages must be virtually contiguous. The SECINFO and measurement mask are
+ * applied to all pages, i.e. pages with different properties must be added in
+ * separate calls.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * CPU silently zeros them. Allowing anything else would cause a mismatch in
+ * the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ * 3. No page: PROT_NONE.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * As stated above, a non-existent page is interpreted as a page with no
+ * permissions. In effect, this allows mmap() with PROT_NONE to be used to seek
+ * an address range for the enclave that can be then populated into SECS.
+ *
+ * @arg->addr, @arg->src and @arg->length are adjusted to reflect the
+ * remaining pages that need to be added to the enclave, e.g. userspace can
+ * re-invoke SGX_IOC_ENCLAVE_ADD_PAGES using the same struct in response to an
+ * ERESTARTSYS error.
+ *
+ * Return:
+ *   0 on success,
+ *   -EINVAL if any input param or the SECINFO contains invalid data,
+ *   -EACCES if an executable source page is located in a noexec partition,
+ *   -ENOMEM if any memory allocation, including EPC, fails,
+ *   -ERESTARTSYS if a pending signal is recognized
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+	struct sgx_enclave_add_pages addp;
+	struct sgx_secinfo secinfo;
+	int ret;
+
+	if (!(atomic_read(&encl->flags) & SGX_ENCL_CREATED))
+		return -EINVAL;
+
+	if (copy_from_user(&addp, arg, sizeof(addp)))
+		return -EFAULT;
+
+	if (!IS_ALIGNED(addp.offset, PAGE_SIZE) ||
+	    !IS_ALIGNED(addp.src, PAGE_SIZE))
+		return -EINVAL;
+
+	if (!(access_ok(addp.src, PAGE_SIZE)))
+		return -EFAULT;
+
+	if (addp.length & (PAGE_SIZE - 1))
+		return -EINVAL;
+
+	if (addp.offset + addp.length - PAGE_SIZE >= encl->size)
+		return -EINVAL;
+
+	if (copy_from_user(&secinfo, (void __user *)addp.secinfo,
+			   sizeof(secinfo)))
+		return -EFAULT;
+
+	if (sgx_validate_secinfo(&secinfo))
+		return -EINVAL;
+
+	for ( ; addp.length > 0; addp.length -= PAGE_SIZE) {
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched())
+			cond_resched();
+
+		ret = sgx_encl_add_page(encl, &addp, &secinfo);
+		if (ret)
+			break;
+
+		addp.offset += PAGE_SIZE;
+		addp.src += PAGE_SIZE;
+	}
+
+	if (copy_to_user(arg, &addp, sizeof(addp)))
+		return -EFAULT;
+
+	return ret;
+}
+
+static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
+			      void *hash)
+{
+	SHASH_DESC_ON_STACK(shash, tfm);
+
+	shash->tfm = tfm;
+
+	return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
+}
+
+static int sgx_get_key_hash(const void *modulus, void *hash)
+{
+	struct crypto_shash *tfm;
+	int ret;
+
+	tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	ret = __sgx_get_key_hash(tfm, modulus, hash);
+
+	crypto_free_shash(tfm);
+	return ret;
+}
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+			 struct sgx_einittoken *token)
+{
+	u64 mrsigner[4];
+	int ret;
+	int i;
+	int j;
+
+	/* Check that the required attributes have been authorized. */
+	if (encl->secs_attributes & ~encl->allowed_attributes)
+		return -EINVAL;
+
+	ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
+	if (ret)
+		return ret;
+
+	mutex_lock(&encl->lock);
+
+	if (atomic_read(&encl->flags) & SGX_ENCL_INITIALIZED) {
+		ret = -EFAULT;
+		goto err_out;
+	}
+
+	for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+		for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+			ret = sgx_einit(sigstruct, token, encl->secs.epc_page,
+					mrsigner);
+			if (ret == SGX_UNMASKED_EVENT)
+				continue;
+			else
+				break;
+		}
+
+		if (ret != SGX_UNMASKED_EVENT)
+			break;
+
+		msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			goto err_out;
+		}
+	}
+
+	if (ret & ENCLS_FAULT_FLAG) {
+		if (encls_failed(ret))
+			ENCLS_WARN(ret, "EINIT");
+
+		sgx_encl_destroy(encl);
+		ret = -EFAULT;
+	} else if (ret) {
+		pr_debug("EINIT returned %d\n", ret);
+		ret = -EPERM;
+	} else {
+		atomic_or(SGX_ENCL_INITIALIZED, &encl->flags);
+	}
+
+err_out:
+	mutex_unlock(&encl->lock);
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init - handler for %SGX_IOC_ENCLAVE_INIT
+ *
+ * @filep:	open file to /dev/sgx
+ * @arg:	userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT.  The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ *
+ * Return:
+ *   0 on success,
+ *   SGX error code on EINIT failure,
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+	struct sgx_einittoken *einittoken;
+	struct sgx_sigstruct *sigstruct;
+	struct sgx_enclave_init einit;
+	struct page *initp_page;
+	int ret;
+
+	if (!(atomic_read(&encl->flags) & SGX_ENCL_CREATED))
+		return -EINVAL;
+
+	if (copy_from_user(&einit, arg, sizeof(einit)))
+		return -EFAULT;
+
+	initp_page = alloc_page(GFP_HIGHUSER);
+	if (!initp_page)
+		return -ENOMEM;
+
+	sigstruct = kmap(initp_page);
+	einittoken = (struct sgx_einittoken *)
+		((unsigned long)sigstruct + PAGE_SIZE / 2);
+	memset(einittoken, 0, sizeof(*einittoken));
+
+	if (copy_from_user(sigstruct, (void __user *)einit.sigstruct,
+			   sizeof(*sigstruct))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = sgx_encl_init(encl, sigstruct, einittoken);
+
+out:
+	kunmap(initp_page);
+	__free_page(initp_page);
+	return ret;
+}
+
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct sgx_encl *encl = filep->private_data;
+	int ret, encl_flags;
+
+	encl_flags = atomic_fetch_or(SGX_ENCL_IOCTL, &encl->flags);
+	if (encl_flags & SGX_ENCL_IOCTL)
+		return -EBUSY;
+
+	if (encl_flags & SGX_ENCL_DEAD)
+		return -EFAULT;
+
+	switch (cmd) {
+	case SGX_IOC_ENCLAVE_CREATE:
+		ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+		break;
+	case SGX_IOC_ENCLAVE_ADD_PAGES:
+		ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+		break;
+	case SGX_IOC_ENCLAVE_INIT:
+		ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+
+	atomic_andnot(SGX_ENCL_IOCTL, &encl->flags);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 6a37df61ae32..36a295a0272b 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -8,6 +8,7 @@
 #include <linux/ratelimit.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include "driver.h"
 #include "encls.h"
 
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -196,6 +197,8 @@ static bool __init sgx_page_cache_init(void)
 
 static void __init sgx_init(void)
 {
+	int ret;
+
 	if (!boot_cpu_has(X86_FEATURE_SGX))
 		return;
 
@@ -205,8 +208,15 @@ static void __init sgx_init(void)
 	if (!sgx_page_reclaimer_init())
 		goto err_page_cache;
 
+	ret = sgx_drv_init();
+	if (ret)
+		goto err_kthread;
+
 	return;
 
+err_kthread:
+	kthread_stop(ksgxswapd_tsk);
+
 err_page_cache:
 	sgx_page_cache_teardown();
 }
diff --git a/arch/x86/kernel/cpu/sgx/reclaim.c b/arch/x86/kernel/cpu/sgx/reclaim.c
index f071158d34f6..bdb42f4326aa 100644
--- a/arch/x86/kernel/cpu/sgx/reclaim.c
+++ b/arch/x86/kernel/cpu/sgx/reclaim.c
@@ -10,6 +10,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include "encls.h"
+#include "driver.h"
 
 struct task_struct *ksgxswapd_tsk;
 
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v2 3/4] KEYS: Added BUILTIN_TRUSTED_KEYS enum to measure keys added to builtin_trusted_keys keyring
From: Mimi Zohar @ 2019-10-28 17:08 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian, dhowells, casey, sashal, jamorris,
	linux-security-module, linux-integrity, linux-kernel, keyrings
In-Reply-To: <8494baa1-c4db-f08b-26c9-2e56279075d0@linux.microsoft.com>

On Mon, 2019-10-28 at 08:12 -0700, Lakshmi Ramasubramanian wrote:
> On 10/27/19 7:33 AM, Mimi Zohar wrote:
> 
> > .builtin_trusted_keys is a trusted keyring, which is created by the
> > kernel.  It cannot be deleted or replaced by userspace, so it should
> > be possible to correlate a keyring name with a keyring number on
> > policy load.
> 
> Yes - at policy load we can map a keyring name to a keyring number.
> 
> But at runtime we still need to know if the keyring parameter passed to 
> the IMA hook function is configured to be measured.
> 
> void ima_post_key_create_or_update(struct key *keyring, struct key *key,
> 				   unsigned long flags, bool create);
> {
>     => Get the keyring number for the given "keyring".

There is no "getting" involved here.  Pass "keyring" to
process_buffer_measurement and on to ima_get_action().

>     => Check if the keyring number is in the configured IMA policy.

ima_get_action() should do a simple compare of the valued stored in
the IMA policy with the value returned by key_serial().

Mimi

>     => If yes, measure the key.
>     => Else, do nothing.
> }

> Did I misunderstand what you had stated?


^ permalink raw reply

* Re: CAAM hw key support status
From: Iuliana Prodan @ 2019-10-28 16:12 UTC (permalink / raw)
  To: Stefan Müller-Klieser
  Cc: Franck Lenormand, Horia Geanta, linux-crypto@vger.kernel.org,
	linux-security-module@vger.kernel.org
In-Reply-To: <2171b6f9-8e80-8f79-fffb-5a5b641ad196@phytec.de>

Hi Stefan,

On 10/21/2019 5:43 PM, Stefan Müller-Klieser wrote:
> Hi Franck,
> 
> you sent an RFC to the linux-security-module list:
> 
> March 1, 2019, 4:09 p.m. UTC
> [RFC,0/2] Create CAAM HW key in linux keyring and use in dmcrypt
> https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.kernel.org%2Fcover%2F10835635%2F&amp;data=02%7C01%7Ciuliana.prodan%40nxp.com%7Cd2a2160934414cb916e008d756350679%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C1%7C637072658050831257&amp;sdata=3ztN37KIopwP9X3tkHH96H4JFwsL2vPG0p%2BRqtlG7mM%3D&amp;reserved=0
> 
> Is there any update on this work? How do you want to move forward?

We just resumed this work. We want to see if we can use an already 
defined key type rather than implement a new one.

Regards,
Iulia



^ permalink raw reply

* Re: [PATCH v2 3/4] KEYS: Added BUILTIN_TRUSTED_KEYS enum to measure keys added to builtin_trusted_keys keyring
From: Lakshmi Ramasubramanian @ 2019-10-28 15:56 UTC (permalink / raw)
  To: Mimi Zohar, dhowells, casey, sashal, jamorris,
	linux-security-module, linux-integrity, linux-kernel, keyrings
In-Reply-To: <1572186810.4532.206.camel@linux.ibm.com>

On 10/27/19 7:33 AM, Mimi Zohar wrote:

> Other examples of trusted keyrings are: .ima, .evm, .platform,
> .blacklist, .builtin_regdb_keys.  Instead of defining a keyring
> specific method of getting the keyring number, define a generic
> method.  For example, the userspace command "keyctl describe
> %keyring:.builtin_trusted_keys" searches /proc/keys, but the kernel
> shouldn't need to access /proc/keys.

"description" field in "struct key" is set to ".builtin_trusted_keys" 
for Built-In Trusted Keys keyring.

Similarly, for other keyrings such as .ima, .evm, .blacklist, 
.builtin_regdb_keys, etc.

 > # measure keys on the BUILTIN and IMA keyrings into a different PCR
 > measure func=KEYRING_CHECK keyring=".builtin_trusted_keys|.ima" pcr=11

With IMA policy set like above, the keyring to keyring number mapping 
can be set at IMA policy load.

My earlier point about mapping the "keyring" to "keyring number" at 
runtime (when the IMA hook is called) still needs to be done to know if 
the given keyring is in the policy or not.

void ima_post_key_create_or_update(struct key *keyring, struct key *key,
				   unsigned long flags, bool create)

thanks,
  -lakshmi

^ permalink raw reply

* Re: [PATCH v10 13/25] LSM: Specify which LSM to display
From: Stephen Smalley @ 2019-10-28 15:25 UTC (permalink / raw)
  To: Casey Schaufler, casey.schaufler, jmorris, linux-security-module,
	selinux
  Cc: keescook, john.johansen, penguin-kernel, paul
In-Reply-To: <20191024205228.6922-14-casey@schaufler-ca.com>

On 10/24/19 4:52 PM, Casey Schaufler wrote:
> Create a new entry "display" in /proc/.../attr for controlling
> which LSM security information is displayed for a process.
> The name of an active LSM that supplies hooks for human readable
> data may be written to "display" to set the value. The name of the
> LSM currently in use can be read from "display". At this point there
> can only be one LSM capable of display active. A helper function
> lsm_task_display() is provided to get the display slot for a task_struct.
> 
> Setting the "display" requires that all security modules using
> setprocattr hooks allow the action. Each security module is
> responsible for defining its policy.
> 
> AppArmor hook provided by John Johansen <john.johansen@canonical.com>
> SELinux hook provided by Stephen Smalley <sds@tycho.nsa.gov>
> 
> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
> ---
>   fs/proc/base.c                       |   1 +
>   include/linux/lsm_hooks.h            |  15 +++
>   security/apparmor/include/apparmor.h |   3 +-
>   security/apparmor/lsm.c              |  36 ++++++
>   security/security.c                  | 159 ++++++++++++++++++++++++---
>   security/selinux/hooks.c             |  11 ++
>   security/selinux/include/classmap.h  |   2 +-
>   security/smack/smack_lsm.c           |   7 ++
>   8 files changed, 215 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ddef482f1334..7bf70e041315 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2618,6 +2618,7 @@ static const struct pid_entry attr_dir_stuff[] = {
>   	ATTR(NULL, "fscreate",		0666),
>   	ATTR(NULL, "keycreate",		0666),
>   	ATTR(NULL, "sockcreate",	0666),
> +	ATTR(NULL, "display",		0666),
>   #ifdef CONFIG_SECURITY_SMACK
>   	DIR("smack",			0555,
>   	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index 24b7d78a36b2..706fd6d3d46e 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -2132,4 +2132,19 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
>   
>   extern int lsm_inode_alloc(struct inode *inode);
>   
> +/**
> + * lsm_task_display - the "display" LSM for this task
> + * @task: The task to report on
> + *
> + * Returns the task's display LSM slot.
> + */
> +static inline int lsm_task_display(struct task_struct *task)
> +{
> +	int *display = task->security;
> +
> +	if (display)
> +		return *display;
> +	return LSMBLOB_INVALID;
> +}
> +
>   #endif /* ! __LINUX_LSM_HOOKS_H */
> diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h
> index 73d63b58d875..aaebfe979a68 100644
> --- a/security/apparmor/include/apparmor.h
> +++ b/security/apparmor/include/apparmor.h
> @@ -32,8 +32,9 @@
>   #define AA_CLASS_SIGNAL		10
>   #define AA_CLASS_NET		14
>   #define AA_CLASS_LABEL		16
> +#define AA_CLASS_DISPLAY_LSM	17
>   
> -#define AA_CLASS_LAST		AA_CLASS_LABEL
> +#define AA_CLASS_LAST		AA_CLASS_DISPLAY_LSM
>   
>   /* Control parameters settable through module/boot flags */
>   extern enum audit_mode aa_g_audit;
> diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
> index ec2e39aa9a84..c4835d05c5ea 100644
> --- a/security/apparmor/lsm.c
> +++ b/security/apparmor/lsm.c
> @@ -603,6 +603,25 @@ static int apparmor_getprocattr(struct task_struct *task, char *name,
>   	return error;
>   }
>   
> +
> +static int profile_display_lsm(struct aa_profile *profile,
> +			       struct common_audit_data *sa)
> +{
> +	struct aa_perms perms = { };
> +	unsigned int state;
> +
> +	state = PROFILE_MEDIATES(profile, AA_CLASS_DISPLAY_LSM);
> +	if (state) {
> +		aa_compute_perms(profile->policy.dfa, state, &perms);
> +		aa_apply_modes_to_perms(profile, &perms);
> +		aad(sa)->label = &profile->label;
> +
> +		return aa_check_perms(profile, &perms, AA_MAY_WRITE, sa, NULL);
> +	}
> +
> +	return 0;
> +}
> +
>   static int apparmor_setprocattr(const char *name, void *value,
>   				size_t size)
>   {
> @@ -614,6 +633,23 @@ static int apparmor_setprocattr(const char *name, void *value,
>   	if (size == 0)
>   		return -EINVAL;
>   
> +	/* ToDo: Decide on the AppArmor policy for switching the display */
> +	if (!strcmp(name, "display"))
> +		return 0;

Leftover from before jj's patch.

> +
> +	/* LSM infrastructure does actual setting of display if allowed */
> +	if (!strcmp(name, "display")) {
> +		struct aa_profile *profile;
> +		struct aa_label *label;
> +
> +		aad(&sa)->info = "set display lsm";
> +		label = begin_current_label_crit_section();
> +		error = fn_for_each_confined(label, profile,
> +					     profile_display_lsm(profile, &sa));
> +		end_current_label_crit_section(label);
> +		return error;
> +	}
> +
>   	/* AppArmor requires that the buffer must be null terminated atm */
>   	if (args[size - 1] != '\0') {
>   		/* null terminate */
> diff --git a/security/security.c b/security/security.c
> index 8368d1e726a0..687a5e184e57 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -31,6 +31,7 @@
>   #include <linux/backing-dev.h>
>   #include <linux/string.h>
>   #include <linux/msg.h>
> +#include <linux/binfmts.h>
>   #include <net/flow.h>
>   #include <net/sock.h>
>   
> @@ -46,7 +47,14 @@ static struct kmem_cache *lsm_file_cache;
>   static struct kmem_cache *lsm_inode_cache;
>   
>   char *lsm_names;
> -static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;
> +
> +/*
> + * The task blob includes the "display" slot used for
> + * chosing which module presents contexts.
> + */
> +static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init = {
> +	.lbs_task = sizeof(int),
> +};
>   
>   /* Boot-time LSM user choice */
>   static __initdata const char *chosen_lsm_order;
> @@ -415,8 +423,10 @@ static int lsm_append(const char *new, char **result)
>   
>   /*
>    * Current index to use while initializing the lsmblob secid list.
> + * Pointers to the LSM id structures for local use.
>    */
>   static int lsm_slot __lsm_ro_after_init;
> +static struct lsm_id *lsm_slotlist[LSMBLOB_ENTRIES];
>   
>   /**
>    * security_add_hooks - Add a modules hooks to the hook lists.
> @@ -436,6 +446,7 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
>   	if (lsmid->slot == LSMBLOB_NEEDED) {
>   		if (lsm_slot >= LSMBLOB_ENTRIES)
>   			panic("%s Too many LSMs registered.\n", __func__);
> +		lsm_slotlist[lsm_slot] = lsmid;
>   		lsmid->slot = lsm_slot++;
>   		init_debug("%s assigned lsmblob slot %d\n", lsmid->lsm,
>   			   lsmid->slot);
> @@ -556,6 +567,8 @@ int lsm_inode_alloc(struct inode *inode)
>    */
>   static int lsm_task_alloc(struct task_struct *task)
>   {
> +	int *display;
> +
>   	if (blob_sizes.lbs_task == 0) {
>   		task->security = NULL;
>   		return 0;
> @@ -564,6 +577,15 @@ static int lsm_task_alloc(struct task_struct *task)
>   	task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
>   	if (task->security == NULL)
>   		return -ENOMEM;
> +
> +	/*
> +	 * The start of the task blob contains the "display" LSM slot number.
> +	 * Start with it set to the invalid slot number, indicating that the
> +	 * default first registered LSM be displayed.
> +	 */
> +	display = task->security;
> +	*display = LSMBLOB_INVALID;
> +
>   	return 0;
>   }
>   
> @@ -1502,14 +1524,26 @@ int security_file_open(struct file *file)
>   
>   int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
>   {
> +	int *odisplay = current->security;
> +	int *ndisplay;
>   	int rc = lsm_task_alloc(task);
>   
> -	if (rc)
> +	if (unlikely(rc))
>   		return rc;
> +
>   	rc = call_int_hook(task_alloc, 0, task, clone_flags);
> -	if (unlikely(rc))
> +	if (unlikely(rc)) {
>   		security_task_free(task);
> -	return rc;
> +		return rc;
> +	}
> +
> +	if (odisplay) {
> +		ndisplay = task->security;
> +		if (ndisplay)
> +			*ndisplay = *odisplay;
> +	}
> +
> +	return 0;
>   }
>   
>   void security_task_free(struct task_struct *task)
> @@ -1906,23 +1940,100 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
>   				char **value)
>   {
>   	struct security_hook_list *hp;
> +	int display = lsm_task_display(current);
> +	int slot = 0;
> +
> +	if (!strcmp(name, "display")) {
> +		/*
> +		 * lsm_slot will be 0 if there are no displaying modules.
> +		 */
> +		if (lsm_slot == 0)
> +			return -EINVAL;
> +		if (display != LSMBLOB_INVALID)
> +			slot = display;
> +		*value = kstrdup(lsm_slotlist[slot]->lsm, GFP_KERNEL);
> +		if (*value)
> +			return strlen(*value);
> +		return -ENOMEM;
> +	}
>   
>   	hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
>   		if (lsm != NULL && strcmp(lsm, hp->lsmid->lsm))
>   			continue;
> +		if (lsm == NULL && display != LSMBLOB_INVALID &&
> +		    display != hp->lsmid->slot)
> +			continue;
>   		return hp->hook.getprocattr(p, name, value);
>   	}
>   	return -EINVAL;
>   }
>   
> +/**
> + * security_setprocattr - Set process attributes via /proc
> + * @lsm: name of module involved, or NULL
> + * @name: name of the attribute
> + * @value: value to set the attribute to
> + * @size: size of the value
> + *
> + * Set the process attribute for the specified security module
> + * to the specified value. Note that this can only be used to set
> + * the process attributes for the current, or "self" process.
> + * The /proc code has already done this check.
> + *
> + * Returns 0 on success, an appropriate code otherwise.
> + */
>   int security_setprocattr(const char *lsm, const char *name, void *value,
>   			 size_t size)
>   {
>   	struct security_hook_list *hp;
> +	char *term;
> +	char *cp;
> +	int *display = current->security;
> +	int rc = -EINVAL;
> +	int slot = 0;
> +
> +	if (!strcmp(name, "display")) {
> +		/*
> +		 * Change the "display" value only if all the security
> +		 * modules that support setting a procattr allow it.
> +		 * It is assumed that all such security modules will be
> +		 * cooperative.
> +		 */
> +		if (size == 0)
> +			return -EINVAL;
> +
> +		hlist_for_each_entry(hp, &security_hook_heads.setprocattr,
> +				     list) {
> +			rc = hp->hook.setprocattr(name, value, size);
> +			if (rc < 0)
> +				return rc;
> +		}
> +
> +		rc = -EINVAL;
> +
> +		term = kmemdup_nul(value, size, GFP_KERNEL);
> +		if (term == NULL)
> +			return -ENOMEM;
> +
> +		cp = strsep(&term, " \n");
> +
> +		for (slot = 0; slot < lsm_slot; slot++)
> +			if (!strcmp(cp, lsm_slotlist[slot]->lsm)) {
> +				*display = lsm_slotlist[slot]->slot;
> +				rc = size;
> +				break;
> +			}
> +
> +		kfree(cp);
> +		return rc;
> +	}
>   
>   	hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
>   		if (lsm != NULL && strcmp(lsm, hp->lsmid->lsm))
>   			continue;
> +		if (lsm == NULL && *display != LSMBLOB_INVALID &&
> +		    *display != hp->lsmid->slot)
> +			continue;
>   		return hp->hook.setprocattr(name, value, size);
>   	}
>   	return -EINVAL;
> @@ -1942,15 +2053,15 @@ EXPORT_SYMBOL(security_ismaclabel);
>   int security_secid_to_secctx(struct lsmblob *blob, char **secdata, u32 *seclen)
>   {
>   	struct security_hook_list *hp;
> -	int rc;
> +	int display = lsm_task_display(current);
>   
>   	hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) {
>   		if (WARN_ON(hp->lsmid->slot < 0 || hp->lsmid->slot >= lsm_slot))
>   			continue;
> -		rc = hp->hook.secid_to_secctx(blob->secid[hp->lsmid->slot],
> -					      secdata, seclen);
> -		if (rc != 0)
> -			return rc;
> +		if (display == LSMBLOB_INVALID || display == hp->lsmid->slot)
> +			return hp->hook.secid_to_secctx(
> +					blob->secid[hp->lsmid->slot],
> +					secdata, seclen);
>   	}
>   	return 0;
>   }
> @@ -1960,16 +2071,15 @@ int security_secctx_to_secid(const char *secdata, u32 seclen,
>   			     struct lsmblob *blob)
>   {
>   	struct security_hook_list *hp;
> -	int rc;
> +	int display = lsm_task_display(current);
>   
>   	lsmblob_init(blob, 0);
>   	hlist_for_each_entry(hp, &security_hook_heads.secctx_to_secid, list) {
>   		if (WARN_ON(hp->lsmid->slot < 0 || hp->lsmid->slot >= lsm_slot))
>   			continue;
> -		rc = hp->hook.secctx_to_secid(secdata, seclen,
> -					      &blob->secid[hp->lsmid->slot]);
> -		if (rc != 0)
> -			return rc;
> +		if (display == LSMBLOB_INVALID || display == hp->lsmid->slot)
> +			return hp->hook.secctx_to_secid(secdata, seclen,
> +						&blob->secid[hp->lsmid->slot]);
>   	}
>   	return 0;
>   }
> @@ -1977,7 +2087,15 @@ EXPORT_SYMBOL(security_secctx_to_secid);
>   
>   void security_release_secctx(char *secdata, u32 seclen)
>   {
> -	call_void_hook(release_secctx, secdata, seclen);
> +	struct security_hook_list *hp;
> +	int *display = current->security;
> +
> +	hlist_for_each_entry(hp, &security_hook_heads.release_secctx, list)
> +		if (display == NULL || *display == LSMBLOB_INVALID ||
> +		    *display == hp->lsmid->slot) {
> +			hp->hook.release_secctx(secdata, seclen);
> +			return;
> +		}
>   }
>   EXPORT_SYMBOL(security_release_secctx);
>   
> @@ -2102,8 +2220,15 @@ EXPORT_SYMBOL(security_sock_rcv_skb);
>   int security_socket_getpeersec_stream(struct socket *sock, char __user *optval,
>   				      int __user *optlen, unsigned len)
>   {
> -	return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock,
> -				optval, optlen, len);
> +	int display = lsm_task_display(current);
> +	struct security_hook_list *hp;
> +
> +	hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream,
> +			     list)
> +		if (display == LSMBLOB_INVALID || display == hp->lsmid->slot)
> +			return hp->hook.socket_getpeersec_stream(sock, optval,
> +								 optlen, len);
> +	return -ENOPROTOOPT;
>   }
>   
>   int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb,
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 7592b95b43c4..c9e377d13f0e 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -6170,6 +6170,17 @@ static int selinux_setprocattr(const char *name, void *value, size_t size)
>   	/*
>   	 * Basic control over ability to set these attributes at all.
>   	 */
> +
> +	/*
> +	 * For setting display, we only perform a permission check;
> +	 * the actual update to the display value is handled by the
> +	 * LSM framework.
> +	 */
> +	if (!strcmp(name, "display"))
> +		return avc_has_perm(&selinux_state,
> +				    mysid, mysid, SECCLASS_PROCESS2,
> +				    PROCESS2__SETDISPLAY, NULL);
> +
>   	if (!strcmp(name, "exec"))
>   		error = avc_has_perm(&selinux_state,
>   				     mysid, mysid, SECCLASS_PROCESS,
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index bd5fe0d3204a..eda6f6a7a666 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -50,7 +50,7 @@ struct security_class_mapping secclass_map[] = {
>   	    "execmem", "execstack", "execheap", "setkeycreate",
>   	    "setsockcreate", "getrlimit", NULL } },
>   	{ "process2",
> -	  { "nnp_transition", "nosuid_transition", NULL } },
> +	  { "nnp_transition", "nosuid_transition", "setdisplay", NULL } },
>   	{ "system",
>   	  { "ipc_info", "syslog_read", "syslog_mod",
>   	    "syslog_console", "module_request", "module_load", NULL } },
> diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
> index 61e05fe86013..1a3041463c46 100644
> --- a/security/smack/smack_lsm.c
> +++ b/security/smack/smack_lsm.c
> @@ -3516,6 +3516,13 @@ static int smack_setprocattr(const char *name, void *value, size_t size)
>   	struct smack_known_list_elem *sklep;
>   	int rc;
>   
> +	/*
> +	 * Allow the /proc/.../attr/current and SO_PEERSEC "display"
> +	 * to be reset at will.
> +	 */
> +	if (strcmp(name, "display") == 0)
> +		return 0;
> +
>   	if (!smack_privileged(CAP_MAC_ADMIN) && list_empty(&tsp->smk_relabel))
>   		return -EPERM;
>   
> 


^ permalink raw reply

* Re: [PATCH v2 3/4] KEYS: Added BUILTIN_TRUSTED_KEYS enum to measure keys added to builtin_trusted_keys keyring
From: Lakshmi Ramasubramanian @ 2019-10-28 15:12 UTC (permalink / raw)
  To: Mimi Zohar, dhowells, casey, sashal, jamorris,
	linux-security-module, linux-integrity, linux-kernel, keyrings
In-Reply-To: <1572186810.4532.206.camel@linux.ibm.com>

On 10/27/19 7:33 AM, Mimi Zohar wrote:

> .builtin_trusted_keys is a trusted keyring, which is created by the
> kernel.  It cannot be deleted or replaced by userspace, so it should
> be possible to correlate a keyring name with a keyring number on
> policy load.

Yes - at policy load we can map a keyring name to a keyring number.

But at runtime we still need to know if the keyring parameter passed to 
the IMA hook function is configured to be measured.

void ima_post_key_create_or_update(struct key *keyring, struct key *key,
				   unsigned long flags, bool create);
{
    => Get the keyring number for the given "keyring".
    => Check if the keyring number is in the configured IMA policy.
    => If yes, measure the key.
    => Else, do nothing.
}

Did I misunderstand what you had stated?

>> diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
>> index fc376a323908..25566c74e679 100644
>> --- a/Documentation/ABI/testing/ima_policy
>> +++ b/Documentation/ABI/testing/ima_policy
>> @@ -29,6 +29,7 @@ Description:
>>   				[FIRMWARE_CHECK]
>>   				[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
>>   				[KEXEC_CMDLINE]
>> +				[BUILTIN_TRUSTED_KEYS]
> 
> The .builtin_trusted_keys is the name of a keyring, not of an IMA
> hook.  Define a new IMA policy "keyring=" option, where keyring is
> optional.  Some IMA policy rules might look like:
> 
> # measure all keys
> measure func=KEYRING_CHECK
> 
> # measure keys on the IMA keyring
> measure func=KEYRING_CHECK keyring=".ima"
> 
> # measure keys on the BUILTIN and IMA keyrings into a different PCR
> measure func=KEYRING_CHECK keyring=".builtin_trusted_keys|.ima" pcr=11

I agree - I'll take a look at this.

> "func", in this case, should be something like "KEYRING_CHECK".  No
> mapping is necessary.
Agree.

> 
>>   	if (!ima_initialized) {
>> -		ima_queue_key_for_measurement(key, NONE);
>> +		ima_queue_key_for_measurement(key, func);
>>   		return;
>>   	}
>>   
>>   	pk = key->payload.data[asym_crypto];
>>   	process_buffer_measurement(pk->key, pk->keylen,
>>   				   key->description,
>> -				   NONE, 0);
>> +				   func, 0);
> 
> Pass the "keyring" to process_buffer_measurement() and on to
> ima_get_action(), so that ima_get_action() determines whether the
> keyring is in policy.

Agree.

thanks,
  -lakshmi



^ permalink raw reply

* Re: [PATCH v2 1/4] KEYS: Defined an ima hook for measuring keys on key create or update
From: Lakshmi Ramasubramanian @ 2019-10-28 14:58 UTC (permalink / raw)
  To: Mimi Zohar, dhowells, casey, sashal, jamorris,
	linux-security-module, linux-integrity, linux-kernel, keyrings
In-Reply-To: <1572187644.4532.211.camel@linux.ibm.com>

On 10/27/19 7:47 AM, Mimi Zohar wrote:

>>> There's no reason to define a new variable to determine if IMA is
>>> initialized.  Use ima_policy_flag.
>>
>> Please correct me if I am wrong -
>>
>> ima_policy_flag will be set to 0 if IMA is not yet initialized
>> OR
>> IMA is initialized, but ima_policy_flag could be still set to 0 (say,
>> due to the configured policy).
>>
>> In the latter case the measurement request should be a NOP immediately.
> 
> I'm not sure.  The builtin keys most likely will be loaded prior to a
> custom IMA policy containing "keyring" rules are defined.
> 
> Mimi

I am not sure if I described it clearly - let me clarify:

Say, we use ima_policy_flag to determine whether to
measure the key immediately or
queue the key for measurement and, measure when IMA is initialized.

We can incorrectly keep queuing keys in the case when IMA is 
initialized, but due to the way IMA policy is configured ima_policy_flag 
is still 0.

That's why I feel a separate boolean flag would be needed to know 
whether IMA is initialized or not.

If IMA is initialized, ima_policy_flag will dictate whether to measure 
the key or not.

thanks,
  -lakshmi

^ permalink raw reply

* Re: [RFC PATCH 11/10] pipe: Add fsync() support [ver #2]
From: Konstantin Khlebnikov @ 2019-10-27 16:04 UTC (permalink / raw)
  To: David Howells, torvalds
  Cc: Rasmus Villemoes, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	linux-kernel
In-Reply-To: <30394.1571936252@warthog.procyon.org.uk>

On 24/10/2019 19.57, David Howells wrote:
> pipe: Add fsync() support
> 
> The keyrings testsuite needs the ability to wait for all the outstanding
> notifications in the queue to have been processed so that it can then go
> through them to find out whether the notifications it expected have been
> emitted.

Similar synchronization is required for reusing memory after vmsplice()?
I don't see other way how sender could safely change these pages.

> 
> Implement fsync() support for pipes to provide this.  The tailmost buffer
> at the point of calling is marked and fsync adds itself to the list of
> waiters, noting the tail position to be waited for and marking the buffer
> as no longer mergeable.  Then when the buffer is consumed, if the flag is
> set, any matching waiters are woken up.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>   fs/fuse/dev.c             |    1
>   fs/pipe.c                 |   61 ++++++++++++++++++++++++++++++++++++++++++++++
>   fs/splice.c               |    3 ++
>   include/linux/pipe_fs_i.h |   22 ++++++++++++++++
>   lib/iov_iter.c            |    2 -
>   5 files changed, 88 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 5ef57a322cb8..9617a35579cb 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -1983,6 +1983,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>   		if (rem >= ibuf->len) {
>   			*obuf = *ibuf;
>   			ibuf->ops = NULL;
> +			pipe_wake_fsync(pipe, ibuf, tail);
>   			tail++;
>   			pipe_commit_read(pipe, tail);
>   		} else {
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 6a982a88f658..8e5fd7314be1 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -30,6 +30,12 @@
>   
>   #include "internal.h"
>   
> +struct pipe_fsync {
> +	struct list_head	link;		/* Link in pipe->fsync */
> +	struct completion	done;
> +	unsigned int		tail;		/* The buffer being waited for */
> +};
> +
>   /*
>    * The max size that a non-root user is allowed to grow the pipe. Can
>    * be set by root in /proc/sys/fs/pipe-max-size
> @@ -269,6 +275,58 @@ static bool pipe_buf_can_merge(struct pipe_buffer *buf)
>   	return buf->ops == &anon_pipe_buf_ops;
>   }
>   
> +/*
> + * Wait for all the data currently in the pipe to be consumed.
> + */
> +static int pipe_fsync(struct file *file, loff_t a, loff_t b, int datasync)
> +{
> +	struct pipe_inode_info *pipe = file->private_data;
> +	struct pipe_buffer *buf;
> +	struct pipe_fsync fsync;
> +	unsigned int head, tail, mask;
> +
> +	pipe_lock(pipe);
> +
> +	head = pipe->head;
> +	tail = pipe->tail;
> +	mask = pipe->ring_size - 1;
> +
> +	if (pipe_empty(head, tail)) {
> +		pipe_unlock(pipe);
> +		return 0;
> +	}
> +
> +	init_completion(&fsync.done);
> +	fsync.tail = tail;
> +	buf = &pipe->bufs[tail & mask];
> +	buf->flags |= PIPE_BUF_FLAG_FSYNC;
> +	pipe_buf_mark_unmergeable(buf);
> +	list_add_tail(&fsync.link, &pipe->fsync);
> +	pipe_unlock(pipe);
> +
> +	if (wait_for_completion_interruptible(&fsync.done) < 0) {
> +		pipe_lock(pipe);
> +		list_del(&fsync.link);
> +		pipe_unlock(pipe);
> +		return -EINTR;
> +	}
> +
> +	return 0;
> +}
> +
> +void __pipe_wake_fsync(struct pipe_inode_info *pipe, unsigned int tail)
> +{
> +	struct pipe_fsync *fsync, *p;
> +
> +	list_for_each_entry_safe(fsync, p, &pipe->fsync, link) {
> +		if (fsync->tail == tail) {
> +			list_del_init(&fsync->link);
> +			complete(&fsync->done);
> +		}
> +	}
> +}
> +EXPORT_SYMBOL(__pipe_wake_fsync);
> +
>   static ssize_t
>   pipe_read(struct kiocb *iocb, struct iov_iter *to)
>   {
> @@ -325,6 +383,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>   			if (!buf->len) {
>   				pipe_buf_release(pipe, buf);
>   				spin_lock_irq(&pipe->wait.lock);
> +				pipe_wake_fsync(pipe, buf, tail);
>   				tail++;
>   				pipe_commit_read(pipe, tail);
>   				do_wakeup = 1;
> @@ -717,6 +776,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
>   		pipe->ring_size = pipe_bufs;
>   		pipe->user = user;
>   		mutex_init(&pipe->mutex);
> +		INIT_LIST_HEAD(&pipe->fsync);
>   		return pipe;
>   	}
>   
> @@ -1060,6 +1120,7 @@ const struct file_operations pipefifo_fops = {
>   	.llseek		= no_llseek,
>   	.read_iter	= pipe_read,
>   	.write_iter	= pipe_write,
> +	.fsync		= pipe_fsync,
>   	.poll		= pipe_poll,
>   	.unlocked_ioctl	= pipe_ioctl,
>   	.release	= pipe_release,
> diff --git a/fs/splice.c b/fs/splice.c
> index 3f72bc31b6ec..e106367e1be6 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -523,6 +523,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
>   
>   		if (!buf->len) {
>   			pipe_buf_release(pipe, buf);
> +			pipe_wake_fsync(pipe, buf, tail);
>   			tail++;
>   			pipe_commit_read(pipe, tail);
>   			if (pipe->files)
> @@ -771,6 +772,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
>   				ret -= buf->len;
>   				buf->len = 0;
>   				pipe_buf_release(pipe, buf);
> +				pipe_wake_fsync(pipe, buf, tail);
>   				tail++;
>   				pipe_commit_read(pipe, tail);
>   				if (pipe->files)
> @@ -1613,6 +1615,7 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
>   			 */
>   			*obuf = *ibuf;
>   			ibuf->ops = NULL;
> +			pipe_wake_fsync(ipipe, ibuf, i_tail);
>   			i_tail++;
>   			pipe_commit_read(ipipe, i_tail);
>   			input_wakeup = true;
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index 90055ff16550..1a3027089558 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -8,6 +8,7 @@
>   #define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
>   #define PIPE_BUF_FLAG_GIFT	0x04	/* page is a gift */
>   #define PIPE_BUF_FLAG_PACKET	0x08	/* read() as a packet */
> +#define PIPE_BUF_FLAG_FSYNC	0x10	/* fsync() is waiting for this buffer to die */
>   
>   /**
>    *	struct pipe_buffer - a linux kernel pipe buffer
> @@ -43,6 +44,7 @@ struct pipe_buffer {
>    *	@w_counter: writer counter
>    *	@fasync_readers: reader side fasync
>    *	@fasync_writers: writer side fasync
> + *	@fsync: Waiting fsyncs
>    *	@bufs: the circular array of pipe buffers
>    *	@user: the user who created this pipe
>    **/
> @@ -62,6 +64,7 @@ struct pipe_inode_info {
>   	struct page *tmp_page;
>   	struct fasync_struct *fasync_readers;
>   	struct fasync_struct *fasync_writers;
> +	struct list_head fsync;
>   	struct pipe_buffer *bufs;
>   	struct user_struct *user;
>   };
> @@ -268,6 +271,25 @@ extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
>   long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
>   struct pipe_inode_info *get_pipe_info(struct file *file);
>   
> +void __pipe_wake_fsync(struct pipe_inode_info *pipe, unsigned int tail);
> +
> +/**
> + * pipe_wake_fsync - Wake up anyone waiting with fsync for this point
> + * @pipe: The pipe that owns the buffer
> + * @buf: The pipe buffer in question
> + * @tail: The index in the ring of the buffer
> + *
> + * Check to see if anyone is waiting for the pipe ring to clear up to and
> + * including this buffer, and, if they are, wake them up.
> + */
> +static inline void pipe_wake_fsync(struct pipe_inode_info *pipe,
> +				   struct pipe_buffer *buf,
> +				   unsigned int tail)
> +{
> +	if (unlikely(buf->flags & PIPE_BUF_FLAG_FSYNC))
> +		__pipe_wake_fsync(pipe, tail);
> +}
> +
>   int create_pipe_files(struct file **, int);
>   unsigned int round_pipe_size(unsigned long size);
>   
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index e22f4e283f6d..38d52524cd21 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -404,7 +404,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
>   	buf->offset = offset;
>   	buf->len = bytes;
>   
> -	pipe_commit_read(pipe, i_head);
> +	pipe_commit_write(pipe, i_head);
>   	i->iov_offset = offset + bytes;
>   	i->head = i_head;
>   out:
> 
> 


^ permalink raw reply

* Re: [RFC PATCH 07/10] pipe: Conditionalise wakeup in pipe_read() [ver #2]
From: Konstantin Khlebnikov @ 2019-10-27 15:57 UTC (permalink / raw)
  To: David Howells, torvalds
  Cc: Rasmus Villemoes, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	linux-kernel
In-Reply-To: <157186189069.3995.10292601951655075484.stgit@warthog.procyon.org.uk>

On 23/10/2019 23.18, David Howells wrote:
> Only do a wakeup in pipe_read() if we made space in a completely full
> buffer.  The producer shouldn't be waiting on pipe->wait otherwise.

We could go further and wakeup writer only when at least half of buffer is empty.
This gives better batching and reduces rate of context switches.

https://lore.kernel.org/lkml/157219118016.7078.16223055699799396042.stgit@buzz/T/#u

> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>   fs/pipe.c |   15 ++++++---------
>   1 file changed, 6 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 1274305772fb..e3a8f10750c9 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -327,11 +327,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>   				spin_lock_irq(&pipe->wait.lock);
>   				tail++;
>   				pipe_commit_read(pipe, tail);
> -				do_wakeup = 0;
> -				wake_up_interruptible_sync_poll_locked(
> -					&pipe->wait, EPOLLOUT | EPOLLWRNORM);
> +				do_wakeup = 1;
> +				if (head - (tail - 1) == pipe->max_usage)
> +					wake_up_interruptible_sync_poll_locked(
> +						&pipe->wait, EPOLLOUT | EPOLLWRNORM);
>   				spin_unlock_irq(&pipe->wait.lock);
> -				kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
> +				if (head - (tail - 1) == pipe->max_usage)
> +					kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
>   			}
>   			total_len -= chars;
>   			if (!total_len)
> @@ -360,11 +362,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>   				ret = -ERESTARTSYS;
>   			break;
>   		}
> -		if (do_wakeup) {
> -			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
> - 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
> -			do_wakeup = 0;
> -		}
>   		pipe_wait(pipe);
>   	}
>   	__pipe_unlock(pipe);
> 
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox