Linux Security Modules development
 help / color / mirror / Atom feed
* [PATCH bpf-next v1 04/13] bpf: lsm: Allow btf_id based attachment for LSM hooks
From: KP Singh @ 2019-12-20 15:41 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, James Morris, Kees Cook,
	Thomas Garnier, Michael Halcrow, Paul Turner, Brendan Gregg,
	Jann Horn, Matthew Garrett, Christian Brauner,
	Mickaël Salaün, Florent Revest, Brendan Jackman,
	Martin KaFai Lau, Song Liu, Yonghong Song, Serge E. Hallyn,
	Mauro Carvalho Chehab, David S. Miller, Greg Kroah-Hartman,
	Nicolas Ferre, Stanislav Fomichev, Quentin Monnet, Andrey Ignatov,
	Joe Stringer
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

Refactor and re-use most of the logic for BPF_PROG_TYPE_TRACING with a few
changes.

- The LSM hook BTF types are prefixed with "lsm_btf_"
- These types do not need the first (void *) pointer argument. The verifier
  only looks for this argument if prod->aux->attach_btf_trace is set.

Signed-off-by: KP Singh <kpsingh@google.com>
---
 kernel/bpf/syscall.c  |  1 +
 kernel/bpf/verifier.c | 83 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5a773fc6f9f5..4fcaf6042c07 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1642,6 +1642,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 {
 	switch (prog_type) {
 	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_LSM:
 		if (btf_id > BTF_MAX_TYPE)
 			return -EINVAL;
 		break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a0482e1c4a77..0d1231d9c1ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9504,7 +9504,71 @@ static void print_verification_stats(struct bpf_verifier_env *env)
 		env->peak_states, env->longest_mark_read_walk);
 }
 
-static int check_attach_btf_id(struct bpf_verifier_env *env)
+/*
+ * LSM hooks have a typedef associated with them. The BTF information for this
+ * type is used by the verifier to validate memory accesses made by the
+ * attached information.
+ *
+ * For example the:
+ *
+ *	int bprm_check_security(struct linux_binprm *brpm)
+ *
+ * has the following typedef:
+ *
+ *	typedef int (*lsm_btf_bprm_check_security)(struct linux_binprm *bprm);
+ */
+#define BTF_LSM_PREFIX "lsm_btf_"
+
+static inline int check_attach_btf_id_lsm(struct bpf_verifier_env *env)
+{
+	struct bpf_prog *prog = env->prog;
+	u32 btf_id = prog->aux->attach_btf_id;
+	const struct btf_type *t;
+	const char *tname;
+
+	if (!btf_id) {
+		verbose(env, "LSM programs must provide btf_id\n");
+		return -EINVAL;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, btf_id);
+	if (!t) {
+		verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+		return -EINVAL;
+	}
+
+	tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+	if (!tname) {
+		verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
+		return -EINVAL;
+	}
+
+	if (!btf_type_is_typedef(t)) {
+		verbose(env, "attach_btf_id %u is not a typedef\n", btf_id);
+		return -EINVAL;
+	}
+	if (strncmp(BTF_LSM_PREFIX, tname, sizeof(BTF_LSM_PREFIX) - 1)) {
+		verbose(env, "attach_btf_id %u points to wrong type name %s\n",
+			btf_id, tname);
+		return -EINVAL;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	/* should never happen in valid vmlinux build */
+	if (!btf_type_is_ptr(t))
+		return -EINVAL;
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	/* should never happen in valid vmlinux build */
+	if (!btf_type_is_func_proto(t))
+		return -EINVAL;
+
+	tname += sizeof(BTF_LSM_PREFIX) - 1;
+	prog->aux->attach_func_name = tname;
+	prog->aux->attach_func_proto = t;
+	return 0;
+}
+
+static int check_attach_btf_id_tracing(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
 	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
@@ -9519,9 +9583,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	long addr;
 	u64 key;
 
-	if (prog->type != BPF_PROG_TYPE_TRACING)
-		return 0;
-
 	if (!btf_id) {
 		verbose(env, "Tracing programs must provide btf_id\n");
 		return -EINVAL;
@@ -9659,6 +9720,20 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 }
 
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+	struct bpf_prog *prog = env->prog;
+
+	switch (prog->type) {
+	case BPF_PROG_TYPE_TRACING:
+		return check_attach_btf_id_tracing(env);
+	case BPF_PROG_TYPE_LSM:
+		return check_attach_btf_id_lsm(env);
+	default:
+		return 0;
+	}
+}
+
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	      union bpf_attr __user *uattr)
 {
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next v1 02/13] bpf: lsm: Add a skeleton and config options
From: KP Singh @ 2019-12-20 15:41 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, James Morris, Kees Cook,
	Thomas Garnier, Michael Halcrow, Paul Turner, Brendan Gregg,
	Jann Horn, Matthew Garrett, Christian Brauner,
	Mickaël Salaün, Florent Revest, Brendan Jackman,
	Martin KaFai Lau, Song Liu, Yonghong Song, Serge E. Hallyn,
	Mauro Carvalho Chehab, David S. Miller, Greg Kroah-Hartman,
	Nicolas Ferre, Stanislav Fomichev, Quentin Monnet, Andrey Ignatov,
	Joe Stringer
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

The LSM can be enabled by CONFIG_SECURITY_BPF.
Without CONFIG_SECURITY_BPF_ENFORCE, the LSM will run the
attached eBPF programs but not enforce MAC policy based
on the return value of the attached programs.

Signed-off-by: KP Singh <kpsingh@google.com>
---
 MAINTAINERS           |  7 +++++++
 security/Kconfig      | 11 ++++++-----
 security/Makefile     |  2 ++
 security/bpf/Kconfig  | 25 +++++++++++++++++++++++++
 security/bpf/Makefile |  5 +++++
 security/bpf/lsm.c    | 28 ++++++++++++++++++++++++++++
 6 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 security/bpf/Kconfig
 create mode 100644 security/bpf/Makefile
 create mode 100644 security/bpf/lsm.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 8f075b866aaf..3b82d8ff21fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3175,6 +3175,13 @@ S:	Supported
 F:	arch/x86/net/
 X:	arch/x86/net/bpf_jit_comp32.c
 
+BPF SECURITY MODULE
+M:	KP Singh <kpsingh@chromium.org>
+L:	linux-security-module@vger.kernel.org
+L:	bpf@vger.kernel.org
+S:	Maintained
+F:	security/bpf/
+
 BROADCOM B44 10/100 ETHERNET DRIVER
 M:	Michael Chan <michael.chan@broadcom.com>
 L:	netdev@vger.kernel.org
diff --git a/security/Kconfig b/security/Kconfig
index 2a1a2d396228..6f1aab195e7d 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -236,6 +236,7 @@ source "security/tomoyo/Kconfig"
 source "security/apparmor/Kconfig"
 source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
+source "security/bpf/Kconfig"
 source "security/safesetid/Kconfig"
 source "security/lockdown/Kconfig"
 
@@ -277,11 +278,11 @@ endchoice
 
 config LSM
 	string "Ordered list of enabled LSMs"
-	default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK
-	default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR
-	default "lockdown,yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO
-	default "lockdown,yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC
-	default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
+	default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK
+	default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR
+	default "lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO
+	default "lockdown,yama,loadpin,safesetid,integrity,bpf" if DEFAULT_SECURITY_DAC
+	default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf"
 	help
 	  A comma-separated list of LSMs, in initialization order.
 	  Any LSMs left off this list will be ignored. This can be
diff --git a/security/Makefile b/security/Makefile
index be1dd9d2cb2f..50e6821dd7b7 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -12,6 +12,7 @@ subdir-$(CONFIG_SECURITY_YAMA)		+= yama
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
 subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
 subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown
+subdir-$(CONFIG_SECURITY_BPF)		+= bpf
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -29,6 +30,7 @@ obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
 obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
+obj-$(CONFIG_SECURITY_BPF)		+= bpf/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
 
 # Object integrity file lists
diff --git a/security/bpf/Kconfig b/security/bpf/Kconfig
new file mode 100644
index 000000000000..1aaa1340e795
--- /dev/null
+++ b/security/bpf/Kconfig
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+config SECURITY_BPF
+	bool "BPF-based MAC and audit policy"
+	depends on SECURITY
+	depends on SECURITYFS
+	depends on BPF
+	depends on BPF_SYSCALL
+	help
+	  This enables instrumentation of the security hooks with
+	  eBPF programs. The LSM creates per-hook files in securityfs to which
+	  eBPF programs can be attached.
+
+	  If you are unsure how to answer this question, answer N.
+
+config SECURITY_BPF_ENFORCE
+	bool "Deny operations based on the evaluation of the attached programs"
+	depends on SECURITY_BPF
+	help
+	  eBPF programs attached to hooks can be used for both auditing and
+	  enforcement. Enabling enforcement implies that the evaluation result
+	  from the attached eBPF programs will allow or deny the operation
+	  guarded by the security hook.
diff --git a/security/bpf/Makefile b/security/bpf/Makefile
new file mode 100644
index 000000000000..26a0ab6f99b7
--- /dev/null
+++ b/security/bpf/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+obj-$(CONFIG_SECURITY_BPF) := lsm.o
diff --git a/security/bpf/lsm.c b/security/bpf/lsm.c
new file mode 100644
index 000000000000..fe5c65bbdd45
--- /dev/null
+++ b/security/bpf/lsm.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019 Google LLC.
+ */
+
+#include <linux/lsm_hooks.h>
+
+static int process_execution(struct linux_binprm *bprm)
+{
+	return 0;
+}
+
+static struct security_hook_list lsm_hooks[] __lsm_ro_after_init = {
+	LSM_HOOK_INIT(bprm_check_security, process_execution),
+};
+
+static int __init lsm_init(void)
+{
+	security_add_hooks(lsm_hooks, ARRAY_SIZE(lsm_hooks), "bpf");
+	pr_info("eBPF and LSM are friends now.\n");
+	return 0;
+}
+
+DEFINE_LSM(bpf) = {
+	.name = "bpf",
+	.init = lsm_init,
+};
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next v1 01/13] bpf: Refactor BPF_EVENT context macros to its own header.
From: KP Singh @ 2019-12-20 15:41 UTC (permalink / raw)
  To: linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, James Morris, Kees Cook,
	Thomas Garnier, Michael Halcrow, Paul Turner, Brendan Gregg,
	Jann Horn, Matthew Garrett, Christian Brauner,
	Mickaël Salaün, Florent Revest, Brendan Jackman,
	Martin KaFai Lau, Song Liu, Yonghong Song, Serge E. Hallyn,
	Mauro Carvalho Chehab, David S. Miller, Greg Kroah-Hartman,
	Nicolas Ferre, Stanislav Fomichev, Quentin Monnet, Andrey Ignatov,
	Joe Stringer
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>

From: KP Singh <kpsingh@google.com>

These macros are useful for other program types than tracing.
i.e. KRSI (an upccoming BPF based LSM) which does not use
BPF_PROG_TYPE_TRACE but uses verifiable BTF accesses similar
to raw tracepoints.

Signed-off-by: KP Singh <kpsingh@google.com>
---
 include/linux/bpf_event.h | 78 +++++++++++++++++++++++++++++++++++++++
 include/trace/bpf_probe.h | 30 +--------------
 kernel/trace/bpf_trace.c  | 24 +-----------
 3 files changed, 81 insertions(+), 51 deletions(-)
 create mode 100644 include/linux/bpf_event.h

diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
new file mode 100644
index 000000000000..353eb1f5a3d0
--- /dev/null
+++ b/include/linux/bpf_event.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+
+/*
+ * Copyright (c) 2018 Facebook
+ * Copyright 2019 Google LLC.
+ */
+
+#ifndef _LINUX_BPF_EVENT_H
+#define _LINUX_BPF_EVENT_H
+
+#ifdef CONFIG_BPF_EVENTS
+
+/* cast any integer, pointer, or small struct to u64 */
+#define UINTTYPE(size) \
+	__typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
+		   __builtin_choose_expr(size == 2, (u16)2, \
+		   __builtin_choose_expr(size == 4, (u32)3, \
+		   __builtin_choose_expr(size == 8, (u64)4, \
+					 (void)5)))))
+#define __CAST_TO_U64(x) ({ \
+	typeof(x) __src = (x); \
+	UINTTYPE(sizeof(x)) __dst; \
+	memcpy(&__dst, &__src, sizeof(__dst)); \
+	(u64)__dst; })
+
+#define __CAST0(...) 0
+#define __CAST1(a, ...) __CAST_TO_U64(a)
+#define __CAST2(a, ...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
+#define __CAST3(a, ...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
+#define __CAST4(a, ...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
+#define __CAST5(a, ...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
+#define __CAST6(a, ...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
+#define __CAST7(a, ...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
+#define __CAST8(a, ...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
+#define __CAST9(a, ...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
+#define __CAST10(a ,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
+#define __CAST11(a, ...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
+#define __CAST12(a, ...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
+/* tracepoints with more than 12 arguments will hit build error */
+#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#define UINTTYPE(size) \
+	__typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
+		   __builtin_choose_expr(size == 2, (u16)2, \
+		   __builtin_choose_expr(size == 4, (u32)3, \
+		   __builtin_choose_expr(size == 8, (u64)4, \
+					 (void)5)))))
+
+#define UNPACK(...)			__VA_ARGS__
+#define REPEAT_1(FN, DL, X, ...)	FN(X)
+#define REPEAT_2(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
+#define REPEAT_3(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
+#define REPEAT_4(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
+#define REPEAT_5(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
+#define REPEAT_6(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
+#define REPEAT_7(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
+#define REPEAT_8(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
+#define REPEAT_9(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
+#define REPEAT_10(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
+#define REPEAT_11(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
+#define REPEAT_12(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
+#define REPEAT(X, FN, DL, ...)		REPEAT_##X(FN, DL, __VA_ARGS__)
+
+#define SARG(X)		u64 arg##X
+#ifdef COPY
+#undef COPY
+#endif
+
+#define COPY(X)		args[X] = arg##X
+#define __DL_COM	(,)
+#define __DL_SEM	(;)
+
+#define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+
+#endif
+#endif /* _LINUX_BPF_EVENT_H */
+
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index b04c29270973..5165dbc66098 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/bpf_event.h>
+
 #undef TRACE_SYSTEM_VAR
 
 #ifdef CONFIG_BPF_EVENTS
@@ -27,34 +29,6 @@
 #undef __perf_task
 #define __perf_task(t)	(t)
 
-/* cast any integer, pointer, or small struct to u64 */
-#define UINTTYPE(size) \
-	__typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
-		   __builtin_choose_expr(size == 2, (u16)2, \
-		   __builtin_choose_expr(size == 4, (u32)3, \
-		   __builtin_choose_expr(size == 8, (u64)4, \
-					 (void)5)))))
-#define __CAST_TO_U64(x) ({ \
-	typeof(x) __src = (x); \
-	UINTTYPE(sizeof(x)) __dst; \
-	memcpy(&__dst, &__src, sizeof(__dst)); \
-	(u64)__dst; })
-
-#define __CAST1(a,...) __CAST_TO_U64(a)
-#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
-#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
-#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
-#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
-#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
-#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
-#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
-#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
-#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
-#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
-#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
-/* tracepoints with more than 12 arguments will hit build error */
-#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
-
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ffc91d4935ac..3fb02fe799ab 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/bpf.h>
 #include <linux/bpf_perf_event.h>
+#include <linux/bpf_event.h>
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
@@ -1461,29 +1462,6 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 	rcu_read_unlock();
 }
 
-#define UNPACK(...)			__VA_ARGS__
-#define REPEAT_1(FN, DL, X, ...)	FN(X)
-#define REPEAT_2(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
-#define REPEAT_3(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
-#define REPEAT_4(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
-#define REPEAT_5(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
-#define REPEAT_6(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
-#define REPEAT_7(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
-#define REPEAT_8(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
-#define REPEAT_9(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
-#define REPEAT_10(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
-#define REPEAT_11(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
-#define REPEAT_12(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
-#define REPEAT(X, FN, DL, ...)		REPEAT_##X(FN, DL, __VA_ARGS__)
-
-#define SARG(X)		u64 arg##X
-#define COPY(X)		args[X] = arg##X
-
-#define __DL_COM	(,)
-#define __DL_SEM	(;)
-
-#define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
-
 #define BPF_TRACE_DEFN_x(x)						\
 	void bpf_trace_run##x(struct bpf_prog *prog,			\
 			      REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
-- 
2.20.1


^ permalink raw reply related

* [PATCH] ima: add the ability to query ima for the hash of a given file.
From: Florent Revest @ 2019-12-20 16:31 UTC (permalink / raw)
  To: linux-integrity
  Cc: kpsingh, mjg59, zohar, linux-kernel, linux-security-module,
	Florent Revest

From: Florent Revest <revest@google.com>

This allows other parts of the kernel (perhaps a stacked LSM allowing
system monitoring, eg. the proposed KRSI LSM [1]) to retrieve the hash
of a given file from IMA if it's present in the iint cache.

It's true that the existence of the hash means that it's also in the
audit logs or in /sys/kernel/security/ima/ascii_runtime_measurements,
but it can be difficult to pull that information out for every
subsequent exec.  This is especially true if a given host has been up
for a long time and the file was first measured a long time ago.

This is based on Peter Moody's patch:
 https://sourceforge.net/p/linux-ima/mailman/message/33036180/

[1] https://lkml.org/lkml/2019/9/10/393

Signed-off-by: Florent Revest <revest@google.com>
---
 include/linux/ima.h               |  6 +++++
 security/integrity/ima/ima_main.c | 41 +++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 6d904754d858..d621c65ba9a5 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -23,6 +23,7 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
 extern void ima_post_path_mknod(struct dentry *dentry);
+extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(const void *buf, int size);
 
 #ifdef CONFIG_IMA_KEXEC
@@ -91,6 +92,11 @@ static inline void ima_post_path_mknod(struct dentry *dentry)
 	return;
 }
 
+static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void ima_kexec_cmdline(const void *buf, int size) {}
 #endif /* CONFIG_IMA */
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index d7e987baf127..f054ddf4364e 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -445,6 +445,47 @@ int ima_file_check(struct file *file, int mask)
 }
 EXPORT_SYMBOL_GPL(ima_file_check);
 
+/**
+ * ima_file_hash - return the stored measurement if a file has been hashed.
+ * @file: pointer to the file
+ * @buf: buffer in which to store the hash
+ * @buf_size: length of the buffer
+ *
+ * On success, output the hash into buf and return the hash algorithm (as
+ * defined in the enum hash_algo).
+ * If the hash is larger than buf, then only size bytes will be copied. It
+ * generally just makes sense to pass a buffer capable of holding the largest
+ * possible hash: IMA_MAX_DIGEST_SIZE
+ *
+ * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP.
+ * If the parameters are incorrect, return -EINVAL.
+ */
+int ima_file_hash(struct file *file, char *buf, size_t buf_size)
+{
+	struct inode *inode;
+	struct integrity_iint_cache *iint;
+	size_t copied_size;
+
+	if (!file || !buf)
+		return -EINVAL;
+
+	if (!ima_policy_flag)
+		return -EOPNOTSUPP;
+
+	inode = file_inode(file);
+	iint = integrity_iint_find(inode);
+	if (!iint)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&iint->mutex);
+	copied_size = min_t(size_t, iint->ima_hash->length, buf_size);
+	memcpy(buf, iint->ima_hash->digest, copied_size);
+	mutex_unlock(&iint->mutex);
+
+	return iint->ima_hash->algo;
+}
+EXPORT_SYMBOL_GPL(ima_file_hash);
+
 /**
  * ima_post_create_tmpfile - mark newly created tmpfile as new
  * @file : newly created tmpfile
-- 
2.24.1.735.g03f4e72817-goog


^ permalink raw reply related

* Re: [PATCH] ima: add the ability to query ima for the hash of a given file.
From: Lakshmi Ramasubramanian @ 2019-12-20 16:48 UTC (permalink / raw)
  To: Florent Revest, linux-integrity
  Cc: kpsingh, mjg59, zohar, linux-kernel, linux-security-module,
	Florent Revest
In-Reply-To: <20191220163136.25010-1-revest@chromium.org>

On 12/20/2019 8:31 AM, Florent Revest wrote:

>   
> +/**
> + * ima_file_hash - return the stored measurement if a file has been hashed.
> + * @file: pointer to the file
> + * @buf: buffer in which to store the hash
> + * @buf_size: length of the buffer
> + *
> + * On success, output the hash into buf and return the hash algorithm (as
> + * defined in the enum hash_algo).

> + * If the hash is larger than buf, then only size bytes will be copied. It
> + * generally just makes sense to pass a buffer capable of holding the largest
> + * possible hash: IMA_MAX_DIGEST_SIZE

If the given buffer is smaller than the hash length, wouldn't it be 
better to return the required size and a status indicating the buffer is 
not enough. The caller can then call back with the required buffer.

If the hash is truncated the caller may not know if the hash is partial 
or not.

> + *
> + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP.
> + * If the parameters are incorrect, return -EINVAL.
> + */
> +int ima_file_hash(struct file *file, char *buf, size_t buf_size)
> +{
> +	struct inode *inode;
> +	struct integrity_iint_cache *iint;
> +	size_t copied_size;
> +
> +	if (!file || !buf)
> +		return -EINVAL;
> +
> +	if (!ima_policy_flag)
> +		return -EOPNOTSUPP;
> +
> +	inode = file_inode(file);
> +	iint = integrity_iint_find(inode);
> +	if (!iint)
> +		return -EOPNOTSUPP;
> +
> +	mutex_lock(&iint->mutex);
> +	copied_size = min_t(size_t, iint->ima_hash->length, buf_size);
> +	memcpy(buf, iint->ima_hash->digest, copied_size);
> +	mutex_unlock(&iint->mutex);
> +
> +	return iint->ima_hash->algo;

Should the hash algorithm be copied from iinit->ima_hash to a local 
variable while holding the mutex and that one returned?

I assume iinit->mutex  is taken to ensure iinit->ima_hash is not removed 
while this function is accessing it.

thanks,
  -lakshmi


^ permalink raw reply

* Re: [PATCH bpf-next v1 00/13] MAC and Audit policy using eBPF (KRSI)
From: Casey Schaufler @ 2019-12-20 17:17 UTC (permalink / raw)
  To: KP Singh, linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, James Morris, Kees Cook,
	Thomas Garnier, Michael Halcrow, Paul Turner, Brendan Gregg,
	Jann Horn, Matthew Garrett, Christian Brauner,
	Mickaël Salaün, Florent Revest, Brendan Jackman,
	Martin KaFai Lau, Song Liu, Yonghong Song, Serge E. Hallyn,
	Mauro Carvalho Chehab, David S. Miller, Greg Kroah-Hartman,
	Nicolas Ferre, Stanislav Fomichev, Quentin Monnet, Andrey Ignatov,
	Joe Stringer
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>

On 12/20/2019 7:41 AM, KP Singh wrote:
> From: KP Singh <kpsingh@google.com>
>
> This patch series is a continuation of the KRSI RFC
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/)
>
> # Motivation
>
> Google does rich analysis of runtime security data collected from
> internal Linux deployments (corporate devices and servers) to detect and
> thwart threats in real-time. Currently, this is done in custom kernel
> modules but we would like to replace this with something that's upstream
> and useful to others.
>
> The current kernel infrastructure for providing telemetry (Audit, Perf
> etc.) is disjoint from access enforcement (i.e. LSMs).  Augmenting the
> information provided by audit requires kernel changes to audit, its
> policy language and user-space components. Furthermore, building a MAC
> policy based on the newly added telemetry data requires changes to
> various LSMs and their respective policy languages.
>
> This patchset proposes a new stackable and privileged LSM which allows
> the LSM hooks to be implemented using eBPF. This facilitates a unified
> and dynamic (not requiring re-compilation of the kernel) audit and MAC
> policy.
>
> # Why an LSM?
>
> Linux Security Modules target security behaviours rather than the
> kernel's API. For example, it's easy to miss out a newly added system
> call for executing processes (eg. execve, execveat etc.) but the LSM
> framework ensures that all process executions trigger the relevant hooks
> irrespective of how the process was executed.
>
> Allowing users to implement LSM hooks at runtime also benefits the LSM
> eco-system by enabling a quick feedback loop from the security community
> about the kind of behaviours that the LSM Framework should be targeting.
>
> # How does it work?
>
> The LSM introduces a new eBPF (https://docs.cilium.io/en/v1.6/bpf/)
> program type, BPF_PROG_TYPE_LSM, which can only be attached to a LSM
> hook.  All LSM hooks are exposed as files in securityfs. Attachment
> requires CAP_SYS_ADMIN for loading eBPF programs and CAP_MAC_ADMIN for
> modifying MAC policies.
>
> The eBPF programs are passed the same arguments as the LSM hooks and
> executed in the body of the hook.

This effectively exposes the LSM hooks as external APIs.
It would mean that we can't change or delete them. That
would be bad.


>  If any of the eBPF programs returns an
> error (like ENOPERM), the behaviour represented by the hook is denied.
>
> Audit logs can be written using a format chosen by the eBPF program to
> the perf events buffer and can be further processed in user-space.
>
> # Limitations of RFC v1
>
> In the previous design
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/),
> the BPF programs received a context which could be queried to retrieve
> specific pieces of information using specific helpers.
>
> For example, a program that attaches to the file_mprotect LSM hook and
> queries the VMA region could have had the following context:
>
> // Special context for the hook.
> struct bpf_mprotect_ctx {
> 	struct vm_area_struct *vma;
> };
>
> and accessed the fields using a hypothetical helper
> "bpf_mprotect_vma_get_start:
>
> SEC("lsm/file_mprotect")
> int mprotect_audit(bpf_mprotect_ctx *ctx)
> {
> 	unsigned long vm_start = bpf_mprotect_vma_get_start(ctx);
> 	return 0;
> }
>
> or directly read them from the context by updating the verifier to allow
> accessing the fields:
>
> int mprotect_audit(bpf_mprotect_ctx *ctx)
> {
> 	unsigned long vm_start = ctx->vma->vm_start;
> 	return 0;
> }
>
> As we prototyped policies based on this design, we realized that this
> approach is not general enough. Adding helpers or verifier code for all
> usages would imply a high maintenance cost while severely restricting
> the instrumentation capabilities which is the key value add of our
> eBPF-based LSM.
>
> Feedback from the BPF maintainers at Linux Plumbers also pushed us
> towards the following, more general, approach.
>
> # BTF Based Design
>
> The current design uses BTF
> (https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html,
> https://lwn.net/Articles/803258/) which allows verifiable read-only
> structure accesses by field names rather than fixed offsets. This allows
> accessing the hook parameters using a dynamically created context which
> provides a certain degree of ABI stability:
>
> /* Clang builtin to handle field accesses. */
> #define _(P) (__builtin_preserve_access_index(P))
>
> // Only declare the structure and fields intended to be used
> // in the program
> struct vm_area_struct {
> 	unsigned long vm_start;
> };
>
> // Declare the eBPF program mprotect_audit which attaches to
> // to the file_mprotect LSM hook and accepts three arguments.
> BPF_TRACE_3("lsm/file_mprotect", mprotect_audit,
> 	    struct vm_area_struct *, vma,
> 	    unsigned long, reqprot, unsigned long, prot
> {
> 	unsigned long vm_start = _(vma->vm_start);
> 	return 0;
> }
>
> By relocating field offsets, BTF makes a large portion of kernel data
> structures readily accessible across kernel versions without requiring a
> large corpus of BPF helper functions and requiring recompilation with
> every kernel version. The limitations of BTF compatibility are described
> in BPF Co-Re (http://vger.kernel.org/bpfconf2019_talks/bpf-core.pdf,
> i.e. field renames, #defines and changes to the signature of LSM hooks).
>
> This design imposes that the MAC policy (eBPF programs) be updated when
> the inspected kernel structures change outside of BTF compatibility
> guarantees. In practice, this is only required when a structure field
> used by a current policy is removed (or renamed) or when the used LSM
> hooks change. We expect the maintenance cost of these changes to be
> acceptable as compared to the previous design
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/).
>
> # Distinction from Landlock
>
> We believe there exist two distinct use-cases with distinct set of users:
>
> * Unprivileged processes voluntarily relinquishing privileges with the
>   primary users being software developers.
>
> * Flexible privileged (CAP_MAC_ADMIN, CAP_SYS_ADMIN) MAC and Audit with
>   the primary users being system policy admins.
>
> These use-cases imply different APIs and trade-offs:
>
> * The unprivileged use case requires defining more stable and custom APIs
>   (through opaque contexts and precise helpers).
>
> * Privileged Audit and MAC requires deeper introspection of the kernel
>   data structures to maximise the flexibility that can be achieved without
>   kernel modification.
>
> Landlock has demonstrated filesystem sandboxes and now Ptrace access
> control in its patches which are excellent use cases for an unprivileged
> process voluntarily relinquishing privileges.
>
> However, Landlock has expanded its original goal, "towards unprivileged
> sandboxing", to being a "low-level framework to build
> access-control/audit systems" (https://landlock.io). We feel that the
> design and implementation are still driven by the constraints and
> trade-offs of the former use-case, and do not provide a satisfactory
> solution to the latter.
>
> We also believe that our approach, direct access to common kernel data
> structures as with BTF, is inappropriate for unprivileged processes and
> probably not a good option for Landlock.
>
> In conclusion, we feel that the design for a privileged LSM and
> unprivileged LSM are mutually exclusive and that one cannot be built
> "on-top-of" the other. Doing so would limit the capabilities of what can
> be done for an LSM that provides flexible audit and MAC capabilities or
> provide in-appropriate access to kernel internals to an unprivileged
> process.
>
> Furthermore, the Landlock design supports its historical use-case only
> when unprivileged eBPF is allowed. This is something that warrants
> discussion before an unprivileged LSM that uses eBPF is upstreamed.
>
> # Why not tracepoints or kprobes?
>
> In order to do MAC with tracepoints or kprobes, we would need to
> override the return value of the security hook. This is not possible
> with tracepoints or call-site kprobes.
>
> Attaching to the return boundary (kretprobe) implies that BPF programs
> would always get called after all the other LSM hooks are called and
> clobber the pre-existing LSM semantics.
>
> Enforcing MAC policy with an actual LSM helps leverage the verified
> semantics of the framework.
>
> # Usage Examples
>
> A simple example and some documentation is included in the patchset.
>
> In order to better illustrate the capabilities of the framework some
> more advanced prototype code has also been published separately:
>
> * Logging execution events (including environment variables and arguments):
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> * Detecting deletion of running executables:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_detect_exec_unlink.c
> * Detection of writes to /proc/<pid>/mem:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
>
> We have updated Google's internal telemetry infrastructure and have
> started deploying this LSM on our Linux Workstations. This gives us more
> confidence in the real-world applications of such a system.
>
> KP Singh (13):
>   bpf: Refactor BPF_EVENT context macros to its own header.
>   bpf: lsm: Add a skeleton and config options
>   bpf: lsm: Introduce types for eBPF based LSM
>   bpf: lsm: Allow btf_id based attachment for LSM hooks
>   tools/libbpf: Add support in libbpf for BPF_PROG_TYPE_LSM
>   bpf: lsm: Init Hooks and create files in securityfs
>   bpf: lsm: Implement attach, detach and execution.
>   bpf: lsm: Show attached program names in hook read handler.
>   bpf: lsm: Add a helper function bpf_lsm_event_output
>   bpf: lsm: Handle attachment of the same program
>   tools/libbpf: Add bpf_program__attach_lsm
>   bpf: lsm: Add selftests for BPF_PROG_TYPE_LSM
>   bpf: lsm: Add Documentation
>
>  Documentation/security/bpf.rst                |  164 +++
>  Documentation/security/index.rst              |    1 +
>  MAINTAINERS                                   |   11 +
>  include/linux/bpf_event.h                     |   78 ++
>  include/linux/bpf_lsm.h                       |   25 +
>  include/linux/bpf_types.h                     |    4 +
>  include/trace/bpf_probe.h                     |   30 +-
>  include/uapi/linux/bpf.h                      |   12 +-
>  kernel/bpf/syscall.c                          |   10 +
>  kernel/bpf/verifier.c                         |   84 +-
>  kernel/trace/bpf_trace.c                      |   24 +-
>  security/Kconfig                              |   11 +-
>  security/Makefile                             |    2 +
>  security/bpf/Kconfig                          |   25 +
>  security/bpf/Makefile                         |    7 +
>  security/bpf/include/bpf_lsm.h                |   63 +
>  security/bpf/include/fs.h                     |   23 +
>  security/bpf/include/hooks.h                  | 1015 +++++++++++++++++
>  security/bpf/lsm.c                            |  160 +++
>  security/bpf/lsm_fs.c                         |  176 +++
>  security/bpf/ops.c                            |  224 ++++
>  tools/include/uapi/linux/bpf.h                |   12 +-
>  tools/lib/bpf/bpf.c                           |    2 +-
>  tools/lib/bpf/bpf.h                           |    6 +
>  tools/lib/bpf/libbpf.c                        |  163 ++-
>  tools/lib/bpf/libbpf.h                        |    4 +
>  tools/lib/bpf/libbpf.map                      |    7 +
>  tools/lib/bpf/libbpf_probes.c                 |    1 +
>  .../bpf/prog_tests/lsm_mprotect_audit.c       |  129 +++
>  .../selftests/bpf/progs/lsm_mprotect_audit.c  |   58 +
>  30 files changed, 2451 insertions(+), 80 deletions(-)
>  create mode 100644 Documentation/security/bpf.rst
>  create mode 100644 include/linux/bpf_event.h
>  create mode 100644 include/linux/bpf_lsm.h
>  create mode 100644 security/bpf/Kconfig
>  create mode 100644 security/bpf/Makefile
>  create mode 100644 security/bpf/include/bpf_lsm.h
>  create mode 100644 security/bpf/include/fs.h
>  create mode 100644 security/bpf/include/hooks.h
>  create mode 100644 security/bpf/lsm.c
>  create mode 100644 security/bpf/lsm_fs.c
>  create mode 100644 security/bpf/ops.c
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/lsm_mprotect_audit.c
>  create mode 100644 tools/testing/selftests/bpf/progs/lsm_mprotect_audit.c
>


^ permalink raw reply

* Re: [PATCH bpf-next v1 00/13] MAC and Audit policy using eBPF (KRSI)
From: KP Singh @ 2019-12-20 17:38 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: open list, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, James Morris, Kees Cook, Thomas Garnier,
	Michael Halcrow, Paul Turner, Brendan Gregg, Jann Horn,
	Matthew Garrett, Christian Brauner, Mickaël Salaün,
	Florent Revest, Brendan Jackman, Martin KaFai Lau, Song Liu,
	Yonghong Song, Serge E. Hallyn, Mauro Carvalho Chehab,
	David S. Miller, Greg Kroah-Hartman, Nicolas Ferre,
	Stanislav Fomichev, Quentin Monnet, Andrey Ignatov, Joe Stringer
In-Reply-To: <95036040-6b1c-116c-bd6b-684f00174b4f@schaufler-ca.com>

Hi Casey,

Thanks for taking a look!

On Fri, Dec 20, 2019 at 6:17 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>
> On 12/20/2019 7:41 AM, KP Singh wrote:
> > From: KP Singh <kpsingh@google.com>
> >
> > This patch series is a continuation of the KRSI RFC
> > (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/)
> >
> > # Motivation
> >
> > Google does rich analysis of runtime security data collected from
> > internal Linux deployments (corporate devices and servers) to detect and
> > thwart threats in real-time. Currently, this is done in custom kernel
> > modules but we would like to replace this with something that's upstream
> > and useful to others.
> >
> > The current kernel infrastructure for providing telemetry (Audit, Perf
> > etc.) is disjoint from access enforcement (i.e. LSMs).  Augmenting the
> > information provided by audit requires kernel changes to audit, its
> > policy language and user-space components. Furthermore, building a MAC
> > policy based on the newly added telemetry data requires changes to
> > various LSMs and their respective policy languages.
> >
> > This patchset proposes a new stackable and privileged LSM which allows
> > the LSM hooks to be implemented using eBPF. This facilitates a unified
> > and dynamic (not requiring re-compilation of the kernel) audit and MAC
> > policy.
> >
> > # Why an LSM?
> >
> > Linux Security Modules target security behaviours rather than the
> > kernel's API. For example, it's easy to miss out a newly added system
> > call for executing processes (eg. execve, execveat etc.) but the LSM
> > framework ensures that all process executions trigger the relevant hooks
> > irrespective of how the process was executed.
> >
> > Allowing users to implement LSM hooks at runtime also benefits the LSM
> > eco-system by enabling a quick feedback loop from the security community
> > about the kind of behaviours that the LSM Framework should be targeting.
> >
> > # How does it work?
> >
> > The LSM introduces a new eBPF (https://docs.cilium.io/en/v1.6/bpf/)
> > program type, BPF_PROG_TYPE_LSM, which can only be attached to a LSM
> > hook.  All LSM hooks are exposed as files in securityfs. Attachment
> > requires CAP_SYS_ADMIN for loading eBPF programs and CAP_MAC_ADMIN for
> > modifying MAC policies.
> >
> > The eBPF programs are passed the same arguments as the LSM hooks and
> > executed in the body of the hook.
>
> This effectively exposes the LSM hooks as external APIs.
> It would mean that we can't change or delete them. That
> would be bad.

Perhaps this should have been clearer, we *do not* want to make LSM hooks
a stable API and expect the eBPF programs to adapt when such changes occur.

Based on our comparison with the previous approach, this still ends up
being a better trade-off (w.r.t. maintenance) when compared to adding
specific helpers or verifier logic for  each new hook or field that
needs to be exposed.

- KP

>
>
> >  If any of the eBPF programs returns an
> > error (like ENOPERM), the behaviour represented by the hook is denied.
> >
> > Audit logs can be written using a format chosen by the eBPF program to
> > the perf events buffer and can be further processed in user-space.
> >
> > # Limitations of RFC v1
> >
> > In the previous design
> > (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/),
> > the BPF programs received a context which could be queried to retrieve
> > specific pieces of information using specific helpers.
> >
> > For example, a program that attaches to the file_mprotect LSM hook and
> > queries the VMA region could have had the following context:
> >
> > // Special context for the hook.
> > struct bpf_mprotect_ctx {
> >       struct vm_area_struct *vma;
> > };
> >
> > and accessed the fields using a hypothetical helper
> > "bpf_mprotect_vma_get_start:
> >
> > SEC("lsm/file_mprotect")
> > int mprotect_audit(bpf_mprotect_ctx *ctx)
> > {
> >       unsigned long vm_start = bpf_mprotect_vma_get_start(ctx);
> >       return 0;
> > }
> >
> > or directly read them from the context by updating the verifier to allow
> > accessing the fields:
> >
> > int mprotect_audit(bpf_mprotect_ctx *ctx)
> > {
> >       unsigned long vm_start = ctx->vma->vm_start;
> >       return 0;
> > }
> >
> > As we prototyped policies based on this design, we realized that this
> > approach is not general enough. Adding helpers or verifier code for all
> > usages would imply a high maintenance cost while severely restricting
> > the instrumentation capabilities which is the key value add of our
> > eBPF-based LSM.
> >
> > Feedback from the BPF maintainers at Linux Plumbers also pushed us
> > towards the following, more general, approach.
> >
> > # BTF Based Design
> >
> > The current design uses BTF
> > (https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html,
> > https://lwn.net/Articles/803258/) which allows verifiable read-only
> > structure accesses by field names rather than fixed offsets. This allows
> > accessing the hook parameters using a dynamically created context which
> > provides a certain degree of ABI stability:
> >
> > /* Clang builtin to handle field accesses. */
> > #define _(P) (__builtin_preserve_access_index(P))
> >
> > // Only declare the structure and fields intended to be used
> > // in the program
> > struct vm_area_struct {
> >       unsigned long vm_start;
> > };
> >
> > // Declare the eBPF program mprotect_audit which attaches to
> > // to the file_mprotect LSM hook and accepts three arguments.
> > BPF_TRACE_3("lsm/file_mprotect", mprotect_audit,
> >           struct vm_area_struct *, vma,
> >           unsigned long, reqprot, unsigned long, prot
> > {
> >       unsigned long vm_start = _(vma->vm_start);
> >       return 0;
> > }
> >
> > By relocating field offsets, BTF makes a large portion of kernel data
> > structures readily accessible across kernel versions without requiring a
> > large corpus of BPF helper functions and requiring recompilation with
> > every kernel version. The limitations of BTF compatibility are described
> > in BPF Co-Re (http://vger.kernel.org/bpfconf2019_talks/bpf-core.pdf,
> > i.e. field renames, #defines and changes to the signature of LSM hooks).
> >
> > This design imposes that the MAC policy (eBPF programs) be updated when
> > the inspected kernel structures change outside of BTF compatibility
> > guarantees. In practice, this is only required when a structure field
> > used by a current policy is removed (or renamed) or when the used LSM
> > hooks change. We expect the maintenance cost of these changes to be
> > acceptable as compared to the previous design
> > (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/).
> >
> > # Distinction from Landlock
> >
> > We believe there exist two distinct use-cases with distinct set of users:
> >
> > * Unprivileged processes voluntarily relinquishing privileges with the
> >   primary users being software developers.
> >
> > * Flexible privileged (CAP_MAC_ADMIN, CAP_SYS_ADMIN) MAC and Audit with
> >   the primary users being system policy admins.
> >
> > These use-cases imply different APIs and trade-offs:
> >
> > * The unprivileged use case requires defining more stable and custom APIs
> >   (through opaque contexts and precise helpers).
> >
> > * Privileged Audit and MAC requires deeper introspection of the kernel
> >   data structures to maximise the flexibility that can be achieved without
> >   kernel modification.
> >
> > Landlock has demonstrated filesystem sandboxes and now Ptrace access
> > control in its patches which are excellent use cases for an unprivileged
> > process voluntarily relinquishing privileges.
> >
> > However, Landlock has expanded its original goal, "towards unprivileged
> > sandboxing", to being a "low-level framework to build
> > access-control/audit systems" (https://landlock.io). We feel that the
> > design and implementation are still driven by the constraints and
> > trade-offs of the former use-case, and do not provide a satisfactory
> > solution to the latter.
> >
> > We also believe that our approach, direct access to common kernel data
> > structures as with BTF, is inappropriate for unprivileged processes and
> > probably not a good option for Landlock.
> >
> > In conclusion, we feel that the design for a privileged LSM and
> > unprivileged LSM are mutually exclusive and that one cannot be built
> > "on-top-of" the other. Doing so would limit the capabilities of what can
> > be done for an LSM that provides flexible audit and MAC capabilities or
> > provide in-appropriate access to kernel internals to an unprivileged
> > process.
> >
> > Furthermore, the Landlock design supports its historical use-case only
> > when unprivileged eBPF is allowed. This is something that warrants
> > discussion before an unprivileged LSM that uses eBPF is upstreamed.
> >
> > # Why not tracepoints or kprobes?
> >
> > In order to do MAC with tracepoints or kprobes, we would need to
> > override the return value of the security hook. This is not possible
> > with tracepoints or call-site kprobes.
> >
> > Attaching to the return boundary (kretprobe) implies that BPF programs
> > would always get called after all the other LSM hooks are called and
> > clobber the pre-existing LSM semantics.
> >
> > Enforcing MAC policy with an actual LSM helps leverage the verified
> > semantics of the framework.
> >
> > # Usage Examples
> >
> > A simple example and some documentation is included in the patchset.
> >
> > In order to better illustrate the capabilities of the framework some
> > more advanced prototype code has also been published separately:
> >
> > * Logging execution events (including environment variables and arguments):
> > https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> > * Detecting deletion of running executables:
> > https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_detect_exec_unlink.c
> > * Detection of writes to /proc/<pid>/mem:
> > https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> >
> > We have updated Google's internal telemetry infrastructure and have
> > started deploying this LSM on our Linux Workstations. This gives us more
> > confidence in the real-world applications of such a system.
> >
> > KP Singh (13):
> >   bpf: Refactor BPF_EVENT context macros to its own header.
> >   bpf: lsm: Add a skeleton and config options
> >   bpf: lsm: Introduce types for eBPF based LSM
> >   bpf: lsm: Allow btf_id based attachment for LSM hooks
> >   tools/libbpf: Add support in libbpf for BPF_PROG_TYPE_LSM
> >   bpf: lsm: Init Hooks and create files in securityfs
> >   bpf: lsm: Implement attach, detach and execution.
> >   bpf: lsm: Show attached program names in hook read handler.
> >   bpf: lsm: Add a helper function bpf_lsm_event_output
> >   bpf: lsm: Handle attachment of the same program
> >   tools/libbpf: Add bpf_program__attach_lsm
> >   bpf: lsm: Add selftests for BPF_PROG_TYPE_LSM
> >   bpf: lsm: Add Documentation
> >
> >  Documentation/security/bpf.rst                |  164 +++
> >  Documentation/security/index.rst              |    1 +
> >  MAINTAINERS                                   |   11 +
> >  include/linux/bpf_event.h                     |   78 ++
> >  include/linux/bpf_lsm.h                       |   25 +
> >  include/linux/bpf_types.h                     |    4 +
> >  include/trace/bpf_probe.h                     |   30 +-
> >  include/uapi/linux/bpf.h                      |   12 +-
> >  kernel/bpf/syscall.c                          |   10 +
> >  kernel/bpf/verifier.c                         |   84 +-
> >  kernel/trace/bpf_trace.c                      |   24 +-
> >  security/Kconfig                              |   11 +-
> >  security/Makefile                             |    2 +
> >  security/bpf/Kconfig                          |   25 +
> >  security/bpf/Makefile                         |    7 +
> >  security/bpf/include/bpf_lsm.h                |   63 +
> >  security/bpf/include/fs.h                     |   23 +
> >  security/bpf/include/hooks.h                  | 1015 +++++++++++++++++
> >  security/bpf/lsm.c                            |  160 +++
> >  security/bpf/lsm_fs.c                         |  176 +++
> >  security/bpf/ops.c                            |  224 ++++
> >  tools/include/uapi/linux/bpf.h                |   12 +-
> >  tools/lib/bpf/bpf.c                           |    2 +-
> >  tools/lib/bpf/bpf.h                           |    6 +
> >  tools/lib/bpf/libbpf.c                        |  163 ++-
> >  tools/lib/bpf/libbpf.h                        |    4 +
> >  tools/lib/bpf/libbpf.map                      |    7 +
> >  tools/lib/bpf/libbpf_probes.c                 |    1 +
> >  .../bpf/prog_tests/lsm_mprotect_audit.c       |  129 +++
> >  .../selftests/bpf/progs/lsm_mprotect_audit.c  |   58 +
> >  30 files changed, 2451 insertions(+), 80 deletions(-)
> >  create mode 100644 Documentation/security/bpf.rst
> >  create mode 100644 include/linux/bpf_event.h
> >  create mode 100644 include/linux/bpf_lsm.h
> >  create mode 100644 security/bpf/Kconfig
> >  create mode 100644 security/bpf/Makefile
> >  create mode 100644 security/bpf/include/bpf_lsm.h
> >  create mode 100644 security/bpf/include/fs.h
> >  create mode 100644 security/bpf/include/hooks.h
> >  create mode 100644 security/bpf/lsm.c
> >  create mode 100644 security/bpf/lsm_fs.c
> >  create mode 100644 security/bpf/ops.c
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/lsm_mprotect_audit.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/lsm_mprotect_audit.c
> >
>

^ permalink raw reply

* Re: [PATCH bpf-next v1 01/13] bpf: Refactor BPF_EVENT context macros to its own header.
From: Andrii Nakryiko @ 2019-12-20 20:10 UTC (permalink / raw)
  To: KP Singh
  Cc: open list, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, James Morris, Kees Cook, Thomas Garnier,
	Michael Halcrow, Paul Turner, Brendan Gregg, Jann Horn,
	Matthew Garrett, Christian Brauner, Mickaël Salaün,
	Florent Revest, Brendan Jackman, Martin KaFai Lau, Song Liu,
	Yonghong Song, Serge E. Hallyn, Mauro Carvalho Chehab,
	David S. Miller, Greg Kroah-Hartman, Nicolas Ferre,
	Stanislav Fomichev, Quentin Monnet, Andrey Ignatov, Joe Stringer
In-Reply-To: <20191220154208.15895-2-kpsingh@chromium.org>

On Fri, Dec 20, 2019 at 7:43 AM KP Singh <kpsingh@chromium.org> wrote:
>
> From: KP Singh <kpsingh@google.com>
>
> These macros are useful for other program types than tracing.
> i.e. KRSI (an upccoming BPF based LSM) which does not use
> BPF_PROG_TYPE_TRACE but uses verifiable BTF accesses similar
> to raw tracepoints.
>
> Signed-off-by: KP Singh <kpsingh@google.com>
> ---
>  include/linux/bpf_event.h | 78 +++++++++++++++++++++++++++++++++++++++
>  include/trace/bpf_probe.h | 30 +--------------
>  kernel/trace/bpf_trace.c  | 24 +-----------
>  3 files changed, 81 insertions(+), 51 deletions(-)
>  create mode 100644 include/linux/bpf_event.h
>
> diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
> new file mode 100644
> index 000000000000..353eb1f5a3d0
> --- /dev/null
> +++ b/include/linux/bpf_event.h
> @@ -0,0 +1,78 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +
> +/*
> + * Copyright (c) 2018 Facebook
> + * Copyright 2019 Google LLC.
> + */
> +
> +#ifndef _LINUX_BPF_EVENT_H
> +#define _LINUX_BPF_EVENT_H
> +
> +#ifdef CONFIG_BPF_EVENTS
> +
> +/* cast any integer, pointer, or small struct to u64 */
> +#define UINTTYPE(size) \
> +       __typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
> +                  __builtin_choose_expr(size == 2, (u16)2, \
> +                  __builtin_choose_expr(size == 4, (u32)3, \
> +                  __builtin_choose_expr(size == 8, (u64)4, \
> +                                        (void)5)))))
> +#define __CAST_TO_U64(x) ({ \
> +       typeof(x) __src = (x); \
> +       UINTTYPE(sizeof(x)) __dst; \
> +       memcpy(&__dst, &__src, sizeof(__dst)); \
> +       (u64)__dst; })
> +
> +#define __CAST0(...) 0
> +#define __CAST1(a, ...) __CAST_TO_U64(a)
> +#define __CAST2(a, ...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
> +#define __CAST3(a, ...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
> +#define __CAST4(a, ...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
> +#define __CAST5(a, ...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
> +#define __CAST6(a, ...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
> +#define __CAST7(a, ...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
> +#define __CAST8(a, ...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
> +#define __CAST9(a, ...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
> +#define __CAST10(a ,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
> +#define __CAST11(a, ...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
> +#define __CAST12(a, ...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
> +/* tracepoints with more than 12 arguments will hit build error */
> +#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
> +
> +#define UINTTYPE(size) \
> +       __typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
> +                  __builtin_choose_expr(size == 2, (u16)2, \
> +                  __builtin_choose_expr(size == 4, (u32)3, \
> +                  __builtin_choose_expr(size == 8, (u64)4, \
> +                                        (void)5)))))

Is it the same macro as above?

> +

[...]

^ permalink raw reply

* Re: [PATCH bpf-next v1 01/13] bpf: Refactor BPF_EVENT context macros to its own header.
From: KP Singh @ 2019-12-20 20:26 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: open list, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, James Morris, Kees Cook, Thomas Garnier,
	Michael Halcrow, Paul Turner, Brendan Gregg, Jann Horn,
	Matthew Garrett, Christian Brauner, Mickaël Salaün,
	Florent Revest, Brendan Jackman, Martin KaFai Lau, Song Liu,
	Yonghong Song, Serge E. Hallyn, Mauro Carvalho Chehab,
	David S. Miller, Greg Kroah-Hartman, Nicolas Ferre,
	Stanislav Fomichev, Quentin Monnet, Andrey Ignatov, Joe Stringer
In-Reply-To: <CAEf4BzZJf_GBCXYYmE0Hk7O2GdeOr-3VCxdAaxCUk-MHRar3Og@mail.gmail.com>

On 20-Dez 12:10, Andrii Nakryiko wrote:
> On Fri, Dec 20, 2019 at 7:43 AM KP Singh <kpsingh@chromium.org> wrote:
> >
> > From: KP Singh <kpsingh@google.com>
> >
> > These macros are useful for other program types than tracing.
> > i.e. KRSI (an upccoming BPF based LSM) which does not use
> > BPF_PROG_TYPE_TRACE but uses verifiable BTF accesses similar
> > to raw tracepoints.
> >
> > Signed-off-by: KP Singh <kpsingh@google.com>
> > ---
> >  include/linux/bpf_event.h | 78 +++++++++++++++++++++++++++++++++++++++
> >  include/trace/bpf_probe.h | 30 +--------------
> >  kernel/trace/bpf_trace.c  | 24 +-----------
> >  3 files changed, 81 insertions(+), 51 deletions(-)
> >  create mode 100644 include/linux/bpf_event.h
> >
> > diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
> > new file mode 100644
> > index 000000000000..353eb1f5a3d0
> > --- /dev/null
> > +++ b/include/linux/bpf_event.h
> > @@ -0,0 +1,78 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +
> > +/*
> > + * Copyright (c) 2018 Facebook
> > + * Copyright 2019 Google LLC.
> > + */
> > +
> > +#ifndef _LINUX_BPF_EVENT_H
> > +#define _LINUX_BPF_EVENT_H
> > +
> > +#ifdef CONFIG_BPF_EVENTS
> > +
> > +/* cast any integer, pointer, or small struct to u64 */
> > +#define UINTTYPE(size) \
> > +       __typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
> > +                  __builtin_choose_expr(size == 2, (u16)2, \
> > +                  __builtin_choose_expr(size == 4, (u32)3, \
> > +                  __builtin_choose_expr(size == 8, (u64)4, \
> > +                                        (void)5)))))
> > +#define __CAST_TO_U64(x) ({ \
> > +       typeof(x) __src = (x); \
> > +       UINTTYPE(sizeof(x)) __dst; \
> > +       memcpy(&__dst, &__src, sizeof(__dst)); \
> > +       (u64)__dst; })
> > +
> > +#define __CAST0(...) 0
> > +#define __CAST1(a, ...) __CAST_TO_U64(a)
> > +#define __CAST2(a, ...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
> > +#define __CAST3(a, ...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
> > +#define __CAST4(a, ...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
> > +#define __CAST5(a, ...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
> > +#define __CAST6(a, ...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
> > +#define __CAST7(a, ...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
> > +#define __CAST8(a, ...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
> > +#define __CAST9(a, ...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
> > +#define __CAST10(a ,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
> > +#define __CAST11(a, ...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
> > +#define __CAST12(a, ...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
> > +/* tracepoints with more than 12 arguments will hit build error */
> > +#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
> > +
> > +#define UINTTYPE(size) \
> > +       __typeof__(__builtin_choose_expr(size == 1,  (u8)1, \
> > +                  __builtin_choose_expr(size == 2, (u16)2, \
> > +                  __builtin_choose_expr(size == 4, (u32)3, \
> > +                  __builtin_choose_expr(size == 8, (u64)4, \
> > +                                        (void)5)))))
> 
> Is it the same macro as above?

Yes, sorry did not notice this. Will fix it in the next revision.

- KP

> 
> > +
> 
> [...]

^ permalink raw reply

* Re: [PATCH bpf-next v1 00/13] MAC and Audit policy using eBPF (KRSI)
From: Mickaël Salaün @ 2019-12-20 22:46 UTC (permalink / raw)
  To: KP Singh, linux-kernel, bpf, linux-security-module
  Cc: Alexei Starovoitov, Daniel Borkmann, James Morris, Kees Cook,
	Thomas Garnier, Michael Halcrow, Paul Turner, Brendan Gregg,
	Jann Horn, Matthew Garrett, Christian Brauner, Florent Revest,
	Brendan Jackman, Martin KaFai Lau, Song Liu, Yonghong Song,
	Serge E. Hallyn, Mauro Carvalho Chehab, David S. Miller,
	Greg Kroah-Hartman, Nicolas Ferre, Stanislav Fomichev,
	Quentin Monnet, Andrey Ignatov, Joe Stringer,
	Mickaël Salaün
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>


On 20/12/2019 16:41, KP Singh wrote:
> From: KP Singh <kpsingh@google.com>
> 
> This patch series is a continuation of the KRSI RFC
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/)
> 
> # Motivation
> 
> Google does rich analysis of runtime security data collected from
> internal Linux deployments (corporate devices and servers) to detect and
> thwart threats in real-time. Currently, this is done in custom kernel
> modules but we would like to replace this with something that's upstream
> and useful to others.
> 
> The current kernel infrastructure for providing telemetry (Audit, Perf
> etc.) is disjoint from access enforcement (i.e. LSMs).  Augmenting the
> information provided by audit requires kernel changes to audit, its
> policy language and user-space components. Furthermore, building a MAC
> policy based on the newly added telemetry data requires changes to
> various LSMs and their respective policy languages.
> 
> This patchset proposes a new stackable and privileged LSM which allows
> the LSM hooks to be implemented using eBPF. This facilitates a unified
> and dynamic (not requiring re-compilation of the kernel) audit and MAC
> policy.
> 
> # Why an LSM?
> 
> Linux Security Modules target security behaviours rather than the
> kernel's API. For example, it's easy to miss out a newly added system
> call for executing processes (eg. execve, execveat etc.) but the LSM
> framework ensures that all process executions trigger the relevant hooks
> irrespective of how the process was executed.
> 
> Allowing users to implement LSM hooks at runtime also benefits the LSM
> eco-system by enabling a quick feedback loop from the security community
> about the kind of behaviours that the LSM Framework should be targeting.
> 
> # How does it work?
> 
> The LSM introduces a new eBPF (https://docs.cilium.io/en/v1.6/bpf/)
> program type, BPF_PROG_TYPE_LSM, which can only be attached to a LSM
> hook.  All LSM hooks are exposed as files in securityfs. Attachment
> requires CAP_SYS_ADMIN for loading eBPF programs and CAP_MAC_ADMIN for
> modifying MAC policies.
> 
> The eBPF programs are passed the same arguments as the LSM hooks and
> executed in the body of the hook. If any of the eBPF programs returns an
> error (like ENOPERM), the behaviour represented by the hook is denied.
> 
> Audit logs can be written using a format chosen by the eBPF program to
> the perf events buffer and can be further processed in user-space.
> 
> # Limitations of RFC v1
> 
> In the previous design
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/),
> the BPF programs received a context which could be queried to retrieve
> specific pieces of information using specific helpers.
> 
> For example, a program that attaches to the file_mprotect LSM hook and
> queries the VMA region could have had the following context:
> 
> // Special context for the hook.
> struct bpf_mprotect_ctx {
> 	struct vm_area_struct *vma;
> };
> 
> and accessed the fields using a hypothetical helper
> "bpf_mprotect_vma_get_start:
> 
> SEC("lsm/file_mprotect")
> int mprotect_audit(bpf_mprotect_ctx *ctx)
> {
> 	unsigned long vm_start = bpf_mprotect_vma_get_start(ctx);
> 	return 0;
> }
> 
> or directly read them from the context by updating the verifier to allow
> accessing the fields:
> 
> int mprotect_audit(bpf_mprotect_ctx *ctx)
> {
> 	unsigned long vm_start = ctx->vma->vm_start;
> 	return 0;
> }
> 
> As we prototyped policies based on this design, we realized that this
> approach is not general enough. Adding helpers or verifier code for all
> usages would imply a high maintenance cost while severely restricting
> the instrumentation capabilities which is the key value add of our
> eBPF-based LSM.
> 
> Feedback from the BPF maintainers at Linux Plumbers also pushed us
> towards the following, more general, approach.
> 
> # BTF Based Design
> 
> The current design uses BTF
> (https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html,
> https://lwn.net/Articles/803258/) which allows verifiable read-only
> structure accesses by field names rather than fixed offsets. This allows
> accessing the hook parameters using a dynamically created context which
> provides a certain degree of ABI stability:
> 
> /* Clang builtin to handle field accesses. */
> #define _(P) (__builtin_preserve_access_index(P))
> 
> // Only declare the structure and fields intended to be used
> // in the program
> struct vm_area_struct {
> 	unsigned long vm_start;
> };
> 
> // Declare the eBPF program mprotect_audit which attaches to
> // to the file_mprotect LSM hook and accepts three arguments.
> BPF_TRACE_3("lsm/file_mprotect", mprotect_audit,
> 	    struct vm_area_struct *, vma,
> 	    unsigned long, reqprot, unsigned long, prot
> {
> 	unsigned long vm_start = _(vma->vm_start);
> 	return 0;
> }
> 
> By relocating field offsets, BTF makes a large portion of kernel data
> structures readily accessible across kernel versions without requiring a
> large corpus of BPF helper functions and requiring recompilation with
> every kernel version. The limitations of BTF compatibility are described
> in BPF Co-Re (http://vger.kernel.org/bpfconf2019_talks/bpf-core.pdf,
> i.e. field renames, #defines and changes to the signature of LSM hooks).
> 
> This design imposes that the MAC policy (eBPF programs) be updated when
> the inspected kernel structures change outside of BTF compatibility
> guarantees. In practice, this is only required when a structure field
> used by a current policy is removed (or renamed) or when the used LSM
> hooks change. We expect the maintenance cost of these changes to be
> acceptable as compared to the previous design
> (https://lore.kernel.org/bpf/20190910115527.5235-1-kpsingh@chromium.org/).
> 
> # Distinction from Landlock
> 
> We believe there exist two distinct use-cases with distinct set of users:
> 
> * Unprivileged processes voluntarily relinquishing privileges with the
>    primary users being software developers.
> 
> * Flexible privileged (CAP_MAC_ADMIN, CAP_SYS_ADMIN) MAC and Audit with
>    the primary users being system policy admins.
> 
> These use-cases imply different APIs and trade-offs:
> 
> * The unprivileged use case requires defining more stable and custom APIs
>    (through opaque contexts and precise helpers).
> 
> * Privileged Audit and MAC requires deeper introspection of the kernel
>    data structures to maximise the flexibility that can be achieved without
>    kernel modification.
> 
> Landlock has demonstrated filesystem sandboxes and now Ptrace access
> control in its patches which are excellent use cases for an unprivileged
> process voluntarily relinquishing privileges.
> 
> However, Landlock has expanded its original goal, "towards unprivileged
> sandboxing", to being a "low-level framework to build
> access-control/audit systems" (https://landlock.io). We feel that the
> design and implementation are still driven by the constraints and
> trade-offs of the former use-case, and do not provide a satisfactory
> solution to the latter.
> 
> We also believe that our approach, direct access to common kernel data
> structures as with BTF, is inappropriate for unprivileged processes and
> probably not a good option for Landlock.
> 
> In conclusion, we feel that the design for a privileged LSM and
> unprivileged LSM are mutually exclusive and that one cannot be built
> "on-top-of" the other. Doing so would limit the capabilities of what can
> be done for an LSM that provides flexible audit and MAC capabilities or
> provide in-appropriate access to kernel internals to an unprivileged
> process.

I don't think that the design for a privileged LSM and an unprivileged 
LSM are necessarily mutually exclusive, however I do agree that the 
design of an *introspection* LSM like this version of KRSI, and an 
unprivileged LSM are mutually exclusive.

> 
> Furthermore, the Landlock design supports its historical use-case only
> when unprivileged eBPF is allowed. This is something that warrants
> discussion before an unprivileged LSM that uses eBPF is upstreamed.

Because of seccomp-bpf, on which the first version of Landlock was based 
on, I then used eBPF as a way to define a security policy which could be 
updated on the fly (thanks to eBPF maps) and evolves over time. The main 
goal of Landlock was and still is to bring sandboxing features to all 
users, which means to have an unprivileged access-control. However, I've 
reach a similar conclusion about eBPF for unprivileged access-control, 
but for slightly different reasons.

eBPF is very powerful and I proved with Landlock that it is possible to 
implement an access-control with it. However, a programmatic 
access-control does not fit well with unprivileged principles (i.e. 
innocuous composability). First, it can be used for side-channel attacks 
against (other) access-controls. Second, composability of eBPF programs 
imply to execute a stack of programs, which does not scale well. Third, 
as shown by multiple attacks, it is quite challenging to safely expose 
eBPF to malicious processes.

I'm working on a version of Landlock without eBPF, but still with the 
initial sought properties: safe unprivileged composability, modularity, 
and dynamic update. I'll send this version soon.

I hope that the work and experience from Landlock to bring eBPF to LSM 
will continue to be used through KRSI. Landlock will now focus on the 
unprivileged sandboxing part, without eBPF. Stay tuned!


> 
> # Why not tracepoints or kprobes?
> 
> In order to do MAC with tracepoints or kprobes, we would need to
> override the return value of the security hook. This is not possible
> with tracepoints or call-site kprobes.
> 
> Attaching to the return boundary (kretprobe) implies that BPF programs
> would always get called after all the other LSM hooks are called and
> clobber the pre-existing LSM semantics.
> 
> Enforcing MAC policy with an actual LSM helps leverage the verified
> semantics of the framework.
> 
> # Usage Examples
> 
> A simple example and some documentation is included in the patchset.
> 
> In order to better illustrate the capabilities of the framework some
> more advanced prototype code has also been published separately:
> 
> * Logging execution events (including environment variables and arguments):
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> * Detecting deletion of running executables:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_detect_exec_unlink.c
> * Detection of writes to /proc/<pid>/mem:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> 
> We have updated Google's internal telemetry infrastructure and have
> started deploying this LSM on our Linux Workstations. This gives us more
> confidence in the real-world applications of such a system.
> 
> KP Singh (13):
>    bpf: Refactor BPF_EVENT context macros to its own header.
>    bpf: lsm: Add a skeleton and config options
>    bpf: lsm: Introduce types for eBPF based LSM
>    bpf: lsm: Allow btf_id based attachment for LSM hooks
>    tools/libbpf: Add support in libbpf for BPF_PROG_TYPE_LSM
>    bpf: lsm: Init Hooks and create files in securityfs
>    bpf: lsm: Implement attach, detach and execution.
>    bpf: lsm: Show attached program names in hook read handler.
>    bpf: lsm: Add a helper function bpf_lsm_event_output
>    bpf: lsm: Handle attachment of the same program
>    tools/libbpf: Add bpf_program__attach_lsm
>    bpf: lsm: Add selftests for BPF_PROG_TYPE_LSM
>    bpf: lsm: Add Documentation
> 
>   Documentation/security/bpf.rst                |  164 +++
>   Documentation/security/index.rst              |    1 +
>   MAINTAINERS                                   |   11 +
>   include/linux/bpf_event.h                     |   78 ++
>   include/linux/bpf_lsm.h                       |   25 +
>   include/linux/bpf_types.h                     |    4 +
>   include/trace/bpf_probe.h                     |   30 +-
>   include/uapi/linux/bpf.h                      |   12 +-
>   kernel/bpf/syscall.c                          |   10 +
>   kernel/bpf/verifier.c                         |   84 +-
>   kernel/trace/bpf_trace.c                      |   24 +-
>   security/Kconfig                              |   11 +-
>   security/Makefile                             |    2 +
>   security/bpf/Kconfig                          |   25 +
>   security/bpf/Makefile                         |    7 +
>   security/bpf/include/bpf_lsm.h                |   63 +
>   security/bpf/include/fs.h                     |   23 +
>   security/bpf/include/hooks.h                  | 1015 +++++++++++++++++
>   security/bpf/lsm.c                            |  160 +++
>   security/bpf/lsm_fs.c                         |  176 +++
>   security/bpf/ops.c                            |  224 ++++
>   tools/include/uapi/linux/bpf.h                |   12 +-
>   tools/lib/bpf/bpf.c                           |    2 +-
>   tools/lib/bpf/bpf.h                           |    6 +
>   tools/lib/bpf/libbpf.c                        |  163 ++-
>   tools/lib/bpf/libbpf.h                        |    4 +
>   tools/lib/bpf/libbpf.map                      |    7 +
>   tools/lib/bpf/libbpf_probes.c                 |    1 +
>   .../bpf/prog_tests/lsm_mprotect_audit.c       |  129 +++
>   .../selftests/bpf/progs/lsm_mprotect_audit.c  |   58 +
>   30 files changed, 2451 insertions(+), 80 deletions(-)
>   create mode 100644 Documentation/security/bpf.rst
>   create mode 100644 include/linux/bpf_event.h
>   create mode 100644 include/linux/bpf_lsm.h
>   create mode 100644 security/bpf/Kconfig
>   create mode 100644 security/bpf/Makefile
>   create mode 100644 security/bpf/include/bpf_lsm.h
>   create mode 100644 security/bpf/include/fs.h
>   create mode 100644 security/bpf/include/hooks.h
>   create mode 100644 security/bpf/lsm.c
>   create mode 100644 security/bpf/lsm_fs.c
>   create mode 100644 security/bpf/ops.c
>   create mode 100644 tools/testing/selftests/bpf/prog_tests/lsm_mprotect_audit.c
>   create mode 100644 tools/testing/selftests/bpf/progs/lsm_mprotect_audit.c
> 

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Serge E. Hallyn @ 2019-12-21  3:08 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Michael Kerrisk, Serge E. Hallyn, Andy Lutomirski,
	linux-security-module, ksrot
In-Reply-To: <CALQRfL4_kVKX=qJ0nDnEGgVeUHtQG5BDTCX72KfZr0toc-Mpuw@mail.gmail.com>

I believe this is coming from the kernel.  I booted an old 12.04
ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows 
Capabilities for `1': =ep
This was running
libcap 2.22.  Same version of libcap compiled on my laptop gives me the
long output.  booting an old 14.04 livecd, running 4.4 kernel,
getpcap also gives the long output.

On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> Serge might want to comment. I seem to recall that the issue was relevant
> for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> But it has been more than a decade and I can't seem to find the email
> exchange we had back then.
> 
> Cheers
> 
> Andrew
> 
> 
> On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> mtk.manpages@gmail.com> wrote:
> 
> > Hello Andrew,
> >
> > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > [Resend with reply-all this time.]
> > >
> > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > <mtk.manpages@gmail.com> wrote:
> > >>
> > >> Hello Andrew,
> > >>
> > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > wrote:
> > >>>
> > >>> This changed with this commit I think:
> > >>>
> > >>>
> > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > >>>
> > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > >>> all the bitmasks in e and p. When we started to clip the
> > >>> bounding set to only defined capabilities, it meant that the
> > >>> text output started to look like
> > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > >>> was quite terrible.
> > >>
> > >> Was that really the change that caused this? That's a 2008 commit.
> > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > >> presume (but do not recall for sure) that I was using a system with a
> > >> libcap more recent than v2.11 (of which that commit was a part).
> > >
> > > The only other commit that seems relevant was this one:
> > >
> > >
> > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > >
> > > But I think this was all part of the same effort.
> > >
> > >>> So, this was seen as the least worst option.
> > >>
> > >> But surely this is fixable? Or, to put it another was, I presume
> > >> there's something that makes this difficult to fix in getpcaps, but
> > >> what is that something?
> > >
> > > I recall spending a day or more trying to avoid it, but I can't see
> > > how it is really fixable because there are too many moving parts.
> > >
> > > If the kernel adds another capability, then =ep could reasonably mean
> > > both the "old full set" or the "new full set". There are contexts
> > > where the difference matters. For example, where folk are using text
> > > representations for things like package manager shell-script setups.
> > > What they get when they say "=ep
> > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > the way it is at present, we are very explicit about what is in
> > > effect
> >
> > I can sort of see how the idea you are expressing might
> > apply when *setting* capabilities on *files*, but:
> >
> > a) I'm talking about the *display* of capabilities of a running
> > *process* using getpcaps(8).
> >
> > b) In practice, the logic that actually applies when setting
> > capabilities on files seems to run *counter* to the idea
> > that you express above (if I understand you correctly),
> > and your argument seems more relevant to files (especially
> > when *setting* file capabilities) than to processes.
> >
> > Consider the following examples:
> >
> > 1. A binary that has all but one capability is described in a
> > compact way by getcap(8):
> >
> >         $ sudo setcap "=p cap_kill-p" mypog
> >         $ getcap mypog
> >         mypog =p cap_kill-p
> >
> > When that same binary is run, the process's capabilities
> > are described with a very different format by getpcaps(8)
> >
> >         $ getpcaps $(pidof i_syscall)
> >         Capabilities for `152006': =
> >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> >         cap_block_suspend,cap_audit_read+p
> >
> > That is quite inconsistent! And also, the second notation is
> > simply very hard to read. How many capabilities are listed there?
> > Is it all of them? (When a process does have all caps in permitted,
> > the display differs only by one item.) A security-related notation
> > that is difficult to read is a risk... [*]
> >
> > 2. I just now tried the following, and it really surprised me
> > (although the reasons are quickly obvious when one considers
> > the point I made earlier in this mail thread that 'setcap =p' is
> > setting *64* bits in the file's permittted set):
> >
> >         # Set "all" permitted capabilities on a file:
> >
> >         $ sudo setcap =p myprog
> >         $ getcap myprog
> >         myprog =p
> >
> >         # Set "all" 38 permitted capabilities on a file, by specifying
> >         # them individually:
> >
> >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> >         echo -n ",$c"; done)=p myprog
> >         $ getcap myprog
> >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> >         cap_block_suspend,cap_audit_read+p
> >
> > I think there would be few users who would *not* be surprised
> > about the fact that two steps that seem equivalent produce
> > quite differnt output from getcap(8)!
> >
> > 3. Suppose I set all permitted capabilities on a binary:
> >
> >         $ sudo setcap =p myprog
> >
> > Then actually, I have set not just the 38 existing capabilities,
> > but also 26 future capabilities, so that when "cap_self_destruct"
> > is added to the kernel, 'myprog' already has it. This seems to
> > run directly counter to your argument above (if I have understood
> > it correctly).
> >
> > My summary from the above:
> >
> > * The output notation from getpcaps(8) is (1) inconsistent (with
> > getcap(8)), and (2) difficult to read, two things that strike me
> > as risk factors in a security-related notation.
> >
> > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > bits in the permitted set is surprising, and perhaps also a
> > security risk when new capabilities are actually added to the
> > kernel, since existing binaries will automatically have those
> > capabilities.
> >
> > Thanks,
> >
> > Michael
> >
> > [*] I often joke that the cap_to_text(3) notation is one that is
> > "human-readable, but not necessarily human-comprehensible", but
> > at the same time I also note that the notation has one virtue:
> > it is compact. However, that one virtue seems to have gone out
> > the window for getpcaps(8).
> >
> > --
> > Michael Kerrisk
> > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > Linux/UNIX System Programming Training: http://man7.org/training/
> >

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Andrew G. Morgan @ 2019-12-21  3:27 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Michael Kerrisk, Andy Lutomirski, linux-security-module, ksrot
In-Reply-To: <20191221030845.GA3626@mail.hallyn.com>

/proc/1/status changed at some point to drop the undefined bits. When
it did I changed libcap to try harder to not display unnamed
capabilities.

Cheers

Andrew


On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> I believe this is coming from the kernel.  I booted an old 12.04
> ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> Capabilities for `1': =ep
> This was running
> libcap 2.22.  Same version of libcap compiled on my laptop gives me the
> long output.  booting an old 14.04 livecd, running 4.4 kernel,
> getpcap also gives the long output.
>
> On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > Serge might want to comment. I seem to recall that the issue was relevant
> > for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> > But it has been more than a decade and I can't seem to find the email
> > exchange we had back then.
> >
> > Cheers
> >
> > Andrew
> >
> >
> > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > mtk.manpages@gmail.com> wrote:
> >
> > > Hello Andrew,
> > >
> > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > [Resend with reply-all this time.]
> > > >
> > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > > <mtk.manpages@gmail.com> wrote:
> > > >>
> > > >> Hello Andrew,
> > > >>
> > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > > wrote:
> > > >>>
> > > >>> This changed with this commit I think:
> > > >>>
> > > >>>
> > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > >>>
> > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > >>> all the bitmasks in e and p. When we started to clip the
> > > >>> bounding set to only defined capabilities, it meant that the
> > > >>> text output started to look like
> > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > >>> was quite terrible.
> > > >>
> > > >> Was that really the change that caused this? That's a 2008 commit.
> > > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > > >> presume (but do not recall for sure) that I was using a system with a
> > > >> libcap more recent than v2.11 (of which that commit was a part).
> > > >
> > > > The only other commit that seems relevant was this one:
> > > >
> > > >
> > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > >
> > > > But I think this was all part of the same effort.
> > > >
> > > >>> So, this was seen as the least worst option.
> > > >>
> > > >> But surely this is fixable? Or, to put it another was, I presume
> > > >> there's something that makes this difficult to fix in getpcaps, but
> > > >> what is that something?
> > > >
> > > > I recall spending a day or more trying to avoid it, but I can't see
> > > > how it is really fixable because there are too many moving parts.
> > > >
> > > > If the kernel adds another capability, then =ep could reasonably mean
> > > > both the "old full set" or the "new full set". There are contexts
> > > > where the difference matters. For example, where folk are using text
> > > > representations for things like package manager shell-script setups.
> > > > What they get when they say "=ep
> > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > > the way it is at present, we are very explicit about what is in
> > > > effect
> > >
> > > I can sort of see how the idea you are expressing might
> > > apply when *setting* capabilities on *files*, but:
> > >
> > > a) I'm talking about the *display* of capabilities of a running
> > > *process* using getpcaps(8).
> > >
> > > b) In practice, the logic that actually applies when setting
> > > capabilities on files seems to run *counter* to the idea
> > > that you express above (if I understand you correctly),
> > > and your argument seems more relevant to files (especially
> > > when *setting* file capabilities) than to processes.
> > >
> > > Consider the following examples:
> > >
> > > 1. A binary that has all but one capability is described in a
> > > compact way by getcap(8):
> > >
> > >         $ sudo setcap "=p cap_kill-p" mypog
> > >         $ getcap mypog
> > >         mypog =p cap_kill-p
> > >
> > > When that same binary is run, the process's capabilities
> > > are described with a very different format by getpcaps(8)
> > >
> > >         $ getpcaps $(pidof i_syscall)
> > >         Capabilities for `152006': =
> > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > >         cap_block_suspend,cap_audit_read+p
> > >
> > > That is quite inconsistent! And also, the second notation is
> > > simply very hard to read. How many capabilities are listed there?
> > > Is it all of them? (When a process does have all caps in permitted,
> > > the display differs only by one item.) A security-related notation
> > > that is difficult to read is a risk... [*]
> > >
> > > 2. I just now tried the following, and it really surprised me
> > > (although the reasons are quickly obvious when one considers
> > > the point I made earlier in this mail thread that 'setcap =p' is
> > > setting *64* bits in the file's permittted set):
> > >
> > >         # Set "all" permitted capabilities on a file:
> > >
> > >         $ sudo setcap =p myprog
> > >         $ getcap myprog
> > >         myprog =p
> > >
> > >         # Set "all" 38 permitted capabilities on a file, by specifying
> > >         # them individually:
> > >
> > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > >         echo -n ",$c"; done)=p myprog
> > >         $ getcap myprog
> > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > >         cap_block_suspend,cap_audit_read+p
> > >
> > > I think there would be few users who would *not* be surprised
> > > about the fact that two steps that seem equivalent produce
> > > quite differnt output from getcap(8)!
> > >
> > > 3. Suppose I set all permitted capabilities on a binary:
> > >
> > >         $ sudo setcap =p myprog
> > >
> > > Then actually, I have set not just the 38 existing capabilities,
> > > but also 26 future capabilities, so that when "cap_self_destruct"
> > > is added to the kernel, 'myprog' already has it. This seems to
> > > run directly counter to your argument above (if I have understood
> > > it correctly).
> > >
> > > My summary from the above:
> > >
> > > * The output notation from getpcaps(8) is (1) inconsistent (with
> > > getcap(8)), and (2) difficult to read, two things that strike me
> > > as risk factors in a security-related notation.
> > >
> > > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > > bits in the permitted set is surprising, and perhaps also a
> > > security risk when new capabilities are actually added to the
> > > kernel, since existing binaries will automatically have those
> > > capabilities.
> > >
> > > Thanks,
> > >
> > > Michael
> > >
> > > [*] I often joke that the cap_to_text(3) notation is one that is
> > > "human-readable, but not necessarily human-comprehensible", but
> > > at the same time I also note that the notation has one virtue:
> > > it is compact. However, that one virtue seems to have gone out
> > > the window for getpcaps(8).
> > >
> > > --
> > > Michael Kerrisk
> > > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > > Linux/UNIX System Programming Training: http://man7.org/training/
> > >

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Serge E. Hallyn @ 2019-12-21  4:36 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Michael Kerrisk, Andy Lutomirski,
	linux-security-module, ksrot
In-Reply-To: <CALQRfL4O2CgkAdOxhsqN87MBDDaVhRaHBRVMu+hecQq-Fu8q-A@mail.gmail.com>

I think the thing to do is use /proc/sys/kernel/cap_last_cap (if it
exists) to compare *caps to ~(cap_last_cap-1) and for each bit where
it's the same, show the full set.  Been looking at the fn for a bit,
and I might be comfortable enough to try it at this point.  But, if
you want to do it (to make sure it's done right :), please let me know.

-serge

On Fri, Dec 20, 2019 at 07:27:46PM -0800, Andrew G. Morgan wrote:
> /proc/1/status changed at some point to drop the undefined bits. When
> it did I changed libcap to try harder to not display unnamed
> capabilities.
> 
> Cheers
> 
> Andrew
> 
> 
> On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > I believe this is coming from the kernel.  I booted an old 12.04
> > ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> > Capabilities for `1': =ep
> > This was running
> > libcap 2.22.  Same version of libcap compiled on my laptop gives me the
> > long output.  booting an old 14.04 livecd, running 4.4 kernel,
> > getpcap also gives the long output.
> >
> > On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > > Serge might want to comment. I seem to recall that the issue was relevant
> > > for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> > > But it has been more than a decade and I can't seem to find the email
> > > exchange we had back then.
> > >
> > > Cheers
> > >
> > > Andrew
> > >
> > >
> > > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > > mtk.manpages@gmail.com> wrote:
> > >
> > > > Hello Andrew,
> > > >
> > > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > > [Resend with reply-all this time.]
> > > > >
> > > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > > > <mtk.manpages@gmail.com> wrote:
> > > > >>
> > > > >> Hello Andrew,
> > > > >>
> > > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > > > wrote:
> > > > >>>
> > > > >>> This changed with this commit I think:
> > > > >>>
> > > > >>>
> > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > > >>>
> > > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > > >>> all the bitmasks in e and p. When we started to clip the
> > > > >>> bounding set to only defined capabilities, it meant that the
> > > > >>> text output started to look like
> > > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > > >>> was quite terrible.
> > > > >>
> > > > >> Was that really the change that caused this? That's a 2008 commit.
> > > > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > > > >> presume (but do not recall for sure) that I was using a system with a
> > > > >> libcap more recent than v2.11 (of which that commit was a part).
> > > > >
> > > > > The only other commit that seems relevant was this one:
> > > > >
> > > > >
> > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > > >
> > > > > But I think this was all part of the same effort.
> > > > >
> > > > >>> So, this was seen as the least worst option.
> > > > >>
> > > > >> But surely this is fixable? Or, to put it another was, I presume
> > > > >> there's something that makes this difficult to fix in getpcaps, but
> > > > >> what is that something?
> > > > >
> > > > > I recall spending a day or more trying to avoid it, but I can't see
> > > > > how it is really fixable because there are too many moving parts.
> > > > >
> > > > > If the kernel adds another capability, then =ep could reasonably mean
> > > > > both the "old full set" or the "new full set". There are contexts
> > > > > where the difference matters. For example, where folk are using text
> > > > > representations for things like package manager shell-script setups.
> > > > > What they get when they say "=ep
> > > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > > > the way it is at present, we are very explicit about what is in
> > > > > effect
> > > >
> > > > I can sort of see how the idea you are expressing might
> > > > apply when *setting* capabilities on *files*, but:
> > > >
> > > > a) I'm talking about the *display* of capabilities of a running
> > > > *process* using getpcaps(8).
> > > >
> > > > b) In practice, the logic that actually applies when setting
> > > > capabilities on files seems to run *counter* to the idea
> > > > that you express above (if I understand you correctly),
> > > > and your argument seems more relevant to files (especially
> > > > when *setting* file capabilities) than to processes.
> > > >
> > > > Consider the following examples:
> > > >
> > > > 1. A binary that has all but one capability is described in a
> > > > compact way by getcap(8):
> > > >
> > > >         $ sudo setcap "=p cap_kill-p" mypog
> > > >         $ getcap mypog
> > > >         mypog =p cap_kill-p
> > > >
> > > > When that same binary is run, the process's capabilities
> > > > are described with a very different format by getpcaps(8)
> > > >
> > > >         $ getpcaps $(pidof i_syscall)
> > > >         Capabilities for `152006': =
> > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > >         cap_block_suspend,cap_audit_read+p
> > > >
> > > > That is quite inconsistent! And also, the second notation is
> > > > simply very hard to read. How many capabilities are listed there?
> > > > Is it all of them? (When a process does have all caps in permitted,
> > > > the display differs only by one item.) A security-related notation
> > > > that is difficult to read is a risk... [*]
> > > >
> > > > 2. I just now tried the following, and it really surprised me
> > > > (although the reasons are quickly obvious when one considers
> > > > the point I made earlier in this mail thread that 'setcap =p' is
> > > > setting *64* bits in the file's permittted set):
> > > >
> > > >         # Set "all" permitted capabilities on a file:
> > > >
> > > >         $ sudo setcap =p myprog
> > > >         $ getcap myprog
> > > >         myprog =p
> > > >
> > > >         # Set "all" 38 permitted capabilities on a file, by specifying
> > > >         # them individually:
> > > >
> > > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > > >         echo -n ",$c"; done)=p myprog
> > > >         $ getcap myprog
> > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > >         cap_block_suspend,cap_audit_read+p
> > > >
> > > > I think there would be few users who would *not* be surprised
> > > > about the fact that two steps that seem equivalent produce
> > > > quite differnt output from getcap(8)!
> > > >
> > > > 3. Suppose I set all permitted capabilities on a binary:
> > > >
> > > >         $ sudo setcap =p myprog
> > > >
> > > > Then actually, I have set not just the 38 existing capabilities,
> > > > but also 26 future capabilities, so that when "cap_self_destruct"
> > > > is added to the kernel, 'myprog' already has it. This seems to
> > > > run directly counter to your argument above (if I have understood
> > > > it correctly).
> > > >
> > > > My summary from the above:
> > > >
> > > > * The output notation from getpcaps(8) is (1) inconsistent (with
> > > > getcap(8)), and (2) difficult to read, two things that strike me
> > > > as risk factors in a security-related notation.
> > > >
> > > > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > > > bits in the permitted set is surprising, and perhaps also a
> > > > security risk when new capabilities are actually added to the
> > > > kernel, since existing binaries will automatically have those
> > > > capabilities.
> > > >
> > > > Thanks,
> > > >
> > > > Michael
> > > >
> > > > [*] I often joke that the cap_to_text(3) notation is one that is
> > > > "human-readable, but not necessarily human-comprehensible", but
> > > > at the same time I also note that the notation has one virtue:
> > > > it is compact. However, that one virtue seems to have gone out
> > > > the window for getpcaps(8).
> > > >
> > > > --
> > > > Michael Kerrisk
> > > > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > > > Linux/UNIX System Programming Training: http://man7.org/training/
> > > >

^ permalink raw reply

* Re: [PATCH v1 - RFC] ima: export the measurement list when needed
From: Janne Karhunen @ 2019-12-21 10:41 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: linux-integrity, linux-security-module, Ken Goldman,
	david.safford, monty.wiseman
In-Reply-To: <1576850665.5241.52.camel@linux.ibm.com>

On Fri, Dec 20, 2019 at 4:04 PM Mimi Zohar <zohar@linux.ibm.com> wrote:

> Should the kernel be involved in writing the IMA measurement list to a
> file or, as Dave suggested, this should be delegated to a userspace
> application?

That is a good question. I went this way as it did not feel right to
me that the kernel would depend on periodic, reliable userspace
functionality to stay running (we would have a circular dependency).
The thing is, once the kernel starts to run low on memory, it may kill
that periodic daemon flushing the data for reasons unrelated to IMA.


-- 
Janne

^ permalink raw reply

* Re: [PATCH v1 - RFC] ima: export the measurement list when needed
From: Janne Karhunen @ 2019-12-21 11:03 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: linux-integrity, linux-security-module, Ken Goldman,
	david.safford, monty.wiseman
In-Reply-To: <CAE=NcrZUyLe1Ftk5wOuEMJBPnw+DBx9LACbk1JPJcpg8VdDiJQ@mail.gmail.com>

On Sat, Dec 21, 2019 at 12:41 PM Janne Karhunen
<janne.karhunen@gmail.com> wrote:

> > Should the kernel be involved in writing the IMA measurement list to a
> > file or, as Dave suggested, this should be delegated to a userspace
> > application?
>
> That is a good question. I went this way as it did not feel right to
> me that the kernel would depend on periodic, reliable userspace
> functionality to stay running (we would have a circular dependency).
> The thing is, once the kernel starts to run low on memory, it may kill
> that periodic daemon flushing the data for reasons unrelated to IMA.

Besides the dependency, I think the requirement should be that we can
survive the basic test of 'while true; do touch $RANDOM; done' at
least until we run out of allocated diskspace. While arranging this
with userspace flushers is not impossible, it is order of magnitude
more complex to do correctly than just letting the kernel write the
file. Even if it feels somewhat unorthodox.

Above patch survives that test case with 3 line addition via a
workqueue. Once the admin points IMA to some mount, the above test
case (while loop creating files full speed) will run a long, long
time. Effectively this is really just kernel doing its own memory
management as it should. Flush out the dirty pages you do not really
need to stay running.


--
Janne

^ permalink raw reply

* Re: Looks like issue in handling active_nodes count in 4.19 kernel
From: Paul Moore @ 2019-12-21 16:02 UTC (permalink / raw)
  To: Ravi Kumar Siddojigari
  Cc: selinux, linux-security-module, Stephen Smalley, Jaihind Yadav
In-Reply-To: <1576843382-24184-1-git-send-email-rsiddoji@codeaurora.org>

On Fri, Dec 20, 2019 at 7:03 AM Ravi Kumar Siddojigari
<rsiddoji@codeaurora.org> wrote:
> Thanks for correcting , Adding the signoff of orginal author in the
> following commit .
>
> From 6308b405e2097ab9d82c5a3894815daf7331e0b6 Mon Sep 17 00:00:00 2001
> From: Jaihind Yadav <jaihindyadav@codeaurora.org>
> Date: Tue, 17 Dec 2019 17:25:47 +0530
> Subject: [PATCH] selinux: ensure we cleanup the internal AVC counters on error
>  in avc_update()
> To: rsiddoji@codeaurora.org
>
> In AVC update we don't call avc_node_kill() when avc_xperms_populate()
> fails, resulting in the avc->avc_cache.active_nodes counter having a
> false value.In last patch this changes was missed , so correcting it.
>
>
> Signed-off-by: Jaihind Yadav <jaihindyadav@codeaurora.org>
> Signed-off-by: Ravi Kumar Siddojigari <rsiddoji@codeaurora.org>
> ---
>  security/selinux/avc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Merged into selinux/next, thanks!

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH bpf-next v1 00/13] MAC and Audit policy using eBPF (KRSI)
From: Alexei Starovoitov @ 2019-12-22  1:27 UTC (permalink / raw)
  To: KP Singh
  Cc: linux-kernel, bpf, linux-security-module, Alexei Starovoitov,
	Daniel Borkmann, James Morris, Kees Cook, Thomas Garnier,
	Michael Halcrow, Paul Turner, Brendan Gregg, Jann Horn,
	Matthew Garrett, Christian Brauner, Mickaël Salaün,
	Florent Revest, Brendan Jackman, Martin KaFai Lau, Song Liu,
	Yonghong Song, Serge E. Hallyn, Mauro Carvalho Chehab,
	David S. Miller, Greg Kroah-Hartman, Nicolas Ferre,
	Stanislav Fomichev, Quentin Monnet, Andrey Ignatov, Joe Stringer
In-Reply-To: <20191220154208.15895-1-kpsingh@chromium.org>

On Fri, Dec 20, 2019 at 04:41:55PM +0100, KP Singh wrote:
> // Declare the eBPF program mprotect_audit which attaches to
> // to the file_mprotect LSM hook and accepts three arguments.
> BPF_TRACE_3("lsm/file_mprotect", mprotect_audit,
> 	    struct vm_area_struct *, vma,
> 	    unsigned long, reqprot, unsigned long, prot
> {
> 	unsigned long vm_start = _(vma->vm_start);
> 	return 0;
> }

I think the only sore point of the patchset is:
security/bpf/include/hooks.h   | 1015 ++++++++++++++++++++++++++++++++
With bpf trampoline this type of 'kernel types -> bpf types' converters
are no longer necessary. Please take a look at tcp-congestion-control patchset:
https://patchwork.ozlabs.org/cover/1214417/
Instead of doing similar thing (like your patch 1 plus patch 6) it's using
trampoline to provide bpf based congestion control callbacks into tcp stack.
The same trampoline-based mechanism can be reused by bpf_lsm.
Then all manual work of doing BPF_LSM_HOOK(...) for every hook won't be
necessary. It will also prove the point that attaching BPF to raw LSM hooks
doesn't freeze them into stable abi.
The programs can keep the same syntax as in your examples:
BPF_TRACE_3("lsm/file_mprotect", mprotect_audit,
libbpf will find file_mprotect's btf_id in kernel vmlinux and pass it to
the kernel for attaching. Just like fentry/fexit bpf progs are doing
and just like bpf-based cc is doing as well.

> In order to better illustrate the capabilities of the framework some
> more advanced prototype code has also been published separately:
> 
> * Logging execution events (including environment variables and arguments):
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c
> * Detecting deletion of running executables:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_detect_exec_unlink.c
> * Detection of writes to /proc/<pid>/mem:
> https://github.com/sinkap/linux-krsi/blob/patch/v1/examples/samples/bpf/lsm_audit_env.c

Thank you for sharing these examples. That definitely helps to see more
complete picture. I noticed that the examples are using the pattern:
  u32 map_id = 0;
  env = bpf_map_lookup_elem(&env_map, &map_id);
Essentially they're global variables. libbpf already supports them.
bpf prog can use them as:
  struct env_value env;
  int bpf_prog(..)
  {
    env.name... env.value..
  }
That will make progs a bit easier to read and faster too.
Accesing global vars from user space is also trivial with skeleton work:
  lsm_audit_env_skel->bss->env.name... env.value.
Both bpf side and user side can access globals as normal C variables.

There is a small issue in the patches 8 and 10.
bpf program names are not unique and bpf-lsm should not require them to be different.
bpf_attr->prog_name is also short at 16 bytes. It's for introspection only.
Longer program names are supplied via btf's func_info.
It feels that:
cat /sys/kernel/security/bpf/process_execution
env_dumper__v2
is reinventing the wheel. bpftool is the main introspection tool.
It can print progs attached to perf, cgroup, networking. I think it's better to
stay consistent and do the same with bpf-lsm.

Another issue is in proposed attaching method:
hook_fd = open("/sys/kernel/security/bpf/process_execution");
sys_bpf(attach, prog_fd, hook_fd);
With bpf tracing we moved to FD-based attaching, because permanent attaching is
problematic in production. We're going to provide FD-based api to attach to
networking as well, because xdp/tc/cgroup prog attaching suffers from the same
production issues. Mainly with permanent attaching there is no ownership of
attachment. Everything is global and permanent. It's not clear what
process/script suppose to detach/cleanup. I suggest bpf-lsm use FD-based
attaching from the beginning. Take a look at raw_tp/tp_btf/fentry/fexit style
of attaching. All of them return FD which represents what libbpf calls
'bpf_link' concept. Once refcnt of that FD goes to zero that link (attachment)
is destroyed and program is detached _by the kernel_. To make such links
permanent the application can pin them in bpffs. The pinning patches haven't
landed yet, but the concept of the link is quite powerful and much more
production friendly than permanent attaching.
bpf-lsm will still be able to attach multiple progs to the same hook and
see what is attached via bpftool.

The rest looks good. Thank you for working on it.

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Andrew G. Morgan @ 2019-12-22 18:00 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Michael Kerrisk, Andy Lutomirski, linux-security-module, ksrot
In-Reply-To: <20191221043622.GA4580@mail.hallyn.com>

I can take a look, but I'm not really convinced this isn't going to
cause me grief at some point in the future when the kernel adds
support for another capability and all' does not equal all. I've done
some digging and I thought it was worth writing up.

Today, the kernel treats 'all' (as it is defined by libcap today)
differently for file and process capabilities. I suspect this is
contributing to expectations about what all means, because all as
defined today is effectively useless, and somewhere in the kernel
we've worked around it for filecaps when we shouldn't have.

Explanation of kernel bug (no use of 'all' here):

1) placing, say, bit-63 (undef today) and bit-0 (cap_chown) on the
permitted (fP) vector of a file, yields a file that can execute - it
gets pP=cap_chown.

cd libcap/progs ; make ; cp capsh pcapsh ; sudo setcap
63,cap_setpcap,cap_chown=p pcapsh

./pcapsh --print

2) dropping cap_chown from the bounding set causes this same file to
not run (insufficient privilege) because the file has been setup to
require cap_chown, but can't have it so (by design) we deny execution
altogether -- this fails:

./pcapsh --drop=cap_chown --print == --print

To be consistent, (1) should fail because bit-63 is being dropped by
the bounding set at execution time. That is, a binary that requires
some future definition for capability #63 to run successfully should
fail on a kernel that does not support it today.  This but was
introduced with this kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=7d8b6c63751cfbbe5eef81a48c22978b3407a3ad

Ironically, the commit message declares the fact that (4) now passes
is the fix. The real fix is to think about what we mean by 'all'.

Cheers

Andrew

On Fri, Dec 20, 2019 at 8:36 PM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> I think the thing to do is use /proc/sys/kernel/cap_last_cap (if it
> exists) to compare *caps to ~(cap_last_cap-1) and for each bit where
> it's the same, show the full set.  Been looking at the fn for a bit,
> and I might be comfortable enough to try it at this point.  But, if
> you want to do it (to make sure it's done right :), please let me know.
>
> -serge
>
> On Fri, Dec 20, 2019 at 07:27:46PM -0800, Andrew G. Morgan wrote:
> > /proc/1/status changed at some point to drop the undefined bits. When
> > it did I changed libcap to try harder to not display unnamed
> > capabilities.
> >
> > Cheers
> >
> > Andrew
> >
> >
> > On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > >
> > > I believe this is coming from the kernel.  I booted an old 12.04
> > > ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> > > Capabilities for `1': =ep
> > > This was running
> > > libcap 2.22.  Same version of libcap compiled on my laptop gives me the
> > > long output.  booting an old 14.04 livecd, running 4.4 kernel,
> > > getpcap also gives the long output.
> > >
> > > On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > > > Serge might want to comment. I seem to recall that the issue was relevant
> > > > for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> > > > But it has been more than a decade and I can't seem to find the email
> > > > exchange we had back then.
> > > >
> > > > Cheers
> > > >
> > > > Andrew
> > > >
> > > >
> > > > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > > > mtk.manpages@gmail.com> wrote:
> > > >
> > > > > Hello Andrew,
> > > > >
> > > > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > > > [Resend with reply-all this time.]
> > > > > >
> > > > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > > > > <mtk.manpages@gmail.com> wrote:
> > > > > >>
> > > > > >> Hello Andrew,
> > > > > >>
> > > > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > > > > wrote:
> > > > > >>>
> > > > > >>> This changed with this commit I think:
> > > > > >>>
> > > > > >>>
> > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > > > >>>
> > > > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > > > >>> all the bitmasks in e and p. When we started to clip the
> > > > > >>> bounding set to only defined capabilities, it meant that the
> > > > > >>> text output started to look like
> > > > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > > > >>> was quite terrible.
> > > > > >>
> > > > > >> Was that really the change that caused this? That's a 2008 commit.
> > > > > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > > > > >> presume (but do not recall for sure) that I was using a system with a
> > > > > >> libcap more recent than v2.11 (of which that commit was a part).
> > > > > >
> > > > > > The only other commit that seems relevant was this one:
> > > > > >
> > > > > >
> > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > > > >
> > > > > > But I think this was all part of the same effort.
> > > > > >
> > > > > >>> So, this was seen as the least worst option.
> > > > > >>
> > > > > >> But surely this is fixable? Or, to put it another was, I presume
> > > > > >> there's something that makes this difficult to fix in getpcaps, but
> > > > > >> what is that something?
> > > > > >
> > > > > > I recall spending a day or more trying to avoid it, but I can't see
> > > > > > how it is really fixable because there are too many moving parts.
> > > > > >
> > > > > > If the kernel adds another capability, then =ep could reasonably mean
> > > > > > both the "old full set" or the "new full set". There are contexts
> > > > > > where the difference matters. For example, where folk are using text
> > > > > > representations for things like package manager shell-script setups.
> > > > > > What they get when they say "=ep
> > > > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > > > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > > > > the way it is at present, we are very explicit about what is in
> > > > > > effect
> > > > >
> > > > > I can sort of see how the idea you are expressing might
> > > > > apply when *setting* capabilities on *files*, but:
> > > > >
> > > > > a) I'm talking about the *display* of capabilities of a running
> > > > > *process* using getpcaps(8).
> > > > >
> > > > > b) In practice, the logic that actually applies when setting
> > > > > capabilities on files seems to run *counter* to the idea
> > > > > that you express above (if I understand you correctly),
> > > > > and your argument seems more relevant to files (especially
> > > > > when *setting* file capabilities) than to processes.
> > > > >
> > > > > Consider the following examples:
> > > > >
> > > > > 1. A binary that has all but one capability is described in a
> > > > > compact way by getcap(8):
> > > > >
> > > > >         $ sudo setcap "=p cap_kill-p" mypog
> > > > >         $ getcap mypog
> > > > >         mypog =p cap_kill-p
> > > > >
> > > > > When that same binary is run, the process's capabilities
> > > > > are described with a very different format by getpcaps(8)
> > > > >
> > > > >         $ getpcaps $(pidof i_syscall)
> > > > >         Capabilities for `152006': =
> > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > >         cap_block_suspend,cap_audit_read+p
> > > > >
> > > > > That is quite inconsistent! And also, the second notation is
> > > > > simply very hard to read. How many capabilities are listed there?
> > > > > Is it all of them? (When a process does have all caps in permitted,
> > > > > the display differs only by one item.) A security-related notation
> > > > > that is difficult to read is a risk... [*]
> > > > >
> > > > > 2. I just now tried the following, and it really surprised me
> > > > > (although the reasons are quickly obvious when one considers
> > > > > the point I made earlier in this mail thread that 'setcap =p' is
> > > > > setting *64* bits in the file's permittted set):
> > > > >
> > > > >         # Set "all" permitted capabilities on a file:
> > > > >
> > > > >         $ sudo setcap =p myprog
> > > > >         $ getcap myprog
> > > > >         myprog =p
> > > > >
> > > > >         # Set "all" 38 permitted capabilities on a file, by specifying
> > > > >         # them individually:
> > > > >
> > > > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > > > >         echo -n ",$c"; done)=p myprog
> > > > >         $ getcap myprog
> > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > >         cap_block_suspend,cap_audit_read+p
> > > > >
> > > > > I think there would be few users who would *not* be surprised
> > > > > about the fact that two steps that seem equivalent produce
> > > > > quite differnt output from getcap(8)!
> > > > >
> > > > > 3. Suppose I set all permitted capabilities on a binary:
> > > > >
> > > > >         $ sudo setcap =p myprog
> > > > >
> > > > > Then actually, I have set not just the 38 existing capabilities,
> > > > > but also 26 future capabilities, so that when "cap_self_destruct"
> > > > > is added to the kernel, 'myprog' already has it. This seems to
> > > > > run directly counter to your argument above (if I have understood
> > > > > it correctly).
> > > > >
> > > > > My summary from the above:
> > > > >
> > > > > * The output notation from getpcaps(8) is (1) inconsistent (with
> > > > > getcap(8)), and (2) difficult to read, two things that strike me
> > > > > as risk factors in a security-related notation.
> > > > >
> > > > > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > > > > bits in the permitted set is surprising, and perhaps also a
> > > > > security risk when new capabilities are actually added to the
> > > > > kernel, since existing binaries will automatically have those
> > > > > capabilities.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > Michael
> > > > >
> > > > > [*] I often joke that the cap_to_text(3) notation is one that is
> > > > > "human-readable, but not necessarily human-comprehensible", but
> > > > > at the same time I also note that the notation has one virtue:
> > > > > it is compact. However, that one virtue seems to have gone out
> > > > > the window for getpcaps(8).
> > > > >
> > > > > --
> > > > > Michael Kerrisk
> > > > > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > > > > Linux/UNIX System Programming Training: http://man7.org/training/
> > > > >

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Andrew G. Morgan @ 2019-12-22 23:36 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Michael Kerrisk, Andy Lutomirski, linux-security-module, ksrot
In-Reply-To: <CALQRfL50+JUGcWk_NyyKvviCeNUPQdjjP=OGoewvH-mU6Pn2hQ@mail.gmail.com>

New libcap behavior implemented:

  https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/?id=afef3ef1c62613e1cac12a2bbec6017f7d5e033e

Cheers

Andrew

On Sun, Dec 22, 2019 at 10:00 AM Andrew G. Morgan <morgan@kernel.org> wrote:
>
> I can take a look, but I'm not really convinced this isn't going to
> cause me grief at some point in the future when the kernel adds
> support for another capability and all' does not equal all. I've done
> some digging and I thought it was worth writing up.
>
> Today, the kernel treats 'all' (as it is defined by libcap today)
> differently for file and process capabilities. I suspect this is
> contributing to expectations about what all means, because all as
> defined today is effectively useless, and somewhere in the kernel
> we've worked around it for filecaps when we shouldn't have.
>
> Explanation of kernel bug (no use of 'all' here):
>
> 1) placing, say, bit-63 (undef today) and bit-0 (cap_chown) on the
> permitted (fP) vector of a file, yields a file that can execute - it
> gets pP=cap_chown.
>
> cd libcap/progs ; make ; cp capsh pcapsh ; sudo setcap
> 63,cap_setpcap,cap_chown=p pcapsh
>
> ./pcapsh --print
>
> 2) dropping cap_chown from the bounding set causes this same file to
> not run (insufficient privilege) because the file has been setup to
> require cap_chown, but can't have it so (by design) we deny execution
> altogether -- this fails:
>
> ./pcapsh --drop=cap_chown --print == --print
>
> To be consistent, (1) should fail because bit-63 is being dropped by
> the bounding set at execution time. That is, a binary that requires
> some future definition for capability #63 to run successfully should
> fail on a kernel that does not support it today.  This but was
> introduced with this kernel commit:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=7d8b6c63751cfbbe5eef81a48c22978b3407a3ad
>
> Ironically, the commit message declares the fact that (4) now passes
> is the fix. The real fix is to think about what we mean by 'all'.
>
> Cheers
>
> Andrew
>
> On Fri, Dec 20, 2019 at 8:36 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > I think the thing to do is use /proc/sys/kernel/cap_last_cap (if it
> > exists) to compare *caps to ~(cap_last_cap-1) and for each bit where
> > it's the same, show the full set.  Been looking at the fn for a bit,
> > and I might be comfortable enough to try it at this point.  But, if
> > you want to do it (to make sure it's done right :), please let me know.
> >
> > -serge
> >
> > On Fri, Dec 20, 2019 at 07:27:46PM -0800, Andrew G. Morgan wrote:
> > > /proc/1/status changed at some point to drop the undefined bits. When
> > > it did I changed libcap to try harder to not display unnamed
> > > capabilities.
> > >
> > > Cheers
> > >
> > > Andrew
> > >
> > >
> > > On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > >
> > > > I believe this is coming from the kernel.  I booted an old 12.04
> > > > ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> > > > Capabilities for `1': =ep
> > > > This was running
> > > > libcap 2.22.  Same version of libcap compiled on my laptop gives me the
> > > > long output.  booting an old 14.04 livecd, running 4.4 kernel,
> > > > getpcap also gives the long output.
> > > >
> > > > On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > > > > Serge might want to comment. I seem to recall that the issue was relevant
> > > > > for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> > > > > But it has been more than a decade and I can't seem to find the email
> > > > > exchange we had back then.
> > > > >
> > > > > Cheers
> > > > >
> > > > > Andrew
> > > > >
> > > > >
> > > > > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > > > > mtk.manpages@gmail.com> wrote:
> > > > >
> > > > > > Hello Andrew,
> > > > > >
> > > > > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > > > > [Resend with reply-all this time.]
> > > > > > >
> > > > > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > > > > > <mtk.manpages@gmail.com> wrote:
> > > > > > >>
> > > > > > >> Hello Andrew,
> > > > > > >>
> > > > > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > > > > > wrote:
> > > > > > >>>
> > > > > > >>> This changed with this commit I think:
> > > > > > >>>
> > > > > > >>>
> > > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > > > > >>>
> > > > > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > > > > >>> all the bitmasks in e and p. When we started to clip the
> > > > > > >>> bounding set to only defined capabilities, it meant that the
> > > > > > >>> text output started to look like
> > > > > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > > > > >>> was quite terrible.
> > > > > > >>
> > > > > > >> Was that really the change that caused this? That's a 2008 commit.
> > > > > > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > > > > > >> presume (but do not recall for sure) that I was using a system with a
> > > > > > >> libcap more recent than v2.11 (of which that commit was a part).
> > > > > > >
> > > > > > > The only other commit that seems relevant was this one:
> > > > > > >
> > > > > > >
> > > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > > > > >
> > > > > > > But I think this was all part of the same effort.
> > > > > > >
> > > > > > >>> So, this was seen as the least worst option.
> > > > > > >>
> > > > > > >> But surely this is fixable? Or, to put it another was, I presume
> > > > > > >> there's something that makes this difficult to fix in getpcaps, but
> > > > > > >> what is that something?
> > > > > > >
> > > > > > > I recall spending a day or more trying to avoid it, but I can't see
> > > > > > > how it is really fixable because there are too many moving parts.
> > > > > > >
> > > > > > > If the kernel adds another capability, then =ep could reasonably mean
> > > > > > > both the "old full set" or the "new full set". There are contexts
> > > > > > > where the difference matters. For example, where folk are using text
> > > > > > > representations for things like package manager shell-script setups.
> > > > > > > What they get when they say "=ep
> > > > > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > > > > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > > > > > the way it is at present, we are very explicit about what is in
> > > > > > > effect
> > > > > >
> > > > > > I can sort of see how the idea you are expressing might
> > > > > > apply when *setting* capabilities on *files*, but:
> > > > > >
> > > > > > a) I'm talking about the *display* of capabilities of a running
> > > > > > *process* using getpcaps(8).
> > > > > >
> > > > > > b) In practice, the logic that actually applies when setting
> > > > > > capabilities on files seems to run *counter* to the idea
> > > > > > that you express above (if I understand you correctly),
> > > > > > and your argument seems more relevant to files (especially
> > > > > > when *setting* file capabilities) than to processes.
> > > > > >
> > > > > > Consider the following examples:
> > > > > >
> > > > > > 1. A binary that has all but one capability is described in a
> > > > > > compact way by getcap(8):
> > > > > >
> > > > > >         $ sudo setcap "=p cap_kill-p" mypog
> > > > > >         $ getcap mypog
> > > > > >         mypog =p cap_kill-p
> > > > > >
> > > > > > When that same binary is run, the process's capabilities
> > > > > > are described with a very different format by getpcaps(8)
> > > > > >
> > > > > >         $ getpcaps $(pidof i_syscall)
> > > > > >         Capabilities for `152006': =
> > > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > >
> > > > > > That is quite inconsistent! And also, the second notation is
> > > > > > simply very hard to read. How many capabilities are listed there?
> > > > > > Is it all of them? (When a process does have all caps in permitted,
> > > > > > the display differs only by one item.) A security-related notation
> > > > > > that is difficult to read is a risk... [*]
> > > > > >
> > > > > > 2. I just now tried the following, and it really surprised me
> > > > > > (although the reasons are quickly obvious when one considers
> > > > > > the point I made earlier in this mail thread that 'setcap =p' is
> > > > > > setting *64* bits in the file's permittted set):
> > > > > >
> > > > > >         # Set "all" permitted capabilities on a file:
> > > > > >
> > > > > >         $ sudo setcap =p myprog
> > > > > >         $ getcap myprog
> > > > > >         myprog =p
> > > > > >
> > > > > >         # Set "all" 38 permitted capabilities on a file, by specifying
> > > > > >         # them individually:
> > > > > >
> > > > > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > > > > >         echo -n ",$c"; done)=p myprog
> > > > > >         $ getcap myprog
> > > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > >
> > > > > > I think there would be few users who would *not* be surprised
> > > > > > about the fact that two steps that seem equivalent produce
> > > > > > quite differnt output from getcap(8)!
> > > > > >
> > > > > > 3. Suppose I set all permitted capabilities on a binary:
> > > > > >
> > > > > >         $ sudo setcap =p myprog
> > > > > >
> > > > > > Then actually, I have set not just the 38 existing capabilities,
> > > > > > but also 26 future capabilities, so that when "cap_self_destruct"
> > > > > > is added to the kernel, 'myprog' already has it. This seems to
> > > > > > run directly counter to your argument above (if I have understood
> > > > > > it correctly).
> > > > > >
> > > > > > My summary from the above:
> > > > > >
> > > > > > * The output notation from getpcaps(8) is (1) inconsistent (with
> > > > > > getcap(8)), and (2) difficult to read, two things that strike me
> > > > > > as risk factors in a security-related notation.
> > > > > >
> > > > > > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > > > > > bits in the permitted set is surprising, and perhaps also a
> > > > > > security risk when new capabilities are actually added to the
> > > > > > kernel, since existing binaries will automatically have those
> > > > > > capabilities.
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > Michael
> > > > > >
> > > > > > [*] I often joke that the cap_to_text(3) notation is one that is
> > > > > > "human-readable, but not necessarily human-comprehensible", but
> > > > > > at the same time I also note that the notation has one virtue:
> > > > > > it is compact. However, that one virtue seems to have gone out
> > > > > > the window for getpcaps(8).
> > > > > >
> > > > > > --
> > > > > > Michael Kerrisk
> > > > > > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > > > > > Linux/UNIX System Programming Training: http://man7.org/training/
> > > > > >

^ permalink raw reply

* Re: [PATCH v23 12/24] x86/sgx: Linux Enclave Driver
From: Jarkko Sakkinen @ 2019-12-23 11:01 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-kernel, x86, linux-sgx, akpm, dave.hansen,
	sean.j.christopherson, nhorman, npmccallum, serge.ayoun,
	shay.katz-zamir, haitao.huang, andriy.shevchenko, tglx, kai.svahn,
	bp, josh, luto, kai.huang, rientjes, cedric.xing, puiterwijk,
	linux-security-module, Suresh Siddha
In-Reply-To: <20191209195719.GH19243@linux.intel.com>

On Mon, Dec 09, 2019 at 09:57:19PM +0200, Jarkko Sakkinen wrote:
> On Sat, Dec 07, 2019 at 09:09:39AM +0100, Greg KH wrote:
> > On Fri, Dec 06, 2019 at 10:38:07PM +0200, Jarkko Sakkinen wrote:
> > > > Why a whole cdev?
> > > > 
> > > > Why not use a misc device?  YOu only have 2 devices right?  Why not 2
> > > > misc devices then?  That saves the use of a whole major number and makes
> > > > your code a _LOT_ simpler.
> > > 
> > > The downside would be that if we ever want to add sysfs attributes, that
> > > could not be done synchronously with the device creation.
> > 
> > That is what the groups member of struct misc_device is for.
> 
> OK, cool, then there is no problem changing to misc.
> 
> I haven't seen misc drivers (not that I've looked through every single
> of them so I suppose there are such) to use it and somehow have been
> blind to seeing it that it si there.
> 
> Thanks again for the feedback. I'll fix this for the next patch set
> version.

Back in 2014 we bumped with the TPM driver to this issue with sysfs
attributes and converted the driver to have its own class to get around
the issue.

The feature came into misc in 2015 [*]. When I sent the first versions
of the patch set in 2017 I was not aware of this change (should have
checked tho). Agree that the bus was not a great choice but I just
used what I thought I had available.

[*] bd735995308b553cc3c7f6a975aa284b270c7e2c

/Jarkko

^ permalink raw reply

* Report: suspicious RCU usage in security code
From: John Garry @ 2019-12-23 11:21 UTC (permalink / raw)
  To: jmorris@namei.org, serge@hallyn.com
  Cc: Anders Roxell, paulmck@kernel.org, linux-security-module,
	linux-kernel@vger.kernel.org

Hi guys,

I have noticed this WARN on Kernel v5.5-rc3 on my arm64 system:

[   25.952600] =============================
[   25.952602] WARNING: suspicious RCU usage
[   25.952606] 5.5.0-rc3-dirty #816 Not tainted
[   25.952609] -----------------------------
[   25.952613] security/device_cgroup.c:355 RCU-list traversed in 
non-reader section!!
[   25.952615]
                other info that might help us debug this:

[   25.952618]
                rcu_scheduler_active = 2, debug_locks = 1
[   25.952621] 4 locks held by systemd/1:
[   25.952624]  #0: ffff0023de3c4410 (sb_writers#8){.+.+}, at: 
vfs_write+0x1c0/0x1e0
[   25.952637]  #1: ffff0023e732f880 (&of->mutex){+.+.}, at: 
kernfs_fop_write+0x12c/0x250
[   25.952648]  #2: ffff0023e45c4288 (kn->count#30){++++}, at: 
kernfs_fop_write+0x134/0x250
[   25.952656]  #3: ffff800011c4e098 (devcgroup_mutex){+.+.}, at: 
devcgroup_access_write+0x4c/0x6d0
[   25.952663]
                stack backtrace:
[   25.952668] CPU: 6 PID: 1 Comm: systemd Not tainted 5.5.0-rc3-dirty #816
[   25.952670] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI 
RC0 - V1.16.01 03/15/2019
[   25.952672] Call trace:
[   25.952675]  dump_backtrace+0x0/0x1a0
[   25.952678]  show_stack+0x14/0x20
[   25.952681]  dump_stack+0xe8/0x150
[   25.952685]  lockdep_rcu_suspicious+0xcc/0x110
[   25.952689]  match_exception_partial+0x118/0x120
[   25.952691]  verify_new_ex+0x64/0xf0
[   25.952694]  devcgroup_access_write+0x3c8/0x6d0
[   25.952697]  cgroup_file_write+0x88/0x1e0
[   25.952700]  kernfs_fop_write+0x15c/0x250
[   25.952703]  __vfs_write+0x18/0x38
[   25.952705]  vfs_write+0xb4/0x1e0
[   25.952708]  ksys_write+0x68/0xf8
[   25.952711]  __arm64_sys_write+0x18/0x20
[   25.952715]  el0_svc_common.constprop.2+0x74/0x170
[   25.952717]  el0_svc_handler+0x20/0x80
[   25.952720]  el0_sync_handler+0x114/0x1d0
[   25.952722]  el0_sync+0x140/0x180
john@ubuntu:~$
john@ubuntu:~$


RCU Kconfig options:

more .config | grep RCU
# RCU Subsystem
CONFIG_PREEMPT_RCU=y
CONFIG_RCU_EXPERT=y
CONFIG_SRCU=y
CONFIG_TREE_SRCU=y
CONFIG_TASKS_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
CONFIG_RCU_FANOUT=64
CONFIG_RCU_FANOUT_LEAF=16
# CONFIG_RCU_FAST_NO_HZ is not set
# CONFIG_RCU_BOOST is not set
# CONFIG_RCU_NOCB_CPU is not set
# end of RCU Subsystem
CONFIG_HAVE_RCU_TABLE_FREE=y
# RCU Debugging
CONFIG_PROVE_RCU=y
CONFIG_PROVE_RCU_LIST=y
# CONFIG_RCU_PERF_TEST is not set
# CONFIG_RCU_TORTURE_TEST is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=21
# CONFIG_RCU_TRACE is not set
# CONFIG_RCU_EQS_DEBUG is not set
# end of RCU Debugging
john@john-ThinkCentre-M93p:~/kernel-dev$

I notice that verfiy_new_ex() has a RCU lockdep check warning, so the 
condition may just need to be extended to the match_exception_partial() 
RCU list iterator just to remove the WARN.

Note: I am finishing for Christmas vacation today, so can't help further 
ATM.

Cheers,
John

^ permalink raw reply

* Re: [PATCH] ima: add the ability to query ima for the hash of a given file.
From: Mimi Zohar @ 2019-12-23 17:39 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian, Florent Revest, linux-integrity
  Cc: kpsingh, mjg59, linux-kernel, linux-security-module,
	Florent Revest
In-Reply-To: <8f4d9c4e-735d-8ba9-b84a-4f341030e0cf@linux.microsoft.com>

On Fri, 2019-12-20 at 08:48 -0800, Lakshmi Ramasubramanian wrote:
> On 12/20/2019 8:31 AM, Florent Revest wrote:
> 
> >   
> > +/**
> > + * ima_file_hash - return the stored measurement if a file has been hashed.
> > + * @file: pointer to the file
> > + * @buf: buffer in which to store the hash
> > + * @buf_size: length of the buffer
> > + *
> > + * On success, output the hash into buf and return the hash algorithm (as
> > + * defined in the enum hash_algo).
> 
> > + * If the hash is larger than buf, then only size bytes will be copied. It
> > + * generally just makes sense to pass a buffer capable of holding the largest
> > + * possible hash: IMA_MAX_DIGEST_SIZE
> 
> If the given buffer is smaller than the hash length, wouldn't it be 
> better to return the required size and a status indicating the buffer is 
> not enough. The caller can then call back with the required buffer.
> 
> If the hash is truncated the caller may not know if the hash is partial 
> or not.

Based on the hash algorithm, the caller would know if the buffer
provided was too small and was truncated.

> 
> > + *
> > + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP.
> > + * If the parameters are incorrect, return -EINVAL.
> > + */
> > +int ima_file_hash(struct file *file, char *buf, size_t buf_size)
> > +{
> > +	struct inode *inode;
> > +	struct integrity_iint_cache *iint;
> > +	size_t copied_size;
> > +
> > +	if (!file || !buf)
> > +		return -EINVAL;
> > +

Other kernel functions provide a means of determining the needed
buffer size by passing a NULL field.  Instead of failing here, if buf
is NULL, how about returning the hash algorithm?

Mimi

> > +	if (!ima_policy_flag)
> > +		return -EOPNOTSUPP;
> > +


^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Serge E. Hallyn @ 2019-12-23 18:05 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Michael Kerrisk, Andy Lutomirski,
	linux-security-module, ksrot
In-Reply-To: <CALQRfL530ySipAtBrWi=i19VKZ+VGdRtJzTymHaEYk5XHK4E0A@mail.gmail.com>

Oh, hm, now that's interesting.  It did fix getpcaps, but broke
getcap:

	serge@sl:~/src/libcap$ sudo setcap all+pe /tmp/true.1
old behavior:
	serge@sl:~/src/libcap$ getcap /tmp/true.1
	/tmp/true.1 =ep
	serge@sl:~/src/libcap$ getpcaps 1
	Capabilities for `1': = cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read+ep

new behavior:
	serge@sl:~/src/libcap$ ./progs/getcap /tmp/true
	/tmp/true =ep 38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63+ep
	serge@sl:~/src/libcap$ ./progs/getpcaps 1
	Capabilities for `1': =ep

-serge

On Sun, Dec 22, 2019 at 03:36:22PM -0800, Andrew G. Morgan wrote:
> New libcap behavior implemented:
> 
>   https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/?id=afef3ef1c62613e1cac12a2bbec6017f7d5e033e
> 
> Cheers
> 
> Andrew
> 
> On Sun, Dec 22, 2019 at 10:00 AM Andrew G. Morgan <morgan@kernel.org> wrote:
> >
> > I can take a look, but I'm not really convinced this isn't going to
> > cause me grief at some point in the future when the kernel adds
> > support for another capability and all' does not equal all. I've done
> > some digging and I thought it was worth writing up.
> >
> > Today, the kernel treats 'all' (as it is defined by libcap today)
> > differently for file and process capabilities. I suspect this is
> > contributing to expectations about what all means, because all as
> > defined today is effectively useless, and somewhere in the kernel
> > we've worked around it for filecaps when we shouldn't have.
> >
> > Explanation of kernel bug (no use of 'all' here):
> >
> > 1) placing, say, bit-63 (undef today) and bit-0 (cap_chown) on the
> > permitted (fP) vector of a file, yields a file that can execute - it
> > gets pP=cap_chown.
> >
> > cd libcap/progs ; make ; cp capsh pcapsh ; sudo setcap
> > 63,cap_setpcap,cap_chown=p pcapsh
> >
> > ./pcapsh --print
> >
> > 2) dropping cap_chown from the bounding set causes this same file to
> > not run (insufficient privilege) because the file has been setup to
> > require cap_chown, but can't have it so (by design) we deny execution
> > altogether -- this fails:
> >
> > ./pcapsh --drop=cap_chown --print == --print
> >
> > To be consistent, (1) should fail because bit-63 is being dropped by
> > the bounding set at execution time. That is, a binary that requires
> > some future definition for capability #63 to run successfully should
> > fail on a kernel that does not support it today.  This but was
> > introduced with this kernel commit:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=7d8b6c63751cfbbe5eef81a48c22978b3407a3ad
> >
> > Ironically, the commit message declares the fact that (4) now passes
> > is the fix. The real fix is to think about what we mean by 'all'.
> >
> > Cheers
> >
> > Andrew
> >
> > On Fri, Dec 20, 2019 at 8:36 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > >
> > > I think the thing to do is use /proc/sys/kernel/cap_last_cap (if it
> > > exists) to compare *caps to ~(cap_last_cap-1) and for each bit where
> > > it's the same, show the full set.  Been looking at the fn for a bit,
> > > and I might be comfortable enough to try it at this point.  But, if
> > > you want to do it (to make sure it's done right :), please let me know.
> > >
> > > -serge
> > >
> > > On Fri, Dec 20, 2019 at 07:27:46PM -0800, Andrew G. Morgan wrote:
> > > > /proc/1/status changed at some point to drop the undefined bits. When
> > > > it did I changed libcap to try harder to not display unnamed
> > > > capabilities.
> > > >
> > > > Cheers
> > > >
> > > > Andrew
> > > >
> > > >
> > > > On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > > >
> > > > > I believe this is coming from the kernel.  I booted an old 12.04
> > > > > ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> > > > > Capabilities for `1': =ep
> > > > > This was running
> > > > > libcap 2.22.  Same version of libcap compiled on my laptop gives me the
> > > > > long output.  booting an old 14.04 livecd, running 4.4 kernel,
> > > > > getpcap also gives the long output.
> > > > >
> > > > > On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > > > > > Serge might want to comment. I seem to recall that the issue was relevant
> > > > > > for Serge's namespaces and checkpointing/restarting over a kernel upgrade.
> > > > > > But it has been more than a decade and I can't seem to find the email
> > > > > > exchange we had back then.
> > > > > >
> > > > > > Cheers
> > > > > >
> > > > > > Andrew
> > > > > >
> > > > > >
> > > > > > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > > > > > mtk.manpages@gmail.com> wrote:
> > > > > >
> > > > > > > Hello Andrew,
> > > > > > >
> > > > > > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > > > > > [Resend with reply-all this time.]
> > > > > > > >
> > > > > > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk (man-pages)
> > > > > > > > <mtk.manpages@gmail.com> wrote:
> > > > > > > >>
> > > > > > > >> Hello Andrew,
> > > > > > > >>
> > > > > > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <morgan@kernel.org>
> > > > > > > wrote:
> > > > > > > >>>
> > > > > > > >>> This changed with this commit I think:
> > > > > > > >>>
> > > > > > > >>>
> > > > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > > > > > >>>
> > > > > > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > > > > > >>> all the bitmasks in e and p. When we started to clip the
> > > > > > > >>> bounding set to only defined capabilities, it meant that the
> > > > > > > >>> text output started to look like
> > > > > > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > > > > > >>> was quite terrible.
> > > > > > > >>
> > > > > > > >> Was that really the change that caused this? That's a 2008 commit.
> > > > > > > >> Certainly, I recall in 2014 or 2015 still being able to see =ep, and I
> > > > > > > >> presume (but do not recall for sure) that I was using a system with a
> > > > > > > >> libcap more recent than v2.11 (of which that commit was a part).
> > > > > > > >
> > > > > > > > The only other commit that seems relevant was this one:
> > > > > > > >
> > > > > > > >
> > > > > > > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > > > > > >
> > > > > > > > But I think this was all part of the same effort.
> > > > > > > >
> > > > > > > >>> So, this was seen as the least worst option.
> > > > > > > >>
> > > > > > > >> But surely this is fixable? Or, to put it another was, I presume
> > > > > > > >> there's something that makes this difficult to fix in getpcaps, but
> > > > > > > >> what is that something?
> > > > > > > >
> > > > > > > > I recall spending a day or more trying to avoid it, but I can't see
> > > > > > > > how it is really fixable because there are too many moving parts.
> > > > > > > >
> > > > > > > > If the kernel adds another capability, then =ep could reasonably mean
> > > > > > > > both the "old full set" or the "new full set". There are contexts
> > > > > > > > where the difference matters. For example, where folk are using text
> > > > > > > > representations for things like package manager shell-script setups.
> > > > > > > > What they get when they say "=ep
> > > > > > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might suddenly be
> > > > > > > > inappropriate when the new kernel adds "cap_self_destruct". At least
> > > > > > > > the way it is at present, we are very explicit about what is in
> > > > > > > > effect
> > > > > > >
> > > > > > > I can sort of see how the idea you are expressing might
> > > > > > > apply when *setting* capabilities on *files*, but:
> > > > > > >
> > > > > > > a) I'm talking about the *display* of capabilities of a running
> > > > > > > *process* using getpcaps(8).
> > > > > > >
> > > > > > > b) In practice, the logic that actually applies when setting
> > > > > > > capabilities on files seems to run *counter* to the idea
> > > > > > > that you express above (if I understand you correctly),
> > > > > > > and your argument seems more relevant to files (especially
> > > > > > > when *setting* file capabilities) than to processes.
> > > > > > >
> > > > > > > Consider the following examples:
> > > > > > >
> > > > > > > 1. A binary that has all but one capability is described in a
> > > > > > > compact way by getcap(8):
> > > > > > >
> > > > > > >         $ sudo setcap "=p cap_kill-p" mypog
> > > > > > >         $ getcap mypog
> > > > > > >         mypog =p cap_kill-p
> > > > > > >
> > > > > > > When that same binary is run, the process's capabilities
> > > > > > > are described with a very different format by getpcaps(8)
> > > > > > >
> > > > > > >         $ getpcaps $(pidof i_syscall)
> > > > > > >         Capabilities for `152006': =
> > > > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > > > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > > >
> > > > > > > That is quite inconsistent! And also, the second notation is
> > > > > > > simply very hard to read. How many capabilities are listed there?
> > > > > > > Is it all of them? (When a process does have all caps in permitted,
> > > > > > > the display differs only by one item.) A security-related notation
> > > > > > > that is difficult to read is a risk... [*]
> > > > > > >
> > > > > > > 2. I just now tried the following, and it really surprised me
> > > > > > > (although the reasons are quickly obvious when one considers
> > > > > > > the point I made earlier in this mail thread that 'setcap =p' is
> > > > > > > setting *64* bits in the file's permittted set):
> > > > > > >
> > > > > > >         # Set "all" permitted capabilities on a file:
> > > > > > >
> > > > > > >         $ sudo setcap =p myprog
> > > > > > >         $ getcap myprog
> > > > > > >         myprog =p
> > > > > > >
> > > > > > >         # Set "all" 38 permitted capabilities on a file, by specifying
> > > > > > >         # them individually:
> > > > > > >
> > > > > > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > > > > > >         echo -n ",$c"; done)=p myprog
> > > > > > >         $ getcap myprog
> > > > > > >         cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > > >         cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > > > > > >         cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > > >         cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > > >         cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > > >         cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > > >         cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > > >
> > > > > > > I think there would be few users who would *not* be surprised
> > > > > > > about the fact that two steps that seem equivalent produce
> > > > > > > quite differnt output from getcap(8)!
> > > > > > >
> > > > > > > 3. Suppose I set all permitted capabilities on a binary:
> > > > > > >
> > > > > > >         $ sudo setcap =p myprog
> > > > > > >
> > > > > > > Then actually, I have set not just the 38 existing capabilities,
> > > > > > > but also 26 future capabilities, so that when "cap_self_destruct"
> > > > > > > is added to the kernel, 'myprog' already has it. This seems to
> > > > > > > run directly counter to your argument above (if I have understood
> > > > > > > it correctly).
> > > > > > >
> > > > > > > My summary from the above:
> > > > > > >
> > > > > > > * The output notation from getpcaps(8) is (1) inconsistent (with
> > > > > > > getcap(8)), and (2) difficult to read, two things that strike me
> > > > > > > as risk factors in a security-related notation.
> > > > > > >
> > > > > > > * The fact that "setcap =p ..." sets the top 26 (currently unused)
> > > > > > > bits in the permitted set is surprising, and perhaps also a
> > > > > > > security risk when new capabilities are actually added to the
> > > > > > > kernel, since existing binaries will automatically have those
> > > > > > > capabilities.
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > Michael
> > > > > > >
> > > > > > > [*] I often joke that the cap_to_text(3) notation is one that is
> > > > > > > "human-readable, but not necessarily human-comprehensible", but
> > > > > > > at the same time I also note that the notation has one virtue:
> > > > > > > it is compact. However, that one virtue seems to have gone out
> > > > > > > the window for getpcaps(8).
> > > > > > >
> > > > > > > --
> > > > > > > Michael Kerrisk
> > > > > > > Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> > > > > > > Linux/UNIX System Programming Training: http://man7.org/training/
> > > > > > >

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Serge E. Hallyn @ 2019-12-23 19:14 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Michael Kerrisk, Andy Lutomirski,
	linux-security-module, ksrot
In-Reply-To: <CALQRfL50+JUGcWk_NyyKvviCeNUPQdjjP=OGoewvH-mU6Pn2hQ@mail.gmail.com>

On Sun, Dec 22, 2019 at 10:00:06AM -0800, Andrew G. Morgan wrote:
> I can take a look, but I'm not really convinced this isn't going to
> cause me grief at some point in the future when the kernel adds
> support for another capability and all' does not equal all. I've done
> some digging and I thought it was worth writing up.
> 
> Today, the kernel treats 'all' (as it is defined by libcap today)
> differently for file and process capabilities. I suspect this is
> contributing to expectations about what all means, because all as
> defined today is effectively useless, and somewhere in the kernel
> we've worked around it for filecaps when we shouldn't have.
> 
> Explanation of kernel bug (no use of 'all' here):
> 
> 1) placing, say, bit-63 (undef today) and bit-0 (cap_chown) on the
> permitted (fP) vector of a file, yields a file that can execute - it
> gets pP=cap_chown.
> 
> cd libcap/progs ; make ; cp capsh pcapsh ; sudo setcap
> 63,cap_setpcap,cap_chown=p pcapsh
> 
> ./pcapsh --print
> 
> 2) dropping cap_chown from the bounding set causes this same file to
> not run (insufficient privilege) because the file has been setup to
> require cap_chown, but can't have it so (by design) we deny execution
> altogether -- this fails:
> 
> ./pcapsh --drop=cap_chown --print == --print
> 
> To be consistent, (1) should fail because bit-63 is being dropped by
> the bounding set at execution time. That is, a binary that requires
> some future definition for capability #63 to run successfully should
> fail on a kernel that does not support it today.  This but was
> introduced with this kernel commit:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=7d8b6c63751cfbbe5eef81a48c22978b3407a3ad
> 
> Ironically, the commit message declares the fact that (4) now passes
> is the fix. The real fix is to think about what we mean by 'all'.

Yes, I get the feeling you're right.

It'd be nice to simply say "you can only enable the caps you know
about, bit by bit."  The problem is, we have distributions, and we
have containers, and demanding that all filecap-enabled binaries be
fixed up every time you switch to a new kernel with more or fewer
capabilities is unworkable.

One thing we could do is declare that bit nbits-1 (63 in 64 bit caps)
means "all caps that the kernel knows about."  If 63 and any other
caps are enabled, then -EINVAL.  If/when we switch to 128 bit caps,
then bit 63 becomes a valid cap there, and 127 is 'all'.

Of course that's not at all backward compatible, so would have to be
done in a new capability version.

-serge

^ permalink raw reply

* Re: Anomalous output from getpcaps(1) for process with all capabilities
From: Serge E. Hallyn @ 2019-12-23 19:20 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Michael Kerrisk, Andy Lutomirski,
	linux-security-module, ksrot
In-Reply-To: <CALQRfL6yKpXmpnuapJhBxso7ZyXxpw6uiAMu4S4m_DBkwv3FXA@mail.gmail.com>

Ok, thanks.  So it's probably not worth worrying about this, but if we
were going to, then it might seem reasonable (though arguably so) to say
that when getcap sees all known and unknown capabilities set, then it
shows =pe.  There really cannot be any other intent to the user having
set all known and unknown caps.

-serge

On Mon, Dec 23, 2019 at 10:30:24AM -0800, Andrew G. Morgan wrote:
> I think my commit message included what is going on here.
> 
> On Mon, 23 Dec 2019, 10:05 am Serge E. Hallyn, <serge@hallyn.com> wrote:
> 
> > Oh, hm, now that's interesting.  It did fix getpcaps, but broke
> > getcap:
> >
> >         serge@sl:~/src/libcap$ sudo setcap all+pe /tmp/true.1
> > old behavior:
> >         serge@sl:~/src/libcap$ getcap /tmp/true.1
> >         /tmp/true.1 =ep
> >         serge@sl:~/src/libcap$ getpcaps 1
> >         Capabilities for `1': =
> > cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,cap_fsetid,cap_kill,cap_setgid,cap_setuid,cap_setpcap,cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,cap_block_suspend,cap_audit_read+ep
> >
> > new behavior:
> >         serge@sl:~/src/libcap$ ./progs/getcap /tmp/true
> >         /tmp/true =ep
> > 38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63+ep
> >         serge@sl:~/src/libcap$ ./progs/getpcaps 1
> >         Capabilities for `1': =ep
> >
> > -serge
> >
> > On Sun, Dec 22, 2019 at 03:36:22PM -0800, Andrew G. Morgan wrote:
> > > New libcap behavior implemented:
> > >
> > >
> > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/?id=afef3ef1c62613e1cac12a2bbec6017f7d5e033e
> > >
> > > Cheers
> > >
> > > Andrew
> > >
> > > On Sun, Dec 22, 2019 at 10:00 AM Andrew G. Morgan <morgan@kernel.org>
> > wrote:
> > > >
> > > > I can take a look, but I'm not really convinced this isn't going to
> > > > cause me grief at some point in the future when the kernel adds
> > > > support for another capability and all' does not equal all. I've done
> > > > some digging and I thought it was worth writing up.
> > > >
> > > > Today, the kernel treats 'all' (as it is defined by libcap today)
> > > > differently for file and process capabilities. I suspect this is
> > > > contributing to expectations about what all means, because all as
> > > > defined today is effectively useless, and somewhere in the kernel
> > > > we've worked around it for filecaps when we shouldn't have.
> > > >
> > > > Explanation of kernel bug (no use of 'all' here):
> > > >
> > > > 1) placing, say, bit-63 (undef today) and bit-0 (cap_chown) on the
> > > > permitted (fP) vector of a file, yields a file that can execute - it
> > > > gets pP=cap_chown.
> > > >
> > > > cd libcap/progs ; make ; cp capsh pcapsh ; sudo setcap
> > > > 63,cap_setpcap,cap_chown=p pcapsh
> > > >
> > > > ./pcapsh --print
> > > >
> > > > 2) dropping cap_chown from the bounding set causes this same file to
> > > > not run (insufficient privilege) because the file has been setup to
> > > > require cap_chown, but can't have it so (by design) we deny execution
> > > > altogether -- this fails:
> > > >
> > > > ./pcapsh --drop=cap_chown --print == --print
> > > >
> > > > To be consistent, (1) should fail because bit-63 is being dropped by
> > > > the bounding set at execution time. That is, a binary that requires
> > > > some future definition for capability #63 to run successfully should
> > > > fail on a kernel that does not support it today.  This but was
> > > > introduced with this kernel commit:
> > > >
> > > >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/security/commoncap.c?id=7d8b6c63751cfbbe5eef81a48c22978b3407a3ad
> > > >
> > > > Ironically, the commit message declares the fact that (4) now passes
> > > > is the fix. The real fix is to think about what we mean by 'all'.
> > > >
> > > > Cheers
> > > >
> > > > Andrew
> > > >
> > > > On Fri, Dec 20, 2019 at 8:36 PM Serge E. Hallyn <serge@hallyn.com>
> > wrote:
> > > > >
> > > > > I think the thing to do is use /proc/sys/kernel/cap_last_cap (if it
> > > > > exists) to compare *caps to ~(cap_last_cap-1) and for each bit where
> > > > > it's the same, show the full set.  Been looking at the fn for a bit,
> > > > > and I might be comfortable enough to try it at this point.  But, if
> > > > > you want to do it (to make sure it's done right :), please let me
> > know.
> > > > >
> > > > > -serge
> > > > >
> > > > > On Fri, Dec 20, 2019 at 07:27:46PM -0800, Andrew G. Morgan wrote:
> > > > > > /proc/1/status changed at some point to drop the undefined bits.
> > When
> > > > > > it did I changed libcap to try harder to not display unnamed
> > > > > > capabilities.
> > > > > >
> > > > > > Cheers
> > > > > >
> > > > > > Andrew
> > > > > >
> > > > > >
> > > > > > On Fri, Dec 20, 2019 at 7:08 PM Serge E. Hallyn <serge@hallyn.com>
> > wrote:
> > > > > > >
> > > > > > > I believe this is coming from the kernel.  I booted an old 12.04
> > > > > > > ubuntu server livecd (3.13 kernel) and libcap 2.22, it shows
> > > > > > > Capabilities for `1': =ep
> > > > > > > This was running
> > > > > > > libcap 2.22.  Same version of libcap compiled on my laptop gives
> > me the
> > > > > > > long output.  booting an old 14.04 livecd, running 4.4 kernel,
> > > > > > > getpcap also gives the long output.
> > > > > > >
> > > > > > > On Mon, Dec 16, 2019 at 07:47:44PM -0800, Andrew G. Morgan wrote:
> > > > > > > > Serge might want to comment. I seem to recall that the issue
> > was relevant
> > > > > > > > for Serge's namespaces and checkpointing/restarting over a
> > kernel upgrade.
> > > > > > > > But it has been more than a decade and I can't seem to find
> > the email
> > > > > > > > exchange we had back then.
> > > > > > > >
> > > > > > > > Cheers
> > > > > > > >
> > > > > > > > Andrew
> > > > > > > >
> > > > > > > >
> > > > > > > > On Sun, Dec 15, 2019 at 8:52 PM Michael Kerrisk (man-pages) <
> > > > > > > > mtk.manpages@gmail.com> wrote:
> > > > > > > >
> > > > > > > > > Hello Andrew,
> > > > > > > > >
> > > > > > > > > On 12/16/19 12:26 AM, Andrew G. Morgan wrote:
> > > > > > > > > > [Resend with reply-all this time.]
> > > > > > > > > >
> > > > > > > > > > On Sun, Dec 15, 2019 at 11:17 AM Michael Kerrisk
> > (man-pages)
> > > > > > > > > > <mtk.manpages@gmail.com> wrote:
> > > > > > > > > >>
> > > > > > > > > >> Hello Andrew,
> > > > > > > > > >>
> > > > > > > > > >> On Sun, 15 Dec 2019 at 19:30, Andrew G. Morgan <
> > morgan@kernel.org>
> > > > > > > > > wrote:
> > > > > > > > > >>>
> > > > > > > > > >>> This changed with this commit I think:
> > > > > > > > > >>>
> > > > > > > > > >>>
> > > > > > > > >
> > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=3fa808f5886d08c45866217cfe6e6e9def7de04e
> > > > > > > > > >>>
> > > > > > > > > >>> Prior to that, we had "=ep" mean the cap set was ~0 for
> > > > > > > > > >>> all the bitmasks in e and p. When we started to clip the
> > > > > > > > > >>> bounding set to only defined capabilities, it meant that
> > the
> > > > > > > > > >>> text output started to look like
> > > > > > > > > >>> "=ep 33,34,35,36,37,38,39,40,41,42.....63-ep" which
> > > > > > > > > >>> was quite terrible.
> > > > > > > > > >>
> > > > > > > > > >> Was that really the change that caused this? That's a
> > 2008 commit.
> > > > > > > > > >> Certainly, I recall in 2014 or 2015 still being able to
> > see =ep, and I
> > > > > > > > > >> presume (but do not recall for sure) that I was using a
> > system with a
> > > > > > > > > >> libcap more recent than v2.11 (of which that commit was a
> > part).
> > > > > > > > > >
> > > > > > > > > > The only other commit that seems relevant was this one:
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/libcap/cap_text.c?id=afb17b8c007a49d93b0d30936b2d65af1bfdb039
> > > > > > > > > >
> > > > > > > > > > But I think this was all part of the same effort.
> > > > > > > > > >
> > > > > > > > > >>> So, this was seen as the least worst option.
> > > > > > > > > >>
> > > > > > > > > >> But surely this is fixable? Or, to put it another was, I
> > presume
> > > > > > > > > >> there's something that makes this difficult to fix in
> > getpcaps, but
> > > > > > > > > >> what is that something?
> > > > > > > > > >
> > > > > > > > > > I recall spending a day or more trying to avoid it, but I
> > can't see
> > > > > > > > > > how it is really fixable because there are too many moving
> > parts.
> > > > > > > > > >
> > > > > > > > > > If the kernel adds another capability, then =ep could
> > reasonably mean
> > > > > > > > > > both the "old full set" or the "new full set". There are
> > contexts
> > > > > > > > > > where the difference matters. For example, where folk are
> > using text
> > > > > > > > > > representations for things like package manager
> > shell-script setups.
> > > > > > > > > > What they get when they say "=ep
> > > > > > > > > > cap_setpcap,cap_sys_admin,cap_setfcap-ep" today, might
> > suddenly be
> > > > > > > > > > inappropriate when the new kernel adds
> > "cap_self_destruct". At least
> > > > > > > > > > the way it is at present, we are very explicit about what
> > is in
> > > > > > > > > > effect
> > > > > > > > >
> > > > > > > > > I can sort of see how the idea you are expressing might
> > > > > > > > > apply when *setting* capabilities on *files*, but:
> > > > > > > > >
> > > > > > > > > a) I'm talking about the *display* of capabilities of a
> > running
> > > > > > > > > *process* using getpcaps(8).
> > > > > > > > >
> > > > > > > > > b) In practice, the logic that actually applies when setting
> > > > > > > > > capabilities on files seems to run *counter* to the idea
> > > > > > > > > that you express above (if I understand you correctly),
> > > > > > > > > and your argument seems more relevant to files (especially
> > > > > > > > > when *setting* file capabilities) than to processes.
> > > > > > > > >
> > > > > > > > > Consider the following examples:
> > > > > > > > >
> > > > > > > > > 1. A binary that has all but one capability is described in a
> > > > > > > > > compact way by getcap(8):
> > > > > > > > >
> > > > > > > > >         $ sudo setcap "=p cap_kill-p" mypog
> > > > > > > > >         $ getcap mypog
> > > > > > > > >         mypog =p cap_kill-p
> > > > > > > > >
> > > > > > > > > When that same binary is run, the process's capabilities
> > > > > > > > > are described with a very different format by getpcaps(8)
> > > > > > > > >
> > > > > > > > >         $ getpcaps $(pidof i_syscall)
> > > > > > > > >         Capabilities for `152006': =
> > > > > > > > >
> >  cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > > > > >
> >  cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > > > > >         cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,
> > > > > > > > >
> >  cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > > > > >
> >  cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > > > > >
> >  cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > > > > >
> >  cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > > > > >
> >  cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > > > > >
> > > > > > > > > That is quite inconsistent! And also, the second notation is
> > > > > > > > > simply very hard to read. How many capabilities are listed
> > there?
> > > > > > > > > Is it all of them? (When a process does have all caps in
> > permitted,
> > > > > > > > > the display differs only by one item.) A security-related
> > notation
> > > > > > > > > that is difficult to read is a risk... [*]
> > > > > > > > >
> > > > > > > > > 2. I just now tried the following, and it really surprised me
> > > > > > > > > (although the reasons are quickly obvious when one considers
> > > > > > > > > the point I made earlier in this mail thread that 'setcap
> > =p' is
> > > > > > > > > setting *64* bits in the file's permittted set):
> > > > > > > > >
> > > > > > > > >         # Set "all" permitted capabilities on a file:
> > > > > > > > >
> > > > > > > > >         $ sudo setcap =p myprog
> > > > > > > > >         $ getcap myprog
> > > > > > > > >         myprog =p
> > > > > > > > >
> > > > > > > > >         # Set "all" 38 permitted capabilities on a file, by
> > specifying
> > > > > > > > >         # them individually:
> > > > > > > > >
> > > > > > > > >         $ sudo setcap 0$(for c in $(seq 1 37); do \
> > > > > > > > >         echo -n ",$c"; done)=p myprog
> > > > > > > > >         $ getcap myprog
> > > > > > > > >
> >  cap_chown,cap_dac_override,cap_dac_read_search,cap_fowner,
> > > > > > > > >         cap_fsetid,cap_setgid,cap_setuid,cap_setpcap,
> > > > > > > > >
> >  cap_linux_immutable,cap_net_bind_service,cap_net_broadcast,
> > > > > > > > >
> >  cap_net_admin,cap_net_raw,cap_ipc_lock,cap_ipc_owner,cap_kill,
> > > > > > > > >
> >  cap_sys_module,cap_sys_rawio,cap_sys_chroot,cap_sys_ptrace,
> > > > > > > > >
> >  cap_sys_pacct,cap_sys_admin,cap_sys_boot,cap_sys_nice,
> > > > > > > > >
> >  cap_sys_resource,cap_sys_time,cap_sys_tty_config,cap_mknod,
> > > > > > > > >
> >  cap_lease,cap_audit_write,cap_audit_control,cap_setfcap,
> > > > > > > > >
> >  cap_mac_override,cap_mac_admin,cap_syslog,cap_wake_alarm,
> > > > > > > > >         cap_block_suspend,cap_audit_read+p
> > > > > > > > >
> > > > > > > > > I think there would be few users who would *not* be surprised
> > > > > > > > > about the fact that two steps that seem equivalent produce
> > > > > > > > > quite differnt output from getcap(8)!
> > > > > > > > >
> > > > > > > > > 3. Suppose I set all permitted capabilities on a binary:
> > > > > > > > >
> > > > > > > > >         $ sudo setcap =p myprog
> > > > > > > > >
> > > > > > > > > Then actually, I have set not just the 38 existing
> > capabilities,
> > > > > > > > > but also 26 future capabilities, so that when
> > "cap_self_destruct"
> > > > > > > > > is added to the kernel, 'myprog' already has it. This seems
> > to
> > > > > > > > > run directly counter to your argument above (if I have
> > understood
> > > > > > > > > it correctly).
> > > > > > > > >
> > > > > > > > > My summary from the above:
> > > > > > > > >
> > > > > > > > > * The output notation from getpcaps(8) is (1) inconsistent
> > (with
> > > > > > > > > getcap(8)), and (2) difficult to read, two things that
> > strike me
> > > > > > > > > as risk factors in a security-related notation.
> > > > > > > > >
> > > > > > > > > * The fact that "setcap =p ..." sets the top 26 (currently
> > unused)
> > > > > > > > > bits in the permitted set is surprising, and perhaps also a
> > > > > > > > > security risk when new capabilities are actually added to the
> > > > > > > > > kernel, since existing binaries will automatically have those
> > > > > > > > > capabilities.
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > >
> > > > > > > > > Michael
> > > > > > > > >
> > > > > > > > > [*] I often joke that the cap_to_text(3) notation is one
> > that is
> > > > > > > > > "human-readable, but not necessarily human-comprehensible",
> > but
> > > > > > > > > at the same time I also note that the notation has one
> > virtue:
> > > > > > > > > it is compact. However, that one virtue seems to have gone
> > out
> > > > > > > > > the window for getpcaps(8).
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > Michael Kerrisk
> > > > > > > > > Linux man-pages maintainer;
> > http://www.kernel.org/doc/man-pages/
> > > > > > > > > Linux/UNIX System Programming Training:
> > http://man7.org/training/
> > > > > > > > >
> >

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox