[PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
       [not found] <20180227004121.3633-1-mic@digikod.net>
@ 2018-02-27  0:41 ` Mickaël Salaün
  2018-02-27  0:57   ` Al Viro
                     ` (2 more replies)
  2018-02-27  0:41 ` [PATCH bpf-next v8 02/11] fs,security: Add a new file access type: MAY_CHROOT Mickaël Salaün
       [not found] ` <20180227004121.3633-6-mic@digikod.net>
  2 siblings, 3 replies; 10+ messages in thread
From: Mickaël Salaün @ 2018-02-27  0:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, Jonathan Corbet, Michael Kerrisk,
	Kees Cook, Paul Moore, Sargun Dhillon, Serge E . Hallyn,
	Shuah Khan, Tejun Heo, Thomas Graf, Tycho Andersen, Will Drewry,
	kernel-hardening, linux-api, linux-security-module, netdev,
	Alexander Viro, James Morris, John Johansen, Stephen Smalley,
	Tetsuo Handa, linux-fsdevel

The function current_nameidata_security(struct inode *) can be used to
retrieve a blob's pointer address tied to the inode being walk through.
This enable to follow a path lookup and know where an inode access come
from. This is needed for the Landlock LSM to be able to restrict access
to file path.

The LSM hook nameidata_free_security(struct inode *) is called before
freeing the associated nameidata.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: James Morris <jmorris@namei.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/namei.c                | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/lsm_hooks.h |  7 +++++++
 include/linux/namei.h     | 14 +++++++++++++-
 include/linux/security.h  |  7 +++++++
 security/security.c       |  7 +++++++
 5 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 921ae32dbc80..d592b3fb0d1e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,6 +505,9 @@ struct nameidata {
 	struct inode	*link_inode;
 	unsigned	root_seq;
 	int		dfd;
+#ifdef CONFIG_SECURITY
+	struct nameidata_lookup lookup;
+#endif
 } __randomize_layout;
 
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
@@ -515,6 +518,9 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->name = name;
 	p->total_link_count = old ? old->total_link_count : 0;
 	p->saved = old;
+#ifdef CONFIG_SECURITY
+	p->lookup.security = NULL;
+#endif
 	current->nameidata = p;
 }
 
@@ -522,6 +528,7 @@ static void restore_nameidata(void)
 {
 	struct nameidata *now = current->nameidata, *old = now->saved;
 
+	security_nameidata_put_lookup(&now->lookup, now->inode);
 	current->nameidata = old;
 	if (old)
 		old->total_link_count = now->total_link_count;
@@ -549,6 +556,27 @@ static int __nd_alloc_stack(struct nameidata *nd)
 	return 0;
 }
 
+#ifdef CONFIG_SECURITY
+/**
+ * current_nameidata_lookup - get the state of the current path walk
+ *
+ * @inode: inode associated to the path walk
+ *
+ * Used by LSM modules for access restriction based on path walk. The LSM is in
+ * charge of the lookup->security blob allocation and management. The hook
+ * security_nameidata_put_lookup() will be called after the path walk end.
+ *
+ * Return ERR_PTR(-ENOENT) if there is no match.
+ */
+struct nameidata_lookup *current_nameidata_lookup(const struct inode *inode)
+{
+	if (!current->nameidata || current->nameidata->inode != inode)
+		return ERR_PTR(-ENOENT);
+	return &current->nameidata->lookup;
+}
+EXPORT_SYMBOL(current_nameidata_lookup);
+#endif
+
 /**
  * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
  * @path: nameidate to verify
@@ -2009,6 +2037,13 @@ static inline u64 hash_name(const void *salt, const char *name)
 
 #endif
 
+static inline void refresh_lookup(struct nameidata *nd)
+{
+#ifdef CONFIG_SECURITY
+	nd->lookup.type = nd->last_type;
+#endif
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -2025,6 +2060,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		name++;
 	if (!*name)
 		return 0;
+	/* be ready for may_lookup() */
+	refresh_lookup(nd);
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
@@ -2064,6 +2101,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		nd->last.hash_len = hash_len;
 		nd->last.name = name;
 		nd->last_type = type;
+		/* be ready for the next security_inode_permission() */
+		refresh_lookup(nd);
 
 		name += hashlen_len(hash_len);
 		if (!*name)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7161d8e7ee79..d71cf183f0be 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -428,6 +428,10 @@
  *	security module does not know about attribute or a negative error code
  *	to abort the copy up. Note that the caller is responsible for reading
  *	and writing the xattrs as this hook is merely a filter.
+ * @nameidata_put_lookup:
+ *	Deallocate and clear the current's nameidata->lookup.security field.
+ *	@lookup->security contains the security structure to be freed.
+ *	@inode is the last associated inode to the path walk
  *
  * Security hooks for file operations
  *
@@ -1514,6 +1518,8 @@ union security_list_options {
 	void (*inode_getsecid)(struct inode *inode, u32 *secid);
 	int (*inode_copy_up)(struct dentry *src, struct cred **new);
 	int (*inode_copy_up_xattr)(const char *name);
+	void (*nameidata_put_lookup)(struct nameidata_lookup *lookup,
+					struct inode *inode);
 
 	int (*file_permission)(struct file *file, int mask);
 	int (*file_alloc_security)(struct file *file);
@@ -1805,6 +1811,7 @@ struct security_hook_heads {
 	struct list_head inode_getsecid;
 	struct list_head inode_copy_up;
 	struct list_head inode_copy_up_xattr;
+	struct list_head nameidata_put_lookup;
 	struct list_head file_permission;
 	struct list_head file_alloc_security;
 	struct list_head file_free_security;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a982bb7cd480..ba08cbb41f97 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -14,7 +14,19 @@ enum { MAX_NESTED_LINKS = 8 };
 /*
  * Type of the last component on LOOKUP_PARENT
  */
-enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+enum namei_type {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+
+#ifdef CONFIG_SECURITY
+struct nameidata_lookup {
+	void *security;
+	enum namei_type type;
+};
+
+struct inode;
+
+extern struct nameidata_lookup *current_nameidata_lookup(
+		const struct inode *inode);
+#endif
 
 /*
  * The bitmask for a lookup event:
diff --git a/include/linux/security.h b/include/linux/security.h
index 73f1ef625d40..b1fd4370daf8 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,6 +31,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 
 struct linux_binprm;
 struct cred;
@@ -302,6 +303,8 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer
 void security_inode_getsecid(struct inode *inode, u32 *secid);
 int security_inode_copy_up(struct dentry *src, struct cred **new);
 int security_inode_copy_up_xattr(const char *name);
+void security_nameidata_put_lookup(struct nameidata_lookup *lookup,
+					struct inode *inode);
 int security_file_permission(struct file *file, int mask);
 int security_file_alloc(struct file *file);
 void security_file_free(struct file *file);
@@ -808,6 +811,10 @@ static inline int security_inode_copy_up_xattr(const char *name)
 	return -EOPNOTSUPP;
 }
 
+static inline void security_nameidata_put_lookup(
+		struct nameidata_lookup *lookup, struct inode *inode)
+{ }
+
 static inline int security_file_permission(struct file *file, int mask)
 {
 	return 0;
diff --git a/security/security.c b/security/security.c
index 1cd8526cb0b7..17053c7a1a77 100644
--- a/security/security.c
+++ b/security/security.c
@@ -857,6 +857,13 @@ int security_inode_copy_up_xattr(const char *name)
 }
 EXPORT_SYMBOL(security_inode_copy_up_xattr);
 
+void security_nameidata_put_lookup(struct nameidata_lookup *lookup,
+					struct inode *inode)
+{
+	call_void_hook(nameidata_put_lookup, lookup, inode);
+}
+EXPORT_SYMBOL(security_nameidata_put_lookup);
+
 int security_file_permission(struct file *file, int mask)
 {
 	int ret;
-- 
2.16.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH bpf-next v8 02/11] fs,security: Add a new file access type: MAY_CHROOT
       [not found] <20180227004121.3633-1-mic@digikod.net>
  2018-02-27  0:41 ` [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata Mickaël Salaün
@ 2018-02-27  0:41 ` Mickaël Salaün
       [not found] ` <20180227004121.3633-6-mic@digikod.net>
  2 siblings, 0 replies; 10+ messages in thread
From: Mickaël Salaün @ 2018-02-27  0:41 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, Jonathan Corbet, Michael Kerrisk,
	Kees Cook, Paul Moore, Sargun Dhillon, Serge E . Hallyn,
	Shuah Khan, Tejun Heo, Thomas Graf, Tycho Andersen, Will Drewry,
	kernel-hardening, linux-api, linux-security-module, netdev,
	Alexander Viro, James Morris, John Johansen, Stephen Smalley,
	Tetsuo Handa, linux-fsdevel

For compatibility reason, MAY_CHROOT is always set with MAY_CHDIR.
However, this new flag enable to differentiate a chdir form a chroot.

This is needed for the Landlock LSM to be able to evaluate a new root
directory.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: James Morris <jmorris@namei.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/open.c          | 3 ++-
 include/linux/fs.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index 7ea118471dce..084d147c0e96 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -489,7 +489,8 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR |
+			MAY_CHROOT);
 	if (error)
 		goto dput_and_out;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a815560fda0..67c62374446c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -90,6 +90,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define MAY_CHDIR		0x00000040
 /* called from RCU mode, don't block */
 #define MAY_NOT_BLOCK		0x00000080
+#define MAY_CHROOT		0x00000100
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
-- 
2.16.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
  2018-02-27  0:41 ` [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata Mickaël Salaün
@ 2018-02-27  0:57   ` Al Viro
  2018-02-27  1:23     ` Al Viro
  2018-02-28 16:27   ` kbuild test robot
  2018-02-28 16:58   ` kbuild test robot
  2 siblings, 1 reply; 10+ messages in thread
From: Al Viro @ 2018-02-27  0:57 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, Jonathan Corbet, Michael Kerrisk,
	Kees Cook, Paul Moore, Sargun Dhillon, Serge E . Hallyn,
	Shuah Khan, Tejun Heo, Thomas Graf, Tycho Andersen, Will Drewry,
	kernel-hardening, linux-api, linux-security-module, netdev,
	James Morris, John Johansen, Stephen Smalley, Tetsuo Handa,
	linux-fsdevel

On Tue, Feb 27, 2018 at 01:41:11AM +0100, Mickaï¿½l Salaï¿½n wrote:
> The function current_nameidata_security(struct inode *) can be used to
> retrieve a blob's pointer address tied to the inode being walk through.
> This enable to follow a path lookup and know where an inode access come
> from. This is needed for the Landlock LSM to be able to restrict access
> to file path.
> 
> The LSM hook nameidata_free_security(struct inode *) is called before
> freeing the associated nameidata.

NAK.  Not without well-defined semantics and "some Linux S&M uses that for
something, don't ask what" does not count.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
  2018-02-27  0:57   ` Al Viro
@ 2018-02-27  1:23     ` Al Viro
  2018-03-11 20:14       ` Mickaël Salaün
  0 siblings, 1 reply; 10+ messages in thread
From: Al Viro @ 2018-02-27  1:23 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, Jonathan Corbet, Michael Kerrisk,
	Kees Cook, Paul Moore, Sargun Dhillon, Serge E . Hallyn,
	Shuah Khan, Tejun Heo, Thomas Graf, Tycho Andersen, Will Drewry,
	kernel-hardening, linux-api, linux-security-module, netdev,
	James Morris, John Johansen, Stephen Smalley, Tetsuo Handa,
	linux-fsdevel

On Tue, Feb 27, 2018 at 12:57:21AM +0000, Al Viro wrote:
> On Tue, Feb 27, 2018 at 01:41:11AM +0100, Mickaï¿½l Salaï¿½n wrote:
> > The function current_nameidata_security(struct inode *) can be used to
> > retrieve a blob's pointer address tied to the inode being walk through.
> > This enable to follow a path lookup and know where an inode access come
> > from. This is needed for the Landlock LSM to be able to restrict access
> > to file path.
> > 
> > The LSM hook nameidata_free_security(struct inode *) is called before
> > freeing the associated nameidata.
> 
> NAK.  Not without well-defined semantics and "some Linux S&M uses that for
> something, don't ask what" does not count.

Incidentally, pathwalk mechanics is subject to change at zero notice, so
if you want something, you'd better
	* have explicitly defined semantics
	* explain what it is - on fsdevel
	* not have it hidden behind the layers of opaque LSM dreck, pardon
the redundance.

Again, pathwalk internals have changed in the past and may bloody well
change again in the future.  There's a damn good reason why struct nameidata
is _not_ visible outside of fs/namei.c, and quietly relying upon any
implementation details is no-go.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
  2018-02-27  0:41 ` [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata Mickaël Salaün
  2018-02-27  0:57   ` Al Viro
@ 2018-02-28 16:27   ` kbuild test robot
  2018-02-28 16:58   ` kbuild test robot
  2 siblings, 0 replies; 10+ messages in thread
From: kbuild test robot @ 2018-02-28 16:27 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: kbuild-all, linux-kernel, Mickaël Salaün,
	Alexei Starovoitov, Andy Lutomirski, Arnaldo Carvalho de Melo,
	Casey Schaufler, Daniel Borkmann, David Drysdale,
	David S . Miller, Eric W . Biederman, James Morris, Jann Horn,
	Jonathan Corbet, Michael Kerrisk, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Shuah Khan, Tejun Heo,
	Thomas Graf, Tycho Andersen, Will Drewry, kernel-hardening,
	linux-api, linux-security-module, netdev, Alexander Viro,
	James Morris, John Johansen, Stephen Smalley, Tetsuo Handa,
	linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1202 bytes --]

Hi Mickaï¿½l,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Micka-l-Sala-n/Landlock-LSM-Toward-unprivileged-sandboxing/20180228-233659
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-x017-201808 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   In file included from fs//quota/dquot.c:74:0:
>> include/linux/security.h:815:10: warning: 'struct nameidata_lookup' declared inside parameter list will not be visible outside of this definition or declaration
      struct nameidata_lookup *lookup, struct inode *inode)
             ^~~~~~~~~~~~~~~~

vim +815 include/linux/security.h

   813	
   814	static inline void security_nameidata_put_lookup(
 > 815			struct nameidata_lookup *lookup, struct inode *inode)
   816	{ }
   817	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26738 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
  2018-02-27  0:41 ` [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata Mickaël Salaün
  2018-02-27  0:57   ` Al Viro
  2018-02-28 16:27   ` kbuild test robot
@ 2018-02-28 16:58   ` kbuild test robot
  2 siblings, 0 replies; 10+ messages in thread
From: kbuild test robot @ 2018-02-28 16:58 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: kbuild-all, linux-kernel, Mickaël Salaün,
	Alexei Starovoitov, Andy Lutomirski, Arnaldo Carvalho de Melo,
	Casey Schaufler, Daniel Borkmann, David Drysdale,
	David S . Miller, Eric W . Biederman, James Morris, Jann Horn,
	Jonathan Corbet, Michael Kerrisk, Kees Cook, Paul Moore,
	Sargun Dhillon, Serge E . Hallyn, Shuah Khan, Tejun Heo,
	Thomas Graf, Tycho Andersen, Will Drewry, kernel-hardening,
	linux-api, linux-security-module, netdev, Alexander Viro,
	James Morris, John Johansen, Stephen Smalley, Tetsuo Handa,
	linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2549 bytes --]

Hi Mickaï¿½l,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Micka-l-Sala-n/Landlock-LSM-Toward-unprivileged-sandboxing/20180228-233659
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-randconfig-a1-201808 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from init/main.c:37:0:
>> include/linux/security.h:815:43: warning: 'struct nameidata_lookup' declared inside parameter list
      struct nameidata_lookup *lookup, struct inode *inode)
                                              ^
>> include/linux/security.h:815:43: warning: its scope is only this definition or declaration, which is probably not what you want
--
   In file included from fs/namei.c:27:0:
>> include/linux/security.h:815:43: warning: 'struct nameidata_lookup' declared inside parameter list
      struct nameidata_lookup *lookup, struct inode *inode)
                                              ^
>> include/linux/security.h:815:43: warning: its scope is only this definition or declaration, which is probably not what you want
   fs/namei.c: In function 'restore_nameidata':
   fs/namei.c:531:36: error: 'struct nameidata' has no member named 'lookup'
     security_nameidata_put_lookup(&now->lookup, now->inode);
                                       ^
--
   In file included from include/linux/lsm_hooks.h:28:0,
                    from security/commoncap.c:15:
>> include/linux/security.h:815:43: warning: 'struct nameidata_lookup' declared inside parameter list
      struct nameidata_lookup *lookup, struct inode *inode)
                                              ^
>> include/linux/security.h:815:43: warning: its scope is only this definition or declaration, which is probably not what you want
   In file included from security/commoncap.c:15:0:
>> include/linux/lsm_hooks.h:1522:13: warning: 'struct nameidata_lookup' declared inside parameter list
         struct inode *inode);
                ^

vim +815 include/linux/security.h

   813	
   814	static inline void security_nameidata_put_lookup(
 > 815			struct nameidata_lookup *lookup, struct inode *inode)
   816	{ }
   817	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29018 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata
  2018-02-27  1:23     ` Al Viro
@ 2018-03-11 20:14       ` Mickaël Salaün
  0 siblings, 0 replies; 10+ messages in thread
From: Mickaël Salaün @ 2018-03-11 20:14 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Arnaldo Carvalho de Melo, Casey Schaufler, Daniel Borkmann,
	David Drysdale, David S . Miller, Eric W . Biederman,
	James Morris, Jann Horn, Jonathan Corbet, Michael Kerrisk,
	Kees Cook, Paul Moore, Sargun Dhillon, Serge E . Hallyn,
	Shuah Khan, Tejun Heo, Thomas Graf, Tycho Andersen, Will Drewry,
	kernel-hardening, linux-api, linux-security-module, netdev,
	James Morris, John Johansen, Stephen Smalley, Tetsuo Handa,
	linux-fsdevel

[-- Attachment #1.1: Type: text/plain, Size: 3655 bytes --]

On 02/27/2018 02:23 AM, Al Viro wrote:
> On Tue, Feb 27, 2018 at 12:57:21AM +0000, Al Viro wrote:
>> On Tue, Feb 27, 2018 at 01:41:11AM +0100, Mickaël Salaün wrote:
>>> The function current_nameidata_security(struct inode *) can be used to
>>> retrieve a blob's pointer address tied to the inode being walk through.
>>> This enable to follow a path lookup and know where an inode access come
>>> from. This is needed for the Landlock LSM to be able to restrict access
>>> to file path.
>>>
>>> The LSM hook nameidata_free_security(struct inode *) is called before
>>> freeing the associated nameidata.
>>
>> NAK.  Not without well-defined semantics and "some Linux S&M uses that for
>> something, don't ask what" does not count.
> 
> Incidentally, pathwalk mechanics is subject to change at zero notice, so
> if you want something, you'd better
> 	* have explicitly defined semantics
> 	* explain what it is - on fsdevel
> 	* not have it hidden behind the layers of opaque LSM dreck, pardon
> the redundance.
> 
> Again, pathwalk internals have changed in the past and may bloody well
> change again in the future.  There's a damn good reason why struct nameidata
> is _not_ visible outside of fs/namei.c, and quietly relying upon any
> implementation details is no-go.
> 

I thought this whole patch series would go to linux-fsdevel but only
this patch did. I'll CCed fsdevel for the next round. Meanwhile, the
cover letter is here: https://lkml.org/lkml/2018/2/26/1214
The code using current_nameidata_lookup(inode) is in the patch 07/11:
https://lkml.org/lkml/2018/2/26/1206

To sum up, I don't know any way to identify if a directory (execute)
access was directly requested by a process or inferred by the kernel
because of a path walk. This was not needed until now because the other
access control systems (either the DAC or access controls enforced by
inode-based LSM, i.e. SELinux and Smack) do not care about the file
hierarchy. Path-based access controls (i.e. AppArmor and Tomoyo)
directly use the notion of path to define a security policy (in the
kernel, not only in the user space configuration). Landlock can't rely
on xattrs (because of composed and unprivileged access control). Because
we can't know for sure from which path an inode come from (if any),
path-based LSM hooks do not help for some file system checks (e.g.
inode_permission). With Landlock, I try to find a way to identify a set
of inodes, from the user space point of view, which is most of the time
related to file hierarchies.

I needed a way to "follow" a path walk, with the minimum amount of code,
and if possible without touching the fs/namei.c . I saw that the
pathwalk mechanism has evolved over time. With this patch, I tried to
make a kernel object (nameidata) usable in some way by LSM, but only
through an inode (current_nameidata_lookup(inode)). The "only" guarantee
of this function should be to identify if an inode is tied to a path
walk. This enable to follow a path walk and know why an inode access is
requested.

I get your concern about the "instability" of the path walk mechanism.
However, I though that a path resolution should not change from the user
space point of view, like other Linux ABI. Anyway, all the current
inode-based access controls, including DAC, rely on this path walks
mechanism. This patch does not expose anything to user space, but only
through the API of Landlock, which is currently relying on path walk
resolutions, already visible to user space. Did I miss something? Do you
have another suggestion to tie an inode to a path walk?

Thanks,
 Mickaël

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 05/11] seccomp,landlock: Enforce Landlock programs per process hierarchy
       [not found]                   ` <CALCETrViaXEx1iQ6q8bEEWSLchj=FH6LjcRY6+hjMx8A+rtgDQ@mail.gmail.com>
@ 2018-04-08 22:01                     ` Mickaël Salaün
  2018-04-10  4:48                       ` Alexei Starovoitov
  0 siblings, 1 reply; 10+ messages in thread
From: Mickaël Salaün @ 2018-04-08 22:01 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Daniel Borkmann, LKML, Alexei Starovoitov,
	Arnaldo Carvalho de Melo, Casey Schaufler, David Drysdale,
	David S . Miller, Eric W . Biederman, Jann Horn, Jonathan Corbet,
	Michael Kerrisk, Kees Cook, Paul Moore, Sargun Dhillon,
	Serge E . Hallyn, Shuah Khan, Tejun Heo, Thomas Graf,
	Tycho Andersen, Will Drewry, Kernel Hardening, Linux API,
	LSM List, Network Development, Andrew Morton, Al Viro,
	Linux-Fsdevel


[-- Attachment #1.1: Type: text/plain, Size: 22278 bytes --]


On 04/08/2018 11:06 PM, Andy Lutomirski wrote:
> On Sun, Apr 8, 2018 at 6:13 AM, Mickaël Salaün <mic@digikod.net> wrote:
>>
>> On 02/27/2018 10:48 PM, Mickaël Salaün wrote:
>>>
>>> On 27/02/2018 17:39, Andy Lutomirski wrote:
>>>> On Tue, Feb 27, 2018 at 5:32 AM, Alexei Starovoitov
>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>> On Tue, Feb 27, 2018 at 05:20:55AM +0000, Andy Lutomirski wrote:
>>>>>> On Tue, Feb 27, 2018 at 4:54 AM, Alexei Starovoitov
>>>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>>> On Tue, Feb 27, 2018 at 04:40:34AM +0000, Andy Lutomirski wrote:
>>>>>>>> On Tue, Feb 27, 2018 at 2:08 AM, Alexei Starovoitov
>>>>>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>>>>> On Tue, Feb 27, 2018 at 01:41:15AM +0100, Mickaël Salaün wrote:
>>>>>>>>>> The seccomp(2) syscall can be used by a task to apply a Landlock program
>>>>>>>>>> to itself. As a seccomp filter, a Landlock program is enforced for the
>>>>>>>>>> current task and all its future children. A program is immutable and a
>>>>>>>>>> task can only add new restricting programs to itself, forming a list of
>>>>>>>>>> programss.
>>>>>>>>>>
>>>>>>>>>> A Landlock program is tied to a Landlock hook. If the action on a kernel
>>>>>>>>>> object is allowed by the other Linux security mechanisms (e.g. DAC,
>>>>>>>>>> capabilities, other LSM), then a Landlock hook related to this kind of
>>>>>>>>>> object is triggered. The list of programs for this hook is then
>>>>>>>>>> evaluated. Each program return a 32-bit value which can deny the action
>>>>>>>>>> on a kernel object with a non-zero value. If every programs of the list
>>>>>>>>>> return zero, then the action on the object is allowed.
>>>>>>>>>>
>>>>>>>>>> Multiple Landlock programs can be chained to share a 64-bits value for a
>>>>>>>>>> call chain (e.g. evaluating multiple elements of a file path).  This
>>>>>>>>>> chaining is restricted when a process construct this chain by loading a
>>>>>>>>>> program, but additional checks are performed when it requests to apply
>>>>>>>>>> this chain of programs to itself.  The restrictions ensure that it is
>>>>>>>>>> not possible to call multiple programs in a way that would imply to
>>>>>>>>>> handle multiple shared values (i.e. cookies) for one chain.  For now,
>>>>>>>>>> only a fs_pick program can be chained to the same type of program,
>>>>>>>>>> because it may make sense if they have different triggers (cf. next
>>>>>>>>>> commits).  This restrictions still allows to reuse Landlock programs in
>>>>>>>>>> a safe way (e.g. use the same loaded fs_walk program with multiple
>>>>>>>>>> chains of fs_pick programs).
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>>>>>>>>>
>>>>>>>>> ...
>>>>>>>>>
>>>>>>>>>> +struct landlock_prog_set *landlock_prepend_prog(
>>>>>>>>>> +             struct landlock_prog_set *current_prog_set,
>>>>>>>>>> +             struct bpf_prog *prog)
>>>>>>>>>> +{
>>>>>>>>>> +     struct landlock_prog_set *new_prog_set = current_prog_set;
>>>>>>>>>> +     unsigned long pages;
>>>>>>>>>> +     int err;
>>>>>>>>>> +     size_t i;
>>>>>>>>>> +     struct landlock_prog_set tmp_prog_set = {};
>>>>>>>>>> +
>>>>>>>>>> +     if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
>>>>>>>>>> +             return ERR_PTR(-EINVAL);
>>>>>>>>>> +
>>>>>>>>>> +     /* validate memory size allocation */
>>>>>>>>>> +     pages = prog->pages;
>>>>>>>>>> +     if (current_prog_set) {
>>>>>>>>>> +             size_t i;
>>>>>>>>>> +
>>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(current_prog_set->programs); i++) {
>>>>>>>>>> +                     struct landlock_prog_list *walker_p;
>>>>>>>>>> +
>>>>>>>>>> +                     for (walker_p = current_prog_set->programs[i];
>>>>>>>>>> +                                     walker_p; walker_p = walker_p->prev)
>>>>>>>>>> +                             pages += walker_p->prog->pages;
>>>>>>>>>> +             }
>>>>>>>>>> +             /* count a struct landlock_prog_set if we need to allocate one */
>>>>>>>>>> +             if (refcount_read(&current_prog_set->usage) != 1)
>>>>>>>>>> +                     pages += round_up(sizeof(*current_prog_set), PAGE_SIZE)
>>>>>>>>>> +                             / PAGE_SIZE;
>>>>>>>>>> +     }
>>>>>>>>>> +     if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
>>>>>>>>>> +             return ERR_PTR(-E2BIG);
>>>>>>>>>> +
>>>>>>>>>> +     /* ensure early that we can allocate enough memory for the new
>>>>>>>>>> +      * prog_lists */
>>>>>>>>>> +     err = store_landlock_prog(&tmp_prog_set, current_prog_set, prog);
>>>>>>>>>> +     if (err)
>>>>>>>>>> +             return ERR_PTR(err);
>>>>>>>>>> +
>>>>>>>>>> +     /*
>>>>>>>>>> +      * Each task_struct points to an array of prog list pointers.  These
>>>>>>>>>> +      * tables are duplicated when additions are made (which means each
>>>>>>>>>> +      * table needs to be refcounted for the processes using it). When a new
>>>>>>>>>> +      * table is created, all the refcounters on the prog_list are bumped (to
>>>>>>>>>> +      * track each table that references the prog). When a new prog is
>>>>>>>>>> +      * added, it's just prepended to the list for the new table to point
>>>>>>>>>> +      * at.
>>>>>>>>>> +      *
>>>>>>>>>> +      * Manage all the possible errors before this step to not uselessly
>>>>>>>>>> +      * duplicate current_prog_set and avoid a rollback.
>>>>>>>>>> +      */
>>>>>>>>>> +     if (!new_prog_set) {
>>>>>>>>>> +             /*
>>>>>>>>>> +              * If there is no Landlock program set used by the current task,
>>>>>>>>>> +              * then create a new one.
>>>>>>>>>> +              */
>>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
>>>>>>>>>> +             if (IS_ERR(new_prog_set))
>>>>>>>>>> +                     goto put_tmp_lists;
>>>>>>>>>> +     } else if (refcount_read(&current_prog_set->usage) > 1) {
>>>>>>>>>> +             /*
>>>>>>>>>> +              * If the current task is not the sole user of its Landlock
>>>>>>>>>> +              * program set, then duplicate them.
>>>>>>>>>> +              */
>>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
>>>>>>>>>> +             if (IS_ERR(new_prog_set))
>>>>>>>>>> +                     goto put_tmp_lists;
>>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(new_prog_set->programs); i++) {
>>>>>>>>>> +                     new_prog_set->programs[i] =
>>>>>>>>>> +                             READ_ONCE(current_prog_set->programs[i]);
>>>>>>>>>> +                     if (new_prog_set->programs[i])
>>>>>>>>>> +                             refcount_inc(&new_prog_set->programs[i]->usage);
>>>>>>>>>> +             }
>>>>>>>>>> +
>>>>>>>>>> +             /*
>>>>>>>>>> +              * Landlock program set from the current task will not be freed
>>>>>>>>>> +              * here because the usage is strictly greater than 1. It is
>>>>>>>>>> +              * only prevented to be freed by another task thanks to the
>>>>>>>>>> +              * caller of landlock_prepend_prog() which should be locked if
>>>>>>>>>> +              * needed.
>>>>>>>>>> +              */
>>>>>>>>>> +             landlock_put_prog_set(current_prog_set);
>>>>>>>>>> +     }
>>>>>>>>>> +
>>>>>>>>>> +     /* prepend tmp_prog_set to new_prog_set */
>>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++) {
>>>>>>>>>> +             /* get the last new list */
>>>>>>>>>> +             struct landlock_prog_list *last_list =
>>>>>>>>>> +                     tmp_prog_set.programs[i];
>>>>>>>>>> +
>>>>>>>>>> +             if (last_list) {
>>>>>>>>>> +                     while (last_list->prev)
>>>>>>>>>> +                             last_list = last_list->prev;
>>>>>>>>>> +                     /* no need to increment usage (pointer replacement) */
>>>>>>>>>> +                     last_list->prev = new_prog_set->programs[i];
>>>>>>>>>> +                     new_prog_set->programs[i] = tmp_prog_set.programs[i];
>>>>>>>>>> +             }
>>>>>>>>>> +     }
>>>>>>>>>> +     new_prog_set->chain_last = tmp_prog_set.chain_last;
>>>>>>>>>> +     return new_prog_set;
>>>>>>>>>> +
>>>>>>>>>> +put_tmp_lists:
>>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++)
>>>>>>>>>> +             put_landlock_prog_list(tmp_prog_set.programs[i]);
>>>>>>>>>> +     return new_prog_set;
>>>>>>>>>> +}
>>>>>>>>>
>>>>>>>>> Nack on the chaining concept.
>>>>>>>>> Please do not reinvent the wheel.
>>>>>>>>> There is an existing mechanism for attaching/detaching/quering multiple
>>>>>>>>> programs attached to cgroup and tracing hooks that are also
>>>>>>>>> efficiently executed via BPF_PROG_RUN_ARRAY.
>>>>>>>>> Please use that instead.
>>>>>>>>>
>>>>>>>>
>>>>>>>> I don't see how that would help.  Suppose you add a filter, then
>>>>>>>> fork(), and then the child adds another filter.  Do you want to
>>>>>>>> duplicate the entire array?  You certainly can't *modify* the array
>>>>>>>> because you'll affect processes that shouldn't be affected.
>>>>>>>>
>>>>>>>> In contrast, doing this through seccomp like the earlier patches
>>>>>>>> seemed just fine to me, and seccomp already had the right logic.
>>>>>>>
>>>>>>> it doesn't look to me that existing seccomp side of managing fork
>>>>>>> situation can be reused. Here there is an attempt to add 'chaining'
>>>>>>> concept which sort of an extension of existing seccomp style,
>>>>>>> but somehow heavily done on bpf side and contradicts cgroup/tracing.
>>>>>>>
>>>>>>
>>>>>> I don't see why the seccomp way can't be used.  I agree with you that
>>>>>> the seccomp *style* shouldn't be used in bpf code like this, but I
>>>>>> think that Landlock programs can and should just live in the existing
>>>>>> seccomp chain.  If the existing seccomp code needs some modification
>>>>>> to make this work, then so be it.
>>>>>
>>>>> +1
>>>>> if that was the case...
>>>>> but that's not my reading of the patch set.
>>>>
>>>> An earlier version of the patch set used the seccomp filter chain.
>>>> Mickaël, what exactly was wrong with that approach other than that the
>>>> seccomp() syscall was awkward for you to use?  You could add a
>>>> seccomp_add_landlock_rule() syscall if you needed to.
>>>
>>> Nothing was wrong about about that, this part did not changed (see my
>>> next comment).
>>>
>>>>
>>>> As a side comment, why is this an LSM at all, let alone a non-stacking
>>>> LSM?  It would make a lot more sense to me to make Landlock depend on
>>>> having LSMs configured in but to call the landlock hooks directly from
>>>> the security_xyz() hooks.
>>>
>>> See Casey's answer and his patch series: https://lwn.net/Articles/741963/
>>>
>>>>
>>>>>
>>>>>> In other words, the kernel already has two kinds of chaining:
>>>>>> seccomp's and bpf's.  bpf's doesn't work right for this type of usage
>>>>>> across fork(), whereas seccomp's already handles that case correctly.
>>>>>> (In contrast, seccomp's is totally wrong for cgroup-attached filters.)
>>>>>>  So IMO Landlock should use the seccomp core code and call into bpf
>>>>>> for the actual filtering.
>>>>>
>>>>> +1
>>>>> in cgroup we had to invent this new BPF_PROG_RUN_ARRAY mechanism,
>>>>> since cgroup hierarchy can be complicated with bpf progs attached
>>>>> at different levels with different override/multiprog properties,
>>>>> so walking link list and checking all flags at run-time would have
>>>>> been too slow. That's why we added compute_effective_progs().
>>>>
>>>> If we start adding override flags to Landlock, I think we're doing it
>>>> wrong.   With cgroup bpf programs, the whole mess is set up by the
>>>> administrator.  With seccomp, and with Landlock if done correctly, it
>>>> *won't* be set up by the administrator, so the chance that everyone
>>>> gets all the flags right is about zero.  All attached filters should
>>>> run unconditionally.
>>>
>>>
>>> There is a misunderstanding about this chaining mechanism. This should
>>> not be confused with the list of seccomp filters nor the cgroup
>>> hierarchies. Landlock programs can be stacked the same way seccomp's
>>> filters can (cf. struct landlock_prog_set, the "chain_last" field is an
>>> optimization which is not used for this struct handling). This stackable
>>> property did not changed from the previous patch series. The chaining
>>> mechanism is for another use case, which does not make sense for seccomp
>>> filters nor other eBPF program types, at least for now, from what I can
>>> tell.
>>>
>>> You may want to get a look at my talk at FOSDEM
>>> (https://landlock.io/talks/2018-02-04_landlock-fosdem.pdf), especially
>>> slides 11 and 12.
>>>
>>> Let me explain my reasoning about this program chaining thing.
>>>
>>> To check if an action on a file is allowed, we first need to identify
>>> this file and match it to the security policy. In a previous
>>> (non-public) patch series, I tried to use one type of eBPF program to
>>> check every kind of access to a file. To be able to identify a file, I
>>> relied on an eBPF map, similar to the current inode map. This map store
>>> a set of references to file descriptors. I then created a function
>>> bpf_is_file_beneath() to check if the requested file was beneath a file
>>> in the map. This way, no chaining, only one eBPF program type to check
>>> an access to a file... but some issues then emerged. First, this design
>>> create a side-channel which help an attacker using such a program to
>>> infer some information not normally available, for example to get a hint
>>> on where a file descriptor (received from a UNIX socket) come from.
>>> Another issue is that this type of program would be called for each
>>> component of a path. Indeed, when the kernel check if an access to a
>>> file is allowed, it walk through all of the directories in its path
>>> (checking if the current process is allowed to execute them). That first
>>> attempt led me to rethink the way we could filter an access to a file
>>> *path*.
>>>
>>> To minimize the number of called to an eBPF program dedicated to
>>> validate an access to a file path, I decided to create three subtype of
>>> eBPF programs. The FS_WALK type is called when walking through every
>>> directory of a file path (except the last one if it is the target). We
>>> can then restrict this type of program to the minimum set of functions
>>> it is allowed to call and the minimum set of data available from its
>>> context. The first implicit chaining is for this type of program. To be
>>> able to evaluate a path while being called for all its components, this
>>> program need to store a state (to remember what was the parent directory
>>> of this path). There is no "previous" field in the subtype for this
>>> program because it is chained with itself, for each directories. This
>>> enable to create a FS_WALK program to evaluate a file hierarchy, thank
>>> to the inode map which can be used to check if a directory of this
>>> hierarchy is part of an allowed (or denied) list of directories. This
>>> design enables to express a file hierarchy in a programmatic way,
>>> without requiring an eBPF helper to do the job (unlike my first experiment).
>>>
>>> The explicit chaining is used to tied a path evaluation (with a FS_WALK
>>> program) to an access to the actual file being requested (the last
>>> component of a file path), with a FS_PICK program. It is only at this
>>> time that the kernel check for the requested action (e.g. read, write,
>>> chdir, append...). To be able to filter such access request we can have
>>> one call to the same program for every action and let this program check
>>> for which action it was called. However, this design does not allow the
>>> kernel to know if the current action is indeed handled by this program.
>>> Hence, it is not possible to implement a cache mechanism to only call
>>> this program if it knows how to handle this action.
>>>
>>> The approach I took for this FS_PICK type of program is to add to its
>>> subtype which action it can handle (with the "triggers" bitfield, seen
>>> as ORed actions). This way, the kernel knows if a call to a FS_PICK
>>> program is necessary. If the user wants to enforce a different security
>>> policy according to the action requested on a file, then it needs
>>> multiple FS_PICK programs. However, to reduce the number of such
>>> programs, this patch series allow a FS_PICK program to be chained with
>>> another, the same way a FS_WALK is chained with itself. This way, if the
>>> user want to check if the action is a for example an "open" and a "read"
>>> and not a "map" and a "read", then it can chain multiple FS_PICK
>>> programs with different triggers actions. The OR check performed by the
>>> kernel is not a limitation then, only a way to know if a call to an eBPF
>>> program is needed.
>>>
>>> The last type of program is FS_GET. This one is called when a process
>>> get a struct file or change its working directory. This is the only
>>> program type able (and allowed) to tag a file. This restriction is
>>> important to not being subject to resource exhaustion attacks (i.e.
>>> tagging every inode accessible to an attacker, which would allocate too
>>> much kernel memory).
>>>
>>> This design gives room for improvements to create a cache of eBPF
>>> context (input data, including maps if any), with the result of an eBPF
>>> program. This would help limit the number of call to an eBPF program the
>>> same way SELinux or other kernel components do to limit costly checks.
>>>
>>> The eBPF maps of progs are useful to call the same type of eBPF
>>> program. It does not fit with this use case because we may want multiple
>>> eBPF program according to the action requested on a kernel object (e.g.
>>> FS_GET). The other reason is because the eBPF program does not know what
>>> will be the next (type of) access check performed by the kernel.
>>>
>>> To say it another way, this chaining mechanism is a way to split a
>>> kernel object evaluation with multiple specialized programs, each of
>>> them being able to deal with data tied to their type. Using a monolithic
>>> eBPF program to check everything does not scale and does not fit with
>>> unprivileged use either.
>>>
>>> As a side note, the cookie value is only an ephemeral value to keep a
>>> state between multiple programs call. It can be used to create a state
>>> machine for an object evaluation.
>>>
>>> I don't see a way to do an efficient and programmatic path evaluation,
>>> with different access checks, with the current eBPF features. Please let
>>> me know if you know how to do it another way.
>>>
>>
>> Andy, Alexei, Daniel, what do you think about this Landlock program
>> chaining and cookie?
>>
> 
> Can you give a small pseudocode real world example that acutally needs
> chaining?  The mechanism is quite complicated and I'd like to
> understand how it'll be used.
> 

Here is the interesting part from the example (patch 09/11):

+SEC("maps")
+struct bpf_map_def inode_map = {
+	.type = BPF_MAP_TYPE_INODE,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = 20,
+};
+
+SEC("subtype/landlock1")
+static union bpf_prog_subtype _subtype1 = {
+	.landlock_hook = {
+		.type = LANDLOCK_HOOK_FS_WALK,
+	}
+};
+
+static __always_inline __u64 update_cookie(__u64 cookie, __u8 lookup,
+		void *inode, void *chain, bool freeze)
+{
+	__u64 map_allow = 0;
+
+	if (cookie == 0) {
+		cookie = bpf_inode_get_tag(inode, chain);
+		if (cookie)
+			return cookie;
+		/* only look for the first match in the map, ignore nested
+		 * paths in this example */
+		map_allow = bpf_inode_map_lookup(&inode_map, inode);
+		if (map_allow)
+			cookie = 1 | map_allow;
+	} else {
+		if (cookie & COOKIE_VALUE_FREEZED)
+			return cookie;
+		map_allow = cookie & _MAP_MARK_MASK;
+		cookie &= ~_MAP_MARK_MASK;
+		switch (lookup) {
+		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOTDOT:
+			cookie--;
+			break;
+		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOT:
+			break;
+		default:
+			/* ignore _MAP_MARK_MASK overflow in this example */
+			cookie++;
+			break;
+		}
+		if (cookie >= 1)
+			cookie |= map_allow;
+	}
+	/* do not modify the cookie for each fs_pick */
+	if (freeze && cookie)
+		cookie |= COOKIE_VALUE_FREEZED;
+	return cookie;
+}
+
+SEC("landlock1")
+int fs_walk(struct landlock_ctx_fs_walk *ctx)
+{
+	ctx->cookie = update_cookie(ctx->cookie, ctx->inode_lookup,
+			(void *)ctx->inode, (void *)ctx->chain, false);
+	return LANDLOCK_RET_ALLOW;
+}

The program "landlock1" is called for every directory execution (except
the last one if it is the leaf of a path). This enables to identify a
file hierarchy with only a (one dimension) list of file descriptors
(i.e. inode_map).

Underneath, the Landlock LSM part looks if there is an associated path
walk (nameidata) with each inode access request. If there is one, then
the cookie associated with the path walk (if any) is made available
through the eBPF program context. This enables to develop a state
machine with an eBPF program to "evaluate" a file path (without string
parsing).

The goal with this chaining mechanism is to be able to express a complex
kernel object like a file, with multiple run of one or more eBPF
programs, as a multilayer evaluation. This semantic may only make sense
for the user/developer and his security policy. We must keep in mind
that this object identification should be available to unprivileged
processes. This means that we must be very careful to what kind of
information are available to an eBPF program because this can then leak
to a process (e.g. through a map). With this mechanism, only information
already available to user space is available to the eBPF program.

In this example, the complexity of the path evaluation is in the eBPF
program. We can then keep the kernel code more simple and generic. This
enables more flexibility for a security policy definition.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 05/11] seccomp,landlock: Enforce Landlock programs per process hierarchy
  2018-04-08 22:01                     ` [PATCH bpf-next v8 05/11] seccomp,landlock: Enforce Landlock programs per process hierarchy Mickaël Salaün
@ 2018-04-10  4:48                       ` Alexei Starovoitov
  2018-04-11 22:18                         ` Mickaël Salaün
  0 siblings, 1 reply; 10+ messages in thread
From: Alexei Starovoitov @ 2018-04-10  4:48 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Andy Lutomirski, Daniel Borkmann, LKML, Alexei Starovoitov,
	Arnaldo Carvalho de Melo, Casey Schaufler, David Drysdale,
	David S . Miller, Eric W . Biederman, Jann Horn, Jonathan Corbet,
	Michael Kerrisk, Kees Cook, Paul Moore, Sargun Dhillon,
	Serge E . Hallyn, Shuah Khan, Tejun Heo, Thomas Graf,
	Tycho Andersen, Will Drewry, Kernel Hardening, Linux API,
	LSM List, Network Development, Andrew Morton, Al Viro,
	Linux-Fsdevel

On Mon, Apr 09, 2018 at 12:01:59AM +0200, Mickaï¿½l Salaï¿½n wrote:
> 
> On 04/08/2018 11:06 PM, Andy Lutomirski wrote:
> > On Sun, Apr 8, 2018 at 6:13 AM, Mickaï¿½l Salaï¿½n <mic@digikod.net> wrote:
> >>
> >> On 02/27/2018 10:48 PM, Mickaï¿½l Salaï¿½n wrote:
> >>>
> >>> On 27/02/2018 17:39, Andy Lutomirski wrote:
> >>>> On Tue, Feb 27, 2018 at 5:32 AM, Alexei Starovoitov
> >>>> <alexei.starovoitov@gmail.com> wrote:
> >>>>> On Tue, Feb 27, 2018 at 05:20:55AM +0000, Andy Lutomirski wrote:
> >>>>>> On Tue, Feb 27, 2018 at 4:54 AM, Alexei Starovoitov
> >>>>>> <alexei.starovoitov@gmail.com> wrote:
> >>>>>>> On Tue, Feb 27, 2018 at 04:40:34AM +0000, Andy Lutomirski wrote:
> >>>>>>>> On Tue, Feb 27, 2018 at 2:08 AM, Alexei Starovoitov
> >>>>>>>> <alexei.starovoitov@gmail.com> wrote:
> >>>>>>>>> On Tue, Feb 27, 2018 at 01:41:15AM +0100, Mickaï¿½l Salaï¿½n wrote:
> >>>>>>>>>> The seccomp(2) syscall can be used by a task to apply a Landlock program
> >>>>>>>>>> to itself. As a seccomp filter, a Landlock program is enforced for the
> >>>>>>>>>> current task and all its future children. A program is immutable and a
> >>>>>>>>>> task can only add new restricting programs to itself, forming a list of
> >>>>>>>>>> programss.
> >>>>>>>>>>
> >>>>>>>>>> A Landlock program is tied to a Landlock hook. If the action on a kernel
> >>>>>>>>>> object is allowed by the other Linux security mechanisms (e.g. DAC,
> >>>>>>>>>> capabilities, other LSM), then a Landlock hook related to this kind of
> >>>>>>>>>> object is triggered. The list of programs for this hook is then
> >>>>>>>>>> evaluated. Each program return a 32-bit value which can deny the action
> >>>>>>>>>> on a kernel object with a non-zero value. If every programs of the list
> >>>>>>>>>> return zero, then the action on the object is allowed.
> >>>>>>>>>>
> >>>>>>>>>> Multiple Landlock programs can be chained to share a 64-bits value for a
> >>>>>>>>>> call chain (e.g. evaluating multiple elements of a file path).  This
> >>>>>>>>>> chaining is restricted when a process construct this chain by loading a
> >>>>>>>>>> program, but additional checks are performed when it requests to apply
> >>>>>>>>>> this chain of programs to itself.  The restrictions ensure that it is
> >>>>>>>>>> not possible to call multiple programs in a way that would imply to
> >>>>>>>>>> handle multiple shared values (i.e. cookies) for one chain.  For now,
> >>>>>>>>>> only a fs_pick program can be chained to the same type of program,
> >>>>>>>>>> because it may make sense if they have different triggers (cf. next
> >>>>>>>>>> commits).  This restrictions still allows to reuse Landlock programs in
> >>>>>>>>>> a safe way (e.g. use the same loaded fs_walk program with multiple
> >>>>>>>>>> chains of fs_pick programs).
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Mickaï¿½l Salaï¿½n <mic@digikod.net>
> >>>>>>>>>
> >>>>>>>>> ...
> >>>>>>>>>
> >>>>>>>>>> +struct landlock_prog_set *landlock_prepend_prog(
> >>>>>>>>>> +             struct landlock_prog_set *current_prog_set,
> >>>>>>>>>> +             struct bpf_prog *prog)
> >>>>>>>>>> +{
> >>>>>>>>>> +     struct landlock_prog_set *new_prog_set = current_prog_set;
> >>>>>>>>>> +     unsigned long pages;
> >>>>>>>>>> +     int err;
> >>>>>>>>>> +     size_t i;
> >>>>>>>>>> +     struct landlock_prog_set tmp_prog_set = {};
> >>>>>>>>>> +
> >>>>>>>>>> +     if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
> >>>>>>>>>> +             return ERR_PTR(-EINVAL);
> >>>>>>>>>> +
> >>>>>>>>>> +     /* validate memory size allocation */
> >>>>>>>>>> +     pages = prog->pages;
> >>>>>>>>>> +     if (current_prog_set) {
> >>>>>>>>>> +             size_t i;
> >>>>>>>>>> +
> >>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(current_prog_set->programs); i++) {
> >>>>>>>>>> +                     struct landlock_prog_list *walker_p;
> >>>>>>>>>> +
> >>>>>>>>>> +                     for (walker_p = current_prog_set->programs[i];
> >>>>>>>>>> +                                     walker_p; walker_p = walker_p->prev)
> >>>>>>>>>> +                             pages += walker_p->prog->pages;
> >>>>>>>>>> +             }
> >>>>>>>>>> +             /* count a struct landlock_prog_set if we need to allocate one */
> >>>>>>>>>> +             if (refcount_read(&current_prog_set->usage) != 1)
> >>>>>>>>>> +                     pages += round_up(sizeof(*current_prog_set), PAGE_SIZE)
> >>>>>>>>>> +                             / PAGE_SIZE;
> >>>>>>>>>> +     }
> >>>>>>>>>> +     if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
> >>>>>>>>>> +             return ERR_PTR(-E2BIG);
> >>>>>>>>>> +
> >>>>>>>>>> +     /* ensure early that we can allocate enough memory for the new
> >>>>>>>>>> +      * prog_lists */
> >>>>>>>>>> +     err = store_landlock_prog(&tmp_prog_set, current_prog_set, prog);
> >>>>>>>>>> +     if (err)
> >>>>>>>>>> +             return ERR_PTR(err);
> >>>>>>>>>> +
> >>>>>>>>>> +     /*
> >>>>>>>>>> +      * Each task_struct points to an array of prog list pointers.  These
> >>>>>>>>>> +      * tables are duplicated when additions are made (which means each
> >>>>>>>>>> +      * table needs to be refcounted for the processes using it). When a new
> >>>>>>>>>> +      * table is created, all the refcounters on the prog_list are bumped (to
> >>>>>>>>>> +      * track each table that references the prog). When a new prog is
> >>>>>>>>>> +      * added, it's just prepended to the list for the new table to point
> >>>>>>>>>> +      * at.
> >>>>>>>>>> +      *
> >>>>>>>>>> +      * Manage all the possible errors before this step to not uselessly
> >>>>>>>>>> +      * duplicate current_prog_set and avoid a rollback.
> >>>>>>>>>> +      */
> >>>>>>>>>> +     if (!new_prog_set) {
> >>>>>>>>>> +             /*
> >>>>>>>>>> +              * If there is no Landlock program set used by the current task,
> >>>>>>>>>> +              * then create a new one.
> >>>>>>>>>> +              */
> >>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
> >>>>>>>>>> +             if (IS_ERR(new_prog_set))
> >>>>>>>>>> +                     goto put_tmp_lists;
> >>>>>>>>>> +     } else if (refcount_read(&current_prog_set->usage) > 1) {
> >>>>>>>>>> +             /*
> >>>>>>>>>> +              * If the current task is not the sole user of its Landlock
> >>>>>>>>>> +              * program set, then duplicate them.
> >>>>>>>>>> +              */
> >>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
> >>>>>>>>>> +             if (IS_ERR(new_prog_set))
> >>>>>>>>>> +                     goto put_tmp_lists;
> >>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(new_prog_set->programs); i++) {
> >>>>>>>>>> +                     new_prog_set->programs[i] =
> >>>>>>>>>> +                             READ_ONCE(current_prog_set->programs[i]);
> >>>>>>>>>> +                     if (new_prog_set->programs[i])
> >>>>>>>>>> +                             refcount_inc(&new_prog_set->programs[i]->usage);
> >>>>>>>>>> +             }
> >>>>>>>>>> +
> >>>>>>>>>> +             /*
> >>>>>>>>>> +              * Landlock program set from the current task will not be freed
> >>>>>>>>>> +              * here because the usage is strictly greater than 1. It is
> >>>>>>>>>> +              * only prevented to be freed by another task thanks to the
> >>>>>>>>>> +              * caller of landlock_prepend_prog() which should be locked if
> >>>>>>>>>> +              * needed.
> >>>>>>>>>> +              */
> >>>>>>>>>> +             landlock_put_prog_set(current_prog_set);
> >>>>>>>>>> +     }
> >>>>>>>>>> +
> >>>>>>>>>> +     /* prepend tmp_prog_set to new_prog_set */
> >>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++) {
> >>>>>>>>>> +             /* get the last new list */
> >>>>>>>>>> +             struct landlock_prog_list *last_list =
> >>>>>>>>>> +                     tmp_prog_set.programs[i];
> >>>>>>>>>> +
> >>>>>>>>>> +             if (last_list) {
> >>>>>>>>>> +                     while (last_list->prev)
> >>>>>>>>>> +                             last_list = last_list->prev;
> >>>>>>>>>> +                     /* no need to increment usage (pointer replacement) */
> >>>>>>>>>> +                     last_list->prev = new_prog_set->programs[i];
> >>>>>>>>>> +                     new_prog_set->programs[i] = tmp_prog_set.programs[i];
> >>>>>>>>>> +             }
> >>>>>>>>>> +     }
> >>>>>>>>>> +     new_prog_set->chain_last = tmp_prog_set.chain_last;
> >>>>>>>>>> +     return new_prog_set;
> >>>>>>>>>> +
> >>>>>>>>>> +put_tmp_lists:
> >>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++)
> >>>>>>>>>> +             put_landlock_prog_list(tmp_prog_set.programs[i]);
> >>>>>>>>>> +     return new_prog_set;
> >>>>>>>>>> +}
> >>>>>>>>>
> >>>>>>>>> Nack on the chaining concept.
> >>>>>>>>> Please do not reinvent the wheel.
> >>>>>>>>> There is an existing mechanism for attaching/detaching/quering multiple
> >>>>>>>>> programs attached to cgroup and tracing hooks that are also
> >>>>>>>>> efficiently executed via BPF_PROG_RUN_ARRAY.
> >>>>>>>>> Please use that instead.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> I don't see how that would help.  Suppose you add a filter, then
> >>>>>>>> fork(), and then the child adds another filter.  Do you want to
> >>>>>>>> duplicate the entire array?  You certainly can't *modify* the array
> >>>>>>>> because you'll affect processes that shouldn't be affected.
> >>>>>>>>
> >>>>>>>> In contrast, doing this through seccomp like the earlier patches
> >>>>>>>> seemed just fine to me, and seccomp already had the right logic.
> >>>>>>>
> >>>>>>> it doesn't look to me that existing seccomp side of managing fork
> >>>>>>> situation can be reused. Here there is an attempt to add 'chaining'
> >>>>>>> concept which sort of an extension of existing seccomp style,
> >>>>>>> but somehow heavily done on bpf side and contradicts cgroup/tracing.
> >>>>>>>
> >>>>>>
> >>>>>> I don't see why the seccomp way can't be used.  I agree with you that
> >>>>>> the seccomp *style* shouldn't be used in bpf code like this, but I
> >>>>>> think that Landlock programs can and should just live in the existing
> >>>>>> seccomp chain.  If the existing seccomp code needs some modification
> >>>>>> to make this work, then so be it.
> >>>>>
> >>>>> +1
> >>>>> if that was the case...
> >>>>> but that's not my reading of the patch set.
> >>>>
> >>>> An earlier version of the patch set used the seccomp filter chain.
> >>>> Mickaï¿½l, what exactly was wrong with that approach other than that the
> >>>> seccomp() syscall was awkward for you to use?  You could add a
> >>>> seccomp_add_landlock_rule() syscall if you needed to.
> >>>
> >>> Nothing was wrong about about that, this part did not changed (see my
> >>> next comment).
> >>>
> >>>>
> >>>> As a side comment, why is this an LSM at all, let alone a non-stacking
> >>>> LSM?  It would make a lot more sense to me to make Landlock depend on
> >>>> having LSMs configured in but to call the landlock hooks directly from
> >>>> the security_xyz() hooks.
> >>>
> >>> See Casey's answer and his patch series: https://lwn.net/Articles/741963/
> >>>
> >>>>
> >>>>>
> >>>>>> In other words, the kernel already has two kinds of chaining:
> >>>>>> seccomp's and bpf's.  bpf's doesn't work right for this type of usage
> >>>>>> across fork(), whereas seccomp's already handles that case correctly.
> >>>>>> (In contrast, seccomp's is totally wrong for cgroup-attached filters.)
> >>>>>>  So IMO Landlock should use the seccomp core code and call into bpf
> >>>>>> for the actual filtering.
> >>>>>
> >>>>> +1
> >>>>> in cgroup we had to invent this new BPF_PROG_RUN_ARRAY mechanism,
> >>>>> since cgroup hierarchy can be complicated with bpf progs attached
> >>>>> at different levels with different override/multiprog properties,
> >>>>> so walking link list and checking all flags at run-time would have
> >>>>> been too slow. That's why we added compute_effective_progs().
> >>>>
> >>>> If we start adding override flags to Landlock, I think we're doing it
> >>>> wrong.   With cgroup bpf programs, the whole mess is set up by the
> >>>> administrator.  With seccomp, and with Landlock if done correctly, it
> >>>> *won't* be set up by the administrator, so the chance that everyone
> >>>> gets all the flags right is about zero.  All attached filters should
> >>>> run unconditionally.
> >>>
> >>>
> >>> There is a misunderstanding about this chaining mechanism. This should
> >>> not be confused with the list of seccomp filters nor the cgroup
> >>> hierarchies. Landlock programs can be stacked the same way seccomp's
> >>> filters can (cf. struct landlock_prog_set, the "chain_last" field is an
> >>> optimization which is not used for this struct handling). This stackable
> >>> property did not changed from the previous patch series. The chaining
> >>> mechanism is for another use case, which does not make sense for seccomp
> >>> filters nor other eBPF program types, at least for now, from what I can
> >>> tell.
> >>>
> >>> You may want to get a look at my talk at FOSDEM
> >>> (https://landlock.io/talks/2018-02-04_landlock-fosdem.pdf), especially
> >>> slides 11 and 12.
> >>>
> >>> Let me explain my reasoning about this program chaining thing.
> >>>
> >>> To check if an action on a file is allowed, we first need to identify
> >>> this file and match it to the security policy. In a previous
> >>> (non-public) patch series, I tried to use one type of eBPF program to
> >>> check every kind of access to a file. To be able to identify a file, I
> >>> relied on an eBPF map, similar to the current inode map. This map store
> >>> a set of references to file descriptors. I then created a function
> >>> bpf_is_file_beneath() to check if the requested file was beneath a file
> >>> in the map. This way, no chaining, only one eBPF program type to check
> >>> an access to a file... but some issues then emerged. First, this design
> >>> create a side-channel which help an attacker using such a program to
> >>> infer some information not normally available, for example to get a hint
> >>> on where a file descriptor (received from a UNIX socket) come from.
> >>> Another issue is that this type of program would be called for each
> >>> component of a path. Indeed, when the kernel check if an access to a
> >>> file is allowed, it walk through all of the directories in its path
> >>> (checking if the current process is allowed to execute them). That first
> >>> attempt led me to rethink the way we could filter an access to a file
> >>> *path*.
> >>>
> >>> To minimize the number of called to an eBPF program dedicated to
> >>> validate an access to a file path, I decided to create three subtype of
> >>> eBPF programs. The FS_WALK type is called when walking through every
> >>> directory of a file path (except the last one if it is the target). We
> >>> can then restrict this type of program to the minimum set of functions
> >>> it is allowed to call and the minimum set of data available from its
> >>> context. The first implicit chaining is for this type of program. To be
> >>> able to evaluate a path while being called for all its components, this
> >>> program need to store a state (to remember what was the parent directory
> >>> of this path). There is no "previous" field in the subtype for this
> >>> program because it is chained with itself, for each directories. This
> >>> enable to create a FS_WALK program to evaluate a file hierarchy, thank
> >>> to the inode map which can be used to check if a directory of this
> >>> hierarchy is part of an allowed (or denied) list of directories. This
> >>> design enables to express a file hierarchy in a programmatic way,
> >>> without requiring an eBPF helper to do the job (unlike my first experiment).
> >>>
> >>> The explicit chaining is used to tied a path evaluation (with a FS_WALK
> >>> program) to an access to the actual file being requested (the last
> >>> component of a file path), with a FS_PICK program. It is only at this
> >>> time that the kernel check for the requested action (e.g. read, write,
> >>> chdir, append...). To be able to filter such access request we can have
> >>> one call to the same program for every action and let this program check
> >>> for which action it was called. However, this design does not allow the
> >>> kernel to know if the current action is indeed handled by this program.
> >>> Hence, it is not possible to implement a cache mechanism to only call
> >>> this program if it knows how to handle this action.
> >>>
> >>> The approach I took for this FS_PICK type of program is to add to its
> >>> subtype which action it can handle (with the "triggers" bitfield, seen
> >>> as ORed actions). This way, the kernel knows if a call to a FS_PICK
> >>> program is necessary. If the user wants to enforce a different security
> >>> policy according to the action requested on a file, then it needs
> >>> multiple FS_PICK programs. However, to reduce the number of such
> >>> programs, this patch series allow a FS_PICK program to be chained with
> >>> another, the same way a FS_WALK is chained with itself. This way, if the
> >>> user want to check if the action is a for example an "open" and a "read"
> >>> and not a "map" and a "read", then it can chain multiple FS_PICK
> >>> programs with different triggers actions. The OR check performed by the
> >>> kernel is not a limitation then, only a way to know if a call to an eBPF
> >>> program is needed.
> >>>
> >>> The last type of program is FS_GET. This one is called when a process
> >>> get a struct file or change its working directory. This is the only
> >>> program type able (and allowed) to tag a file. This restriction is
> >>> important to not being subject to resource exhaustion attacks (i.e.
> >>> tagging every inode accessible to an attacker, which would allocate too
> >>> much kernel memory).
> >>>
> >>> This design gives room for improvements to create a cache of eBPF
> >>> context (input data, including maps if any), with the result of an eBPF
> >>> program. This would help limit the number of call to an eBPF program the
> >>> same way SELinux or other kernel components do to limit costly checks.
> >>>
> >>> The eBPF maps of progs are useful to call the same type of eBPF
> >>> program. It does not fit with this use case because we may want multiple
> >>> eBPF program according to the action requested on a kernel object (e.g.
> >>> FS_GET). The other reason is because the eBPF program does not know what
> >>> will be the next (type of) access check performed by the kernel.
> >>>
> >>> To say it another way, this chaining mechanism is a way to split a
> >>> kernel object evaluation with multiple specialized programs, each of
> >>> them being able to deal with data tied to their type. Using a monolithic
> >>> eBPF program to check everything does not scale and does not fit with
> >>> unprivileged use either.
> >>>
> >>> As a side note, the cookie value is only an ephemeral value to keep a
> >>> state between multiple programs call. It can be used to create a state
> >>> machine for an object evaluation.
> >>>
> >>> I don't see a way to do an efficient and programmatic path evaluation,
> >>> with different access checks, with the current eBPF features. Please let
> >>> me know if you know how to do it another way.
> >>>
> >>
> >> Andy, Alexei, Daniel, what do you think about this Landlock program
> >> chaining and cookie?
> >>
> > 
> > Can you give a small pseudocode real world example that acutally needs
> > chaining?  The mechanism is quite complicated and I'd like to
> > understand how it'll be used.
> > 
> 
> Here is the interesting part from the example (patch 09/11):
> 
> +SEC("maps")
> +struct bpf_map_def inode_map = {
> +	.type = BPF_MAP_TYPE_INODE,
> +	.key_size = sizeof(u32),
> +	.value_size = sizeof(u64),
> +	.max_entries = 20,
> +};
> +
> +SEC("subtype/landlock1")
> +static union bpf_prog_subtype _subtype1 = {
> +	.landlock_hook = {
> +		.type = LANDLOCK_HOOK_FS_WALK,
> +	}
> +};
> +
> +static __always_inline __u64 update_cookie(__u64 cookie, __u8 lookup,
> +		void *inode, void *chain, bool freeze)
> +{
> +	__u64 map_allow = 0;
> +
> +	if (cookie == 0) {
> +		cookie = bpf_inode_get_tag(inode, chain);
> +		if (cookie)
> +			return cookie;
> +		/* only look for the first match in the map, ignore nested
> +		 * paths in this example */
> +		map_allow = bpf_inode_map_lookup(&inode_map, inode);
> +		if (map_allow)
> +			cookie = 1 | map_allow;
> +	} else {
> +		if (cookie & COOKIE_VALUE_FREEZED)
> +			return cookie;
> +		map_allow = cookie & _MAP_MARK_MASK;
> +		cookie &= ~_MAP_MARK_MASK;
> +		switch (lookup) {
> +		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOTDOT:
> +			cookie--;
> +			break;
> +		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOT:
> +			break;
> +		default:
> +			/* ignore _MAP_MARK_MASK overflow in this example */
> +			cookie++;
> +			break;
> +		}
> +		if (cookie >= 1)
> +			cookie |= map_allow;
> +	}
> +	/* do not modify the cookie for each fs_pick */
> +	if (freeze && cookie)
> +		cookie |= COOKIE_VALUE_FREEZED;
> +	return cookie;
> +}
> +
> +SEC("landlock1")
> +int fs_walk(struct landlock_ctx_fs_walk *ctx)
> +{
> +	ctx->cookie = update_cookie(ctx->cookie, ctx->inode_lookup,
> +			(void *)ctx->inode, (void *)ctx->chain, false);
> +	return LANDLOCK_RET_ALLOW;
> +}
> 
> The program "landlock1" is called for every directory execution (except
> the last one if it is the leaf of a path). This enables to identify a
> file hierarchy with only a (one dimension) list of file descriptors
> (i.e. inode_map).
> 
> Underneath, the Landlock LSM part looks if there is an associated path
> walk (nameidata) with each inode access request. If there is one, then
> the cookie associated with the path walk (if any) is made available
> through the eBPF program context. This enables to develop a state
> machine with an eBPF program to "evaluate" a file path (without string
> parsing).
> 
> The goal with this chaining mechanism is to be able to express a complex
> kernel object like a file, with multiple run of one or more eBPF
> programs, as a multilayer evaluation. This semantic may only make sense
> for the user/developer and his security policy. We must keep in mind
> that this object identification should be available to unprivileged
> processes. This means that we must be very careful to what kind of
> information are available to an eBPF program because this can then leak
> to a process (e.g. through a map). With this mechanism, only information
> already available to user space is available to the eBPF program.
> 
> In this example, the complexity of the path evaluation is in the eBPF
> program. We can then keep the kernel code more simple and generic. This
> enables more flexibility for a security policy definition.

it all sounds correct on paper, but it's pretty novel
approach and I'm not sure I see all the details in the patch.
When people say "inode" they most of the time mean inode integer number,
whereas in this patch do you mean a raw pointer to in-kernel
'struct inode' ?
To avoid confusion it should probably be called differently.

If you meant inode as a number then why inode only?
where is superblock, device, mount point?
How bpf side can compare inodes without this additional info?
How bpf side will know what inode to compare to?
What if inode number is reused?
This approach is an optimization to compare inodes
instead of strings passed into sys_open ?

If you meant inode as a pointer how bpf side will
know the pointer before the walk begins?
What guarantees that it's not a stale pointer?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf-next v8 05/11] seccomp,landlock: Enforce Landlock programs per process hierarchy
  2018-04-10  4:48                       ` Alexei Starovoitov
@ 2018-04-11 22:18                         ` Mickaël Salaün
  0 siblings, 0 replies; 10+ messages in thread
From: Mickaël Salaün @ 2018-04-11 22:18 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Daniel Borkmann, LKML, Alexei Starovoitov,
	Arnaldo Carvalho de Melo, Casey Schaufler, David Drysdale,
	David S . Miller, Eric W . Biederman, Jann Horn, Jonathan Corbet,
	Michael Kerrisk, Kees Cook, Paul Moore, Sargun Dhillon,
	Serge E . Hallyn, Shuah Khan, Tejun Heo, Thomas Graf,
	Tycho Andersen, Will Drewry, Kernel Hardening, Linux API,
	LSM List, Network Development, Andrew Morton, Al Viro,
	Linux-Fsdevel


[-- Attachment #1.1: Type: text/plain, Size: 25891 bytes --]


On 04/10/2018 06:48 AM, Alexei Starovoitov wrote:
> On Mon, Apr 09, 2018 at 12:01:59AM +0200, Mickaël Salaün wrote:
>>
>> On 04/08/2018 11:06 PM, Andy Lutomirski wrote:
>>> On Sun, Apr 8, 2018 at 6:13 AM, Mickaël Salaün <mic@digikod.net> wrote:
>>>>
>>>> On 02/27/2018 10:48 PM, Mickaël Salaün wrote:
>>>>>
>>>>> On 27/02/2018 17:39, Andy Lutomirski wrote:
>>>>>> On Tue, Feb 27, 2018 at 5:32 AM, Alexei Starovoitov
>>>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>>> On Tue, Feb 27, 2018 at 05:20:55AM +0000, Andy Lutomirski wrote:
>>>>>>>> On Tue, Feb 27, 2018 at 4:54 AM, Alexei Starovoitov
>>>>>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>>>>> On Tue, Feb 27, 2018 at 04:40:34AM +0000, Andy Lutomirski wrote:
>>>>>>>>>> On Tue, Feb 27, 2018 at 2:08 AM, Alexei Starovoitov
>>>>>>>>>> <alexei.starovoitov@gmail.com> wrote:
>>>>>>>>>>> On Tue, Feb 27, 2018 at 01:41:15AM +0100, Mickaël Salaün wrote:
>>>>>>>>>>>> The seccomp(2) syscall can be used by a task to apply a Landlock program
>>>>>>>>>>>> to itself. As a seccomp filter, a Landlock program is enforced for the
>>>>>>>>>>>> current task and all its future children. A program is immutable and a
>>>>>>>>>>>> task can only add new restricting programs to itself, forming a list of
>>>>>>>>>>>> programss.
>>>>>>>>>>>>
>>>>>>>>>>>> A Landlock program is tied to a Landlock hook. If the action on a kernel
>>>>>>>>>>>> object is allowed by the other Linux security mechanisms (e.g. DAC,
>>>>>>>>>>>> capabilities, other LSM), then a Landlock hook related to this kind of
>>>>>>>>>>>> object is triggered. The list of programs for this hook is then
>>>>>>>>>>>> evaluated. Each program return a 32-bit value which can deny the action
>>>>>>>>>>>> on a kernel object with a non-zero value. If every programs of the list
>>>>>>>>>>>> return zero, then the action on the object is allowed.
>>>>>>>>>>>>
>>>>>>>>>>>> Multiple Landlock programs can be chained to share a 64-bits value for a
>>>>>>>>>>>> call chain (e.g. evaluating multiple elements of a file path).  This
>>>>>>>>>>>> chaining is restricted when a process construct this chain by loading a
>>>>>>>>>>>> program, but additional checks are performed when it requests to apply
>>>>>>>>>>>> this chain of programs to itself.  The restrictions ensure that it is
>>>>>>>>>>>> not possible to call multiple programs in a way that would imply to
>>>>>>>>>>>> handle multiple shared values (i.e. cookies) for one chain.  For now,
>>>>>>>>>>>> only a fs_pick program can be chained to the same type of program,
>>>>>>>>>>>> because it may make sense if they have different triggers (cf. next
>>>>>>>>>>>> commits).  This restrictions still allows to reuse Landlock programs in
>>>>>>>>>>>> a safe way (e.g. use the same loaded fs_walk program with multiple
>>>>>>>>>>>> chains of fs_pick programs).
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>>>>>>>>>>>
>>>>>>>>>>> ...
>>>>>>>>>>>
>>>>>>>>>>>> +struct landlock_prog_set *landlock_prepend_prog(
>>>>>>>>>>>> +             struct landlock_prog_set *current_prog_set,
>>>>>>>>>>>> +             struct bpf_prog *prog)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +     struct landlock_prog_set *new_prog_set = current_prog_set;
>>>>>>>>>>>> +     unsigned long pages;
>>>>>>>>>>>> +     int err;
>>>>>>>>>>>> +     size_t i;
>>>>>>>>>>>> +     struct landlock_prog_set tmp_prog_set = {};
>>>>>>>>>>>> +
>>>>>>>>>>>> +     if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
>>>>>>>>>>>> +             return ERR_PTR(-EINVAL);
>>>>>>>>>>>> +
>>>>>>>>>>>> +     /* validate memory size allocation */
>>>>>>>>>>>> +     pages = prog->pages;
>>>>>>>>>>>> +     if (current_prog_set) {
>>>>>>>>>>>> +             size_t i;
>>>>>>>>>>>> +
>>>>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(current_prog_set->programs); i++) {
>>>>>>>>>>>> +                     struct landlock_prog_list *walker_p;
>>>>>>>>>>>> +
>>>>>>>>>>>> +                     for (walker_p = current_prog_set->programs[i];
>>>>>>>>>>>> +                                     walker_p; walker_p = walker_p->prev)
>>>>>>>>>>>> +                             pages += walker_p->prog->pages;
>>>>>>>>>>>> +             }
>>>>>>>>>>>> +             /* count a struct landlock_prog_set if we need to allocate one */
>>>>>>>>>>>> +             if (refcount_read(&current_prog_set->usage) != 1)
>>>>>>>>>>>> +                     pages += round_up(sizeof(*current_prog_set), PAGE_SIZE)
>>>>>>>>>>>> +                             / PAGE_SIZE;
>>>>>>>>>>>> +     }
>>>>>>>>>>>> +     if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
>>>>>>>>>>>> +             return ERR_PTR(-E2BIG);
>>>>>>>>>>>> +
>>>>>>>>>>>> +     /* ensure early that we can allocate enough memory for the new
>>>>>>>>>>>> +      * prog_lists */
>>>>>>>>>>>> +     err = store_landlock_prog(&tmp_prog_set, current_prog_set, prog);
>>>>>>>>>>>> +     if (err)
>>>>>>>>>>>> +             return ERR_PTR(err);
>>>>>>>>>>>> +
>>>>>>>>>>>> +     /*
>>>>>>>>>>>> +      * Each task_struct points to an array of prog list pointers.  These
>>>>>>>>>>>> +      * tables are duplicated when additions are made (which means each
>>>>>>>>>>>> +      * table needs to be refcounted for the processes using it). When a new
>>>>>>>>>>>> +      * table is created, all the refcounters on the prog_list are bumped (to
>>>>>>>>>>>> +      * track each table that references the prog). When a new prog is
>>>>>>>>>>>> +      * added, it's just prepended to the list for the new table to point
>>>>>>>>>>>> +      * at.
>>>>>>>>>>>> +      *
>>>>>>>>>>>> +      * Manage all the possible errors before this step to not uselessly
>>>>>>>>>>>> +      * duplicate current_prog_set and avoid a rollback.
>>>>>>>>>>>> +      */
>>>>>>>>>>>> +     if (!new_prog_set) {
>>>>>>>>>>>> +             /*
>>>>>>>>>>>> +              * If there is no Landlock program set used by the current task,
>>>>>>>>>>>> +              * then create a new one.
>>>>>>>>>>>> +              */
>>>>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
>>>>>>>>>>>> +             if (IS_ERR(new_prog_set))
>>>>>>>>>>>> +                     goto put_tmp_lists;
>>>>>>>>>>>> +     } else if (refcount_read(&current_prog_set->usage) > 1) {
>>>>>>>>>>>> +             /*
>>>>>>>>>>>> +              * If the current task is not the sole user of its Landlock
>>>>>>>>>>>> +              * program set, then duplicate them.
>>>>>>>>>>>> +              */
>>>>>>>>>>>> +             new_prog_set = new_landlock_prog_set();
>>>>>>>>>>>> +             if (IS_ERR(new_prog_set))
>>>>>>>>>>>> +                     goto put_tmp_lists;
>>>>>>>>>>>> +             for (i = 0; i < ARRAY_SIZE(new_prog_set->programs); i++) {
>>>>>>>>>>>> +                     new_prog_set->programs[i] =
>>>>>>>>>>>> +                             READ_ONCE(current_prog_set->programs[i]);
>>>>>>>>>>>> +                     if (new_prog_set->programs[i])
>>>>>>>>>>>> +                             refcount_inc(&new_prog_set->programs[i]->usage);
>>>>>>>>>>>> +             }
>>>>>>>>>>>> +
>>>>>>>>>>>> +             /*
>>>>>>>>>>>> +              * Landlock program set from the current task will not be freed
>>>>>>>>>>>> +              * here because the usage is strictly greater than 1. It is
>>>>>>>>>>>> +              * only prevented to be freed by another task thanks to the
>>>>>>>>>>>> +              * caller of landlock_prepend_prog() which should be locked if
>>>>>>>>>>>> +              * needed.
>>>>>>>>>>>> +              */
>>>>>>>>>>>> +             landlock_put_prog_set(current_prog_set);
>>>>>>>>>>>> +     }
>>>>>>>>>>>> +
>>>>>>>>>>>> +     /* prepend tmp_prog_set to new_prog_set */
>>>>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++) {
>>>>>>>>>>>> +             /* get the last new list */
>>>>>>>>>>>> +             struct landlock_prog_list *last_list =
>>>>>>>>>>>> +                     tmp_prog_set.programs[i];
>>>>>>>>>>>> +
>>>>>>>>>>>> +             if (last_list) {
>>>>>>>>>>>> +                     while (last_list->prev)
>>>>>>>>>>>> +                             last_list = last_list->prev;
>>>>>>>>>>>> +                     /* no need to increment usage (pointer replacement) */
>>>>>>>>>>>> +                     last_list->prev = new_prog_set->programs[i];
>>>>>>>>>>>> +                     new_prog_set->programs[i] = tmp_prog_set.programs[i];
>>>>>>>>>>>> +             }
>>>>>>>>>>>> +     }
>>>>>>>>>>>> +     new_prog_set->chain_last = tmp_prog_set.chain_last;
>>>>>>>>>>>> +     return new_prog_set;
>>>>>>>>>>>> +
>>>>>>>>>>>> +put_tmp_lists:
>>>>>>>>>>>> +     for (i = 0; i < ARRAY_SIZE(tmp_prog_set.programs); i++)
>>>>>>>>>>>> +             put_landlock_prog_list(tmp_prog_set.programs[i]);
>>>>>>>>>>>> +     return new_prog_set;
>>>>>>>>>>>> +}
>>>>>>>>>>>
>>>>>>>>>>> Nack on the chaining concept.
>>>>>>>>>>> Please do not reinvent the wheel.
>>>>>>>>>>> There is an existing mechanism for attaching/detaching/quering multiple
>>>>>>>>>>> programs attached to cgroup and tracing hooks that are also
>>>>>>>>>>> efficiently executed via BPF_PROG_RUN_ARRAY.
>>>>>>>>>>> Please use that instead.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I don't see how that would help.  Suppose you add a filter, then
>>>>>>>>>> fork(), and then the child adds another filter.  Do you want to
>>>>>>>>>> duplicate the entire array?  You certainly can't *modify* the array
>>>>>>>>>> because you'll affect processes that shouldn't be affected.
>>>>>>>>>>
>>>>>>>>>> In contrast, doing this through seccomp like the earlier patches
>>>>>>>>>> seemed just fine to me, and seccomp already had the right logic.
>>>>>>>>>
>>>>>>>>> it doesn't look to me that existing seccomp side of managing fork
>>>>>>>>> situation can be reused. Here there is an attempt to add 'chaining'
>>>>>>>>> concept which sort of an extension of existing seccomp style,
>>>>>>>>> but somehow heavily done on bpf side and contradicts cgroup/tracing.
>>>>>>>>>
>>>>>>>>
>>>>>>>> I don't see why the seccomp way can't be used.  I agree with you that
>>>>>>>> the seccomp *style* shouldn't be used in bpf code like this, but I
>>>>>>>> think that Landlock programs can and should just live in the existing
>>>>>>>> seccomp chain.  If the existing seccomp code needs some modification
>>>>>>>> to make this work, then so be it.
>>>>>>>
>>>>>>> +1
>>>>>>> if that was the case...
>>>>>>> but that's not my reading of the patch set.
>>>>>>
>>>>>> An earlier version of the patch set used the seccomp filter chain.
>>>>>> Mickaël, what exactly was wrong with that approach other than that the
>>>>>> seccomp() syscall was awkward for you to use?  You could add a
>>>>>> seccomp_add_landlock_rule() syscall if you needed to.
>>>>>
>>>>> Nothing was wrong about about that, this part did not changed (see my
>>>>> next comment).
>>>>>
>>>>>>
>>>>>> As a side comment, why is this an LSM at all, let alone a non-stacking
>>>>>> LSM?  It would make a lot more sense to me to make Landlock depend on
>>>>>> having LSMs configured in but to call the landlock hooks directly from
>>>>>> the security_xyz() hooks.
>>>>>
>>>>> See Casey's answer and his patch series: https://lwn.net/Articles/741963/
>>>>>
>>>>>>
>>>>>>>
>>>>>>>> In other words, the kernel already has two kinds of chaining:
>>>>>>>> seccomp's and bpf's.  bpf's doesn't work right for this type of usage
>>>>>>>> across fork(), whereas seccomp's already handles that case correctly.
>>>>>>>> (In contrast, seccomp's is totally wrong for cgroup-attached filters.)
>>>>>>>>  So IMO Landlock should use the seccomp core code and call into bpf
>>>>>>>> for the actual filtering.
>>>>>>>
>>>>>>> +1
>>>>>>> in cgroup we had to invent this new BPF_PROG_RUN_ARRAY mechanism,
>>>>>>> since cgroup hierarchy can be complicated with bpf progs attached
>>>>>>> at different levels with different override/multiprog properties,
>>>>>>> so walking link list and checking all flags at run-time would have
>>>>>>> been too slow. That's why we added compute_effective_progs().
>>>>>>
>>>>>> If we start adding override flags to Landlock, I think we're doing it
>>>>>> wrong.   With cgroup bpf programs, the whole mess is set up by the
>>>>>> administrator.  With seccomp, and with Landlock if done correctly, it
>>>>>> *won't* be set up by the administrator, so the chance that everyone
>>>>>> gets all the flags right is about zero.  All attached filters should
>>>>>> run unconditionally.
>>>>>
>>>>>
>>>>> There is a misunderstanding about this chaining mechanism. This should
>>>>> not be confused with the list of seccomp filters nor the cgroup
>>>>> hierarchies. Landlock programs can be stacked the same way seccomp's
>>>>> filters can (cf. struct landlock_prog_set, the "chain_last" field is an
>>>>> optimization which is not used for this struct handling). This stackable
>>>>> property did not changed from the previous patch series. The chaining
>>>>> mechanism is for another use case, which does not make sense for seccomp
>>>>> filters nor other eBPF program types, at least for now, from what I can
>>>>> tell.
>>>>>
>>>>> You may want to get a look at my talk at FOSDEM
>>>>> (https://landlock.io/talks/2018-02-04_landlock-fosdem.pdf), especially
>>>>> slides 11 and 12.
>>>>>
>>>>> Let me explain my reasoning about this program chaining thing.
>>>>>
>>>>> To check if an action on a file is allowed, we first need to identify
>>>>> this file and match it to the security policy. In a previous
>>>>> (non-public) patch series, I tried to use one type of eBPF program to
>>>>> check every kind of access to a file. To be able to identify a file, I
>>>>> relied on an eBPF map, similar to the current inode map. This map store
>>>>> a set of references to file descriptors. I then created a function
>>>>> bpf_is_file_beneath() to check if the requested file was beneath a file
>>>>> in the map. This way, no chaining, only one eBPF program type to check
>>>>> an access to a file... but some issues then emerged. First, this design
>>>>> create a side-channel which help an attacker using such a program to
>>>>> infer some information not normally available, for example to get a hint
>>>>> on where a file descriptor (received from a UNIX socket) come from.
>>>>> Another issue is that this type of program would be called for each
>>>>> component of a path. Indeed, when the kernel check if an access to a
>>>>> file is allowed, it walk through all of the directories in its path
>>>>> (checking if the current process is allowed to execute them). That first
>>>>> attempt led me to rethink the way we could filter an access to a file
>>>>> *path*.
>>>>>
>>>>> To minimize the number of called to an eBPF program dedicated to
>>>>> validate an access to a file path, I decided to create three subtype of
>>>>> eBPF programs. The FS_WALK type is called when walking through every
>>>>> directory of a file path (except the last one if it is the target). We
>>>>> can then restrict this type of program to the minimum set of functions
>>>>> it is allowed to call and the minimum set of data available from its
>>>>> context. The first implicit chaining is for this type of program. To be
>>>>> able to evaluate a path while being called for all its components, this
>>>>> program need to store a state (to remember what was the parent directory
>>>>> of this path). There is no "previous" field in the subtype for this
>>>>> program because it is chained with itself, for each directories. This
>>>>> enable to create a FS_WALK program to evaluate a file hierarchy, thank
>>>>> to the inode map which can be used to check if a directory of this
>>>>> hierarchy is part of an allowed (or denied) list of directories. This
>>>>> design enables to express a file hierarchy in a programmatic way,
>>>>> without requiring an eBPF helper to do the job (unlike my first experiment).
>>>>>
>>>>> The explicit chaining is used to tied a path evaluation (with a FS_WALK
>>>>> program) to an access to the actual file being requested (the last
>>>>> component of a file path), with a FS_PICK program. It is only at this
>>>>> time that the kernel check for the requested action (e.g. read, write,
>>>>> chdir, append...). To be able to filter such access request we can have
>>>>> one call to the same program for every action and let this program check
>>>>> for which action it was called. However, this design does not allow the
>>>>> kernel to know if the current action is indeed handled by this program.
>>>>> Hence, it is not possible to implement a cache mechanism to only call
>>>>> this program if it knows how to handle this action.
>>>>>
>>>>> The approach I took for this FS_PICK type of program is to add to its
>>>>> subtype which action it can handle (with the "triggers" bitfield, seen
>>>>> as ORed actions). This way, the kernel knows if a call to a FS_PICK
>>>>> program is necessary. If the user wants to enforce a different security
>>>>> policy according to the action requested on a file, then it needs
>>>>> multiple FS_PICK programs. However, to reduce the number of such
>>>>> programs, this patch series allow a FS_PICK program to be chained with
>>>>> another, the same way a FS_WALK is chained with itself. This way, if the
>>>>> user want to check if the action is a for example an "open" and a "read"
>>>>> and not a "map" and a "read", then it can chain multiple FS_PICK
>>>>> programs with different triggers actions. The OR check performed by the
>>>>> kernel is not a limitation then, only a way to know if a call to an eBPF
>>>>> program is needed.
>>>>>
>>>>> The last type of program is FS_GET. This one is called when a process
>>>>> get a struct file or change its working directory. This is the only
>>>>> program type able (and allowed) to tag a file. This restriction is
>>>>> important to not being subject to resource exhaustion attacks (i.e.
>>>>> tagging every inode accessible to an attacker, which would allocate too
>>>>> much kernel memory).
>>>>>
>>>>> This design gives room for improvements to create a cache of eBPF
>>>>> context (input data, including maps if any), with the result of an eBPF
>>>>> program. This would help limit the number of call to an eBPF program the
>>>>> same way SELinux or other kernel components do to limit costly checks.
>>>>>
>>>>> The eBPF maps of progs are useful to call the same type of eBPF
>>>>> program. It does not fit with this use case because we may want multiple
>>>>> eBPF program according to the action requested on a kernel object (e.g.
>>>>> FS_GET). The other reason is because the eBPF program does not know what
>>>>> will be the next (type of) access check performed by the kernel.
>>>>>
>>>>> To say it another way, this chaining mechanism is a way to split a
>>>>> kernel object evaluation with multiple specialized programs, each of
>>>>> them being able to deal with data tied to their type. Using a monolithic
>>>>> eBPF program to check everything does not scale and does not fit with
>>>>> unprivileged use either.
>>>>>
>>>>> As a side note, the cookie value is only an ephemeral value to keep a
>>>>> state between multiple programs call. It can be used to create a state
>>>>> machine for an object evaluation.
>>>>>
>>>>> I don't see a way to do an efficient and programmatic path evaluation,
>>>>> with different access checks, with the current eBPF features. Please let
>>>>> me know if you know how to do it another way.
>>>>>
>>>>
>>>> Andy, Alexei, Daniel, what do you think about this Landlock program
>>>> chaining and cookie?
>>>>
>>>
>>> Can you give a small pseudocode real world example that acutally needs
>>> chaining?  The mechanism is quite complicated and I'd like to
>>> understand how it'll be used.
>>>
>>
>> Here is the interesting part from the example (patch 09/11):
>>
>> +SEC("maps")
>> +struct bpf_map_def inode_map = {
>> +	.type = BPF_MAP_TYPE_INODE,
>> +	.key_size = sizeof(u32),
>> +	.value_size = sizeof(u64),
>> +	.max_entries = 20,
>> +};
>> +
>> +SEC("subtype/landlock1")
>> +static union bpf_prog_subtype _subtype1 = {
>> +	.landlock_hook = {
>> +		.type = LANDLOCK_HOOK_FS_WALK,
>> +	}
>> +};
>> +
>> +static __always_inline __u64 update_cookie(__u64 cookie, __u8 lookup,
>> +		void *inode, void *chain, bool freeze)
>> +{
>> +	__u64 map_allow = 0;
>> +
>> +	if (cookie == 0) {
>> +		cookie = bpf_inode_get_tag(inode, chain);
>> +		if (cookie)
>> +			return cookie;
>> +		/* only look for the first match in the map, ignore nested
>> +		 * paths in this example */
>> +		map_allow = bpf_inode_map_lookup(&inode_map, inode);
>> +		if (map_allow)
>> +			cookie = 1 | map_allow;
>> +	} else {
>> +		if (cookie & COOKIE_VALUE_FREEZED)
>> +			return cookie;
>> +		map_allow = cookie & _MAP_MARK_MASK;
>> +		cookie &= ~_MAP_MARK_MASK;
>> +		switch (lookup) {
>> +		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOTDOT:
>> +			cookie--;
>> +			break;
>> +		case LANDLOCK_CTX_FS_WALK_INODE_LOOKUP_DOT:
>> +			break;
>> +		default:
>> +			/* ignore _MAP_MARK_MASK overflow in this example */
>> +			cookie++;
>> +			break;
>> +		}
>> +		if (cookie >= 1)
>> +			cookie |= map_allow;
>> +	}
>> +	/* do not modify the cookie for each fs_pick */
>> +	if (freeze && cookie)
>> +		cookie |= COOKIE_VALUE_FREEZED;
>> +	return cookie;
>> +}
>> +
>> +SEC("landlock1")
>> +int fs_walk(struct landlock_ctx_fs_walk *ctx)
>> +{
>> +	ctx->cookie = update_cookie(ctx->cookie, ctx->inode_lookup,
>> +			(void *)ctx->inode, (void *)ctx->chain, false);
>> +	return LANDLOCK_RET_ALLOW;
>> +}
>>
>> The program "landlock1" is called for every directory execution (except
>> the last one if it is the leaf of a path). This enables to identify a
>> file hierarchy with only a (one dimension) list of file descriptors
>> (i.e. inode_map).
>>
>> Underneath, the Landlock LSM part looks if there is an associated path
>> walk (nameidata) with each inode access request. If there is one, then
>> the cookie associated with the path walk (if any) is made available
>> through the eBPF program context. This enables to develop a state
>> machine with an eBPF program to "evaluate" a file path (without string
>> parsing).
>>
>> The goal with this chaining mechanism is to be able to express a complex
>> kernel object like a file, with multiple run of one or more eBPF
>> programs, as a multilayer evaluation. This semantic may only make sense
>> for the user/developer and his security policy. We must keep in mind
>> that this object identification should be available to unprivileged
>> processes. This means that we must be very careful to what kind of
>> information are available to an eBPF program because this can then leak
>> to a process (e.g. through a map). With this mechanism, only information
>> already available to user space is available to the eBPF program.
>>
>> In this example, the complexity of the path evaluation is in the eBPF
>> program. We can then keep the kernel code more simple and generic. This
>> enables more flexibility for a security policy definition.
> 
> it all sounds correct on paper, but it's pretty novel
> approach and I'm not sure I see all the details in the patch.
> When people say "inode" they most of the time mean inode integer number,
> whereas in this patch do you mean a raw pointer to in-kernel
> 'struct inode' ?
> To avoid confusion it should probably be called differently.

It's indeed a pointer to a "struct inode", not an inode number.

I was thinking about generalizing the BPF_MAP_TYPE_INODE by renaming it
to BPF_MAP_TYPE_FD. This map type could then be used either to identify
a set of inodes (pointers) or other kernel objects identifiable by a
file descriptor. A "subtype" (similar to the BPF prog subtype introduced
in this patch series) may be used to specialize such a map to statically
identify the kind of content it may hold. We could then add more
subtypes to identify sockets, devices, processes, and so on.

> 
> If you meant inode as a number then why inode only?
> where is superblock, device, mount point?
> How bpf side can compare inodes without this additional info?
> How bpf side will know what inode to compare to?
> What if inode number is reused?

This pointer can identify if a giver inode is the same as one pointed by
a file descriptor (or a file path).


> This approach is an optimization to compare inodes
> instead of strings passed into sys_open ?

Comparing paths with strings is less efficient but it is also very
error-prone. Another advantage of using file descriptors is for
unprivileged processes: we can be sure that this processes are allowed
to access a file referred by a file descriptor (opened file). Indeed we
check (security_inode_getattr) that the process is allowed to stat an
opened file. This way, a malicious process can't infer information by
crafting path strings.


> 
> If you meant inode as a pointer how bpf side will
> know the pointer before the walk begins?

The BPF map is filled by user space with file descriptors pointing to
opened files. When a path walk begin, the LSM part of Landlock is
notified that a process is requesting an access to the first element of
the path (e.g. "/"). This first element may be part of a map or not. The
BPF program can then choose if this request is legitimate or not.


> What guarantees that it's not a stale pointer?

When user space updates a map with a new file descriptor, the kernel
checks if this FD is valid. If this is the case, then the inode's usage
counter is incremented and its address is stored in the map.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-04-11 22:20 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20180227004121.3633-1-mic@digikod.net>
2018-02-27  0:41 ` [PATCH bpf-next v8 01/11] fs,security: Add a security blob to nameidata Mickaël Salaün
2018-02-27  0:57   ` Al Viro
2018-02-27  1:23     ` Al Viro
2018-03-11 20:14       ` Mickaël Salaün
2018-02-28 16:27   ` kbuild test robot
2018-02-28 16:58   ` kbuild test robot
2018-02-27  0:41 ` [PATCH bpf-next v8 02/11] fs,security: Add a new file access type: MAY_CHROOT Mickaël Salaün
     [not found] ` <20180227004121.3633-6-mic@digikod.net>
     [not found]   ` <20180227020856.teq4hobw3zwussu2@ast-mbp>
     [not found]     ` <CALCETrVKRYnGJ9XNW-x7eHdMt+eGP90j7cAed-KTzp1KT_kMeQ@mail.gmail.com>
     [not found]       ` <20180227045458.wjrbbsxf3po656du@ast-mbp>
     [not found]         ` <CALCETrXZ=xJEd53RhA67_VDoAKBWgUeyBi9XN7ibrE7V6nzk5Q@mail.gmail.com>
     [not found]           ` <20180227053255.a7ua24kjd6tvei2a@ast-mbp>
     [not found]             ` <CALCETrUqc0wbig4ntQe9KNS-fOgYOpD+MPodXYruCRyJ7bhagA@mail.gmail.com>
     [not found]               ` <ab8dda73-4a6e-4e10-cda0-3e91c5019a63@digikod.net>
     [not found]                 ` <498f8193-c909-78b2-e4ca-c1dd05605255@digikod.net>
     [not found]                   ` <CALCETrViaXEx1iQ6q8bEEWSLchj=FH6LjcRY6+hjMx8A+rtgDQ@mail.gmail.com>
2018-04-08 22:01                     ` [PATCH bpf-next v8 05/11] seccomp,landlock: Enforce Landlock programs per process hierarchy Mickaël Salaün
2018-04-10  4:48                       ` Alexei Starovoitov
2018-04-11 22:18                         ` Mickaël Salaün

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).