[PATCH 1/1] RFC: taking a crack at targeted capabilities

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/1] RFC: taking a crack at targeted capabilities
@ 2010-01-06  6:28 Serge E. Hallyn
       [not found] ` <20100106062809.GA17064-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-01-06  6:28 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

So I was thinking about how to safely but incrementally introduce
targeted capabilities - which we decided was a prereq to making VFS
handle user namespaces - and the following seemed doable.  My main
motivations were (in order):

        1. don't make any unconverted capable() checks unsafe
        2. minimize performance impact on non-container case
        3. minimize performance impact on containers

This patch adds a per-task inherited securebit SECURE_CONTAINERIZED.
The capable() call is considered unconverted.  Therefore any call
to capable() by a task which is SECURE_CONTAINERIZED is denied (capable()
returns 0, so the caller typically fails with -EPERM).

A new kernel function capable_to() is the container-aware version of capable().

int capable_to(int cap, enum ns_type type, void *src, void *dest);

meaning a task which owns 'src' wants 'cap' access to an object
in namespace 'dest'.

In a case like setting hostname, there is no way to try to set the
hostname in another container, so the check is converted in this patch to

        capable_to(CAP_SYS_ADMIN, NS_TYPE_NONE, NULL, NULL);

capable_to() will act like the old capable(), meaning grant permission
if CAP_SYS_ADMIN is in pE.

The check for sending a signal depends on a user namespace, so I
converted an instance to

        capable_to(CAP_KILL, NS_TYPE_USERNS, current_userns(),
                        target->user_ns);

The NS_TYPE_USERNS check checks whether target->user_ns is the same
as or a descendant of current_userns().  If not, then permission is
denied (-EPERM) even if the task has CAP_KILL.

To test, compile a program (call it 'containerize_cap') that does

	prctl(PR_SET_SECUREBITS, 1 << 6 | 1 << 7);
	execl("/bin/bash", "bash", NULL);

Run that in a container (say, do 'ns_exec -cmpuU /bin/bash' and
run screen there).  Notice you can set hostname, but you can't
for instance read user's directories which don't have world read
perms, and can't mount.  You can also kill processes which are
either in your own or a child user namespace, but not in a parent
user namespace.

Purely for discussion.  Comments?

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/capability.h     |    5 +++
 include/linux/securebits.h     |   15 ++++++++++-
 include/linux/user_namespace.h |    9 ++++++
 kernel/capability.c            |   55 ++++++++++++++++++++++++++++++++++++++++
 kernel/signal.c                |    3 +-
 kernel/sys.c                   |    2 +-
 kernel/user_namespace.c        |   20 ++++++++++++++
 7 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 39e5ff5..f618804 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -110,6 +110,10 @@ struct cpu_vfs_cap_data {
 
 #endif
 
+enum ns_type {
+	NS_TYPE_NONE = 0,
+	NS_TYPE_USERNS,
+};
 
 /**
  ** POSIX-draft defined capabilities.
@@ -561,6 +565,7 @@ extern const kernel_cap_t __cap_init_eff_set;
 	(security_real_capable_noaudit((t), (cap)) == 0)
 
 extern int capable(int cap);
+extern int capable_to(int cap, enum ns_type type, void *src, void *dest);
 
 /* audit system wants to get cap info from files as well */
 struct dentry;
diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index 3340617..8cc2329 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -43,12 +43,25 @@
 #define SECURE_KEEP_CAPS		4
 #define SECURE_KEEP_CAPS_LOCKED		5  /* make bit-4 immutable */
 
+/* When set, capable() will always deny (return 0).  Capability checks
+   which make sense with respect to a container, or are safe to grant
+   in a container, can be converted to capable_to().
+   Note this is included in SECURE_ALL_BITS (and so its lock bit is in
+   SECURE_ALL_LOCKS) so that prctl(PR_SET_SECUREBITS) will accept it. */
+#define SECURE_CONTAINERIZED		6
+#define SECURE_CONTAINERIZED_LOCKED	7
+
+#define SECBIT_CONTAINERIZED (issecure_mask(SECURE_CONTAINERIZED))
+#define SECBIT_CONTAINERIZED_LOCKED \
+			(issecure_mask(SECBIT_CONTAINERIZED_LOCKED))
+
 #define SECBIT_KEEP_CAPS	(issecure_mask(SECURE_KEEP_CAPS))
 #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
 
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
-				 issecure_mask(SECURE_KEEP_CAPS))
+				 issecure_mask(SECURE_KEEP_CAPS) | \
+				 issecure_mask(SECURE_CONTAINERIZED))
 #define SECURE_ALL_LOCKS	(SECURE_ALL_BITS << 1)
 
 #endif /* !_LINUX_SECUREBITS_H */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index cc4f453..e05d06a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -20,6 +20,9 @@ extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
 
+extern int userns_is_ancestor(struct user_namespace *src,
+				struct user_namespace *dest);
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -38,6 +41,12 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #else
 
+static inline int userns_is_ancestor(struct user_namespace *src,
+				struct user_namespace *dest)
+{
+	return 1;
+}
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e6..0efd0e7 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/securebits.h>
 #include <asm/uaccess.h>
 #include "cred-internals.h"
 
@@ -307,6 +308,9 @@ int capable(int cap)
 		BUG();
 	}
 
+	if (issecure(SECURE_CONTAINERIZED))
+		return 0;
+
 	if (security_capable(cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
@@ -314,3 +318,54 @@ int capable(int cap)
 	return 0;
 }
 EXPORT_SYMBOL(capable);
+
+/* defined in kernel/user_namespace.c */
+extern int userns_is_ancestor(struct user_namespace *src,
+				struct user_namespace *dest);
+
+/**
+ * capable_to - Determine if the current task has capability applicable to the
+ * target namespace
+ * 
+ * @cap: The capability to be tested for
+ * @type: type of namespace
+ * @src: user's namespace
+ * @dest: object's namespace
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable_to(int cap, enum ns_type type, void *src, void *dest)
+{
+	if (unlikely(!cap_valid(cap))) {
+		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+		BUG();
+	}
+
+	if (!issecure(SECURE_CONTAINERIZED))
+		goto check_capable;
+
+	switch(type) {
+	case NS_TYPE_NONE:
+		goto check_capable;
+	case NS_TYPE_USERNS:
+		if (!userns_is_ancestor(src, dest))
+			return 0;
+		goto check_capable;
+	default:
+		printk(KERN_CRIT "capable_to() called with invalid type=%d\n",
+			type);
+		BUG();
+		return 0;
+	}
+
+check_capable:
+	if (security_capable(cap) == 0) {
+		current->flags |= PF_SUPERPRIV;
+		return 1;
+	}
+	return 0;
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index d09692b..9600028 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -644,7 +644,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	    (cred->euid ^ tcred->uid) &&
 	    (cred->uid  ^ tcred->suid) &&
 	    (cred->uid  ^ tcred->uid) &&
-	    !capable(CAP_KILL)) {
+	    !capable_to(CAP_KILL, NS_TYPE_USERNS, cred->user->user_ns,
+				tcred->user->user_ns)) {
 		switch (sig) {
 		case SIGCONT:
 			sid = task_session(t);
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73..5c40837 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1134,7 +1134,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_to(CAP_SYS_ADMIN, NS_TYPE_NONE, NULL, NULL))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8..49944fb 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -82,3 +82,23 @@ void free_user_ns(struct kref *kref)
 	schedule_work(&ns->destroyer);
 }
 EXPORT_SYMBOL(free_user_ns);
+
+/*
+ * userns_is_ancestor: return true if src is equal to or an ancestor
+ * of dest
+ */
+int userns_is_ancestor(struct user_namespace *src, struct user_namespace *dest)
+{
+	struct user_struct *u;
+	struct user_namespace *ns = dest;
+
+	if (dest == src)
+		return 1;
+	while (ns != &init_user_ns && ns != src) {
+		u = ns->creator;
+		ns = u->user_ns;
+	}
+	if (ns == src)
+		return 1;
+	return 0;
+}
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found] ` <20100106062809.GA17064-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-01-06 15:44   ` Eric W. Biederman
       [not found]     ` <m13a2j2q7c.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  2010-01-06 16:56   ` Eric W. Biederman
  1 sibling, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-06 15:44 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> So i was thinking about how to safely but incrementally introduce
> targeted capabilities - which we decided was a prereq to making VFS
> handle user namespaces - and the following seemed doable.  My main
> motivations were (in order):
>
>         1. don't make any unconverted capable() checks unsafe
>         2. minimize performance impact on non-container case
>         3. minimize performance impact on containers
>
> This patch adds a per-task inherited securebit SECURE_CONTAINERIZED.
> The capable() call is considered unconverted.  Therefore any call
> to capable() by a task which is SECURE_CONTAINERIZED returns -EPERM.
>
> A new syscall capable_to() is the container-aware version of capable().
>
> int capable_to(int cap, enum ns_type type, void *src, void *dest);
>
> meaning a task which owns 'src' wants 'cap' access to an object
> in namespace 'dest'.
>
> In a case like setting hostname, there is no way to try to set the
> hostname in another container, so the check is converted in this patch to
>
>         capable_to(CAP_SYS_ADMIN, NS_TYPE_NONE, NULL, NULL);
>
> capable_to() will act like the old capable(), meaning grant permission
> if CAP_SYS_ADMIN is in pE.
>
> The check for sending a signal depends on a user namespace, so I
> converted an instance to
>
>         capable_to(CAP_KILL, NS_TYPE_USERNS, current_userns(),
>                         target->user_ns);
>
> The NS_TYPE_USERNS check checks whether target->userns is the same
> as or a descendent of target->user_ns.  If not, then -EPERM is
> returned even if the task has CAP_KILL.
>
> To test, compile a program (call it 'containerize_cap') that does
>
> 	prctl(PR_SET_SECUREBITS, 1 << 6 | 1 << 7);
> 	execl("/bin/bash", "bash", NULL);
>
> Run that in a container (say, do 'ns_exec -cmpuU /bin/bash' and
> run screen there).  Notice you can set hostname, but you can't
> for instance read user's directories which don't have world write
> perms, and can't mount.  You can also kill processes which are
> either in your own or a child user namespace, but not in a parent
> user namespace.
>
> Purely for discussion.  Comments?

This looks like a good start of discussion, and you have
chosen two good examples.

I believe your check for ancestor user namespaces is actually
too liberal.  I can't quite follow it, but it looks like any
process in an ancestor user namespace has all rights over
a child, which would let fred kill joe's processes.

I think we can use a much simpler definition, based on the core
concept that we are making the capabilities namespace relative,
thus we need to pass in which namespace we want the capability for.

	/* Put in kernel/capability.c */
	int capable(int cap)
	{
	        return capable_to(&init_user_ns, cap);
	}
	
	int capable_to(struct user_namespace *ns, int cap)
	{
	        if (unlikely(!cap_valid(cap))) {
			printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
			BUG();
	        }
	        
	        if (security_capable(ns, cap) == 0) {
	        	current->flags |= PF_SUPERPRIV;
	                return 1;
	        }
	        return 0;
	}
	
	/* Put in security/common_cap.c */
	int cap_capable(struct task_struct *tsk, const cred *cred,
	    		struct user_namespace *targ_ns, int targ_cap, int audit)
	{
	        struct user_namespace *curr_ns = cred->user->user_ns
	
	        for (;;) {
	        	/* Do we have the necessary capabilities? */
		        if (targ_ns == curr_ns)
				return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
	
			/* The creator of the user namespace has all caps. */
			if (targ_ns->creator == cred->user)
				return 0;
	
			/* Have we tried all of the parent namespaces? */
			if (targ_ns == &init_user_ns)
				return -EPERM;
	
			/* If you have the capability in a parent user ns you have it
	                 * in the over all children user namespaces as well, so see
	                 * if this process has the capability in the parent user
	                 * namespace.
	                 */
			targ_ns = targ_ns->creator->user_ns;
		}
	
	        /* We never get here */
		return -EPERM;                
	}


The example in check_kill_permission simply becomes:
	capable_to(tcred->user->user_ns, CAP_KILL);

The check in hostname remains unchanged until we teach
the userns to unshare without privilege, at which point the check should
become:
	capable_to(utsname()->creator->user_ns, CAP_SYS_ADMIN);

Which matters because we can set the hostname through /proc/sys....

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found] ` <20100106062809.GA17064-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-01-06 15:44   ` Eric W. Biederman
@ 2010-01-06 16:56   ` Eric W. Biederman
       [not found]     ` <m17hrv18ad.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-06 16:56 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> So i was thinking about how to safely but incrementally introduce
> targeted capabilities - which we decided was a prereq to making VFS
> handle user namespaces - and the following seemed doable.  My main
> motivations were (in order):
>
>         1. don't make any unconverted capable() checks unsafe
>         2. minimize performance impact on non-container case
>         3. minimize performance impact on containers

My motivation is a bit different.  I would like to get to the
unprivileged creation of new namespaces.  It looks like this gets us
90% of the way there, with only potential uid confusion issues left.

I still need to handle getting all caps after creation but otherwise I
think I have a good starter patch that achieves all of your goals.

Of course kill_permission needs the checks you have suggested as well.

Eric


From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
Date: Wed, 6 Jan 2010 08:26:21 -0800
Subject: [PATCH] security:  Make capabilities relative to the user namespace.

- Introduce ns_capable to test for a capability in a non-default
  user namespace.
- Teach cap_capable to handle capabilities in a non-default
  user namespace.

Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 include/linux/capability.h |    6 ++++--
 include/linux/security.h   |   12 +++++++-----
 kernel/capability.c        |   22 ++++++++++++++++++++--
 security/commoncap.c       |   40 +++++++++++++++++++++++++++++++++-------
 security/security.c        |   12 ++++++------
 security/selinux/hooks.c   |   14 +++++++++-----
 6 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 39e5ff5..89572b2 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -544,7 +544,7 @@ extern const kernel_cap_t __cap_init_eff_set;
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-#define has_capability(t, cap) (security_real_capable((t), (cap)) == 0)
+#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
 
 /**
  * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
@@ -558,9 +558,11 @@ extern const kernel_cap_t __cap_init_eff_set;
  * Note that this does not set PF_SUPERPRIV on the task.
  */
 #define has_capability_noaudit(t, cap) \
-	(security_real_capable_noaudit((t), (cap)) == 0)
+	(security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0)
 
+struct user_namespace;
 extern int capable(int cap);
+extern int ns_capable(struct user_namespace *ns, int cap);
 
 /* audit system wants to get cap info from files as well */
 struct dentry;
diff --git a/include/linux/security.h b/include/linux/security.h
index 2c627d3..f44932f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -45,13 +45,14 @@
 
 struct ctl_table;
 struct audit_krule;
+struct user_namespace;
 
 /*
  * These functions are in security/capability.c and are used
  * as the default capabilities functions
  */
 extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
-		       int cap, int audit);
+		       struct user_namespace *ns, int cap, int audit);
 extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
@@ -1327,6 +1328,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	credentials.
  *	@tsk contains the task_struct for the process.
  *	@cred contains the credentials to use.
+ *      @ns contains the user namespace we want the capability in
  *	@cap contains the capability <include/linux/capability.h>.
  *	@audit: Whether to write an audit message or not
  *	Return 0 if the capability is granted for @tsk.
@@ -1457,7 +1459,7 @@ struct security_operations {
 		       const kernel_cap_t *inheritable,
 		       const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, const struct cred *cred,
-			int cap, int audit);
+			struct user_namespace *ns, int cap, int audit);
 	int (*acct) (struct file *file);
 	int (*sysctl) (struct ctl_table *table, int op);
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
@@ -1754,9 +1756,9 @@ int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted);
-int security_capable(int cap);
-int security_real_capable(struct task_struct *tsk, int cap);
-int security_real_capable_noaudit(struct task_struct *tsk, int cap);
+int security_capable(struct user_namespace *ns, int cap);
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap);
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap);
 int security_acct(struct file *file);
 int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e6..63dcf53 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include "cred-internals.h"
 
@@ -302,15 +303,32 @@ error:
  */
 int capable(int cap)
 {
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int ns_capable(struct user_namespace *ns, int cap)
+{
 	if (unlikely(!cap_valid(cap))) {
 		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(ns, cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
 	return 0;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
diff --git a/security/commoncap.c b/security/commoncap.c
index 34500e3..ffde5be 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/prctl.h>
 #include <linux/securebits.h>
+#include <linux/user_namespace.h>
 
 /*
  * If a non-root user executes a setuid-root binary in
@@ -68,6 +69,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_capable - Determine whether a task has a particular effective capability
  * @tsk: The task to query
  * @cred: The credentials to use
+ * @ns:  The user namespace in which we need the capability
  * @cap: The capability to check for
  * @audit: Whether to write an audit message or not
  *
@@ -79,10 +81,32 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_has_capability() returns 0 when a task has a capability, but the
  * kernel's capable() and has_capability() returns 1 for this case.
  */
-int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
-		int audit)
+int cap_capable(struct task_struct *tsk, const struct cred *cred,
+		struct user_namespace *targ_ns, int cap, int audit)
 {
-	return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+	for (;;) {
+		/* Do we have the necessary capabilities? */
+		if (targ_ns == cred->user->user_ns)
+			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+	
+		/* The creator of the user namespace has all caps. */
+		if (targ_ns->creator == cred->user)
+			return 0;
+	
+		/* Have we tried all of the parent namespaces? */
+		if (targ_ns == &init_user_ns)
+			return -EPERM;
+	
+		/* If you have the capability in a parent user ns you have it
+		 * in the over all children user namespaces as well, so see
+		 * if this process has the capability in the parent user
+		 * namespace.
+		 */
+		targ_ns = targ_ns->creator->user_ns;
+	}
+	
+	/* We never get here */
+	return -EPERM;                
 }
 
 /**
@@ -177,7 +201,8 @@ static inline int cap_inh_is_capped(void)
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
 	 */
-	if (cap_capable(current, current_cred(), CAP_SETPCAP,
+	if (cap_capable(current, current_cred(),
+			current_cred()->user->user_ns, CAP_SETPCAP,
 			SECURITY_CAP_AUDIT) == 0)
 		return 0;
 	return 1;
@@ -832,7 +857,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		     & (new->securebits ^ arg2))			/*[1]*/
 		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
-		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
+		    || (cap_capable(current, current_cred(),
+				    current_cred()->user->user_ns, CAP_SETPCAP,
 				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
@@ -910,7 +936,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int cap_sys_admin = 0;
 
-	if (cap_capable(current, current_cred(), CAP_SYS_ADMIN,
+	if (cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_ADMIN,
 			SECURITY_CAP_NOAUDIT) == 0)
 		cap_sys_admin = 1;
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
@@ -937,7 +963,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
 	int ret = 0;
 
 	if (addr < dac_mmap_min_addr) {
-		ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO,
+		ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO,
 				  SECURITY_CAP_AUDIT);
 		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
 		if (ret == 0)
diff --git a/security/security.c b/security/security.c
index 24e060b..ad75427 100644
--- a/security/security.c
+++ b/security/security.c
@@ -155,30 +155,30 @@ int security_capset(struct cred *new, const struct cred *old,
 				    effective, inheritable, permitted);
 }
 
-int security_capable(int cap)
+int security_capable(struct user_namespace *ns, int cap)
 {
-	return security_ops->capable(current, current_cred(), cap,
+	return security_ops->capable(current, current_cred(), ns, cap,
 				     SECURITY_CAP_AUDIT);
 }
 
-int security_real_capable(struct task_struct *tsk, int cap)
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_AUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_AUDIT);
 	put_cred(cred);
 	return ret;
 }
 
-int security_real_capable_noaudit(struct task_struct *tsk, int cap)
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_NOAUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_NOAUDIT);
 	put_cred(cred);
 	return ret;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index bd77a2b..a69f97d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -76,6 +76,7 @@
 #include <linux/selinux.h>
 #include <linux/mutex.h>
 #include <linux/posix-timers.h>
+#include <linux/user_namespace.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -1480,6 +1481,7 @@ static int current_has_perm(const struct task_struct *tsk,
 /* Check whether a task is allowed to use a capability. */
 static int task_has_capability(struct task_struct *tsk,
 			       const struct cred *cred,
+			       struct user_namespace *ns,
 			       int cap, int audit)
 {
 	struct common_audit_data ad;
@@ -1927,15 +1929,15 @@ static int selinux_capset(struct cred *new, const struct cred *old,
  */
 
 static int selinux_capable(struct task_struct *tsk, const struct cred *cred,
-			   int cap, int audit)
+			   struct user_namespace *ns, int cap, int audit)
 {
 	int rc;
 
-	rc = cap_capable(tsk, cred, cap, audit);
+	rc = cap_capable(tsk, cred, ns, cap, audit);
 	if (rc)
 		return rc;
 
-	return task_has_capability(tsk, cred, cap, audit);
+	return task_has_capability(tsk, cred, ns, cap, audit);
 }
 
 static int selinux_sysctl_get_sid(ctl_table *table, u16 tclass, u32 *sid)
@@ -2091,7 +2093,8 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int rc, cap_sys_admin = 0;
 
-	rc = selinux_capable(current, current_cred(), CAP_SYS_ADMIN,
+	rc = selinux_capable(current, current_cred(),
+			     &init_user_ns, CAP_SYS_ADMIN,
 			     SECURITY_CAP_NOAUDIT);
 	if (rc == 0)
 		cap_sys_admin = 1;
@@ -2889,7 +2892,8 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
 	 * and lack of permission just means that we fall back to the
 	 * in-core context value, not a denial.
 	 */
-	error = selinux_capable(current, current_cred(), CAP_MAC_ADMIN,
+	error = selinux_capable(current, current_cred(),
+				&init_user_ns, CAP_MAC_ADMIN,
 				SECURITY_CAP_NOAUDIT);
 	if (!error)
 		error = security_sid_to_context_force(isec->sid, &context,
-- 
1.6.5.2.143.g8cc62

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]     ` <m13a2j2q7c.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2010-01-06 17:30       ` Serge E. Hallyn
       [not found]         ` <20100106173056.GC15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-01-06 17:30 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > So i was thinking about how to safely but incrementally introduce
> > targeted capabilities - which we decided was a prereq to making VFS
> > handle user namespaces - and the following seemed doable.  My main
> > motivations were (in order):
> >
> >         1. don't make any unconverted capable() checks unsafe
> >         2. minimize performance impact on non-container case
> >         3. minimize performance impact on containers
> >
> > This patch adds a per-task inherited securebit SECURE_CONTAINERIZED.
> > The capable() call is considered unconverted.  Therefore any call
> > to capable() by a task which is SECURE_CONTAINERIZED returns -EPERM.
> >
> > A new syscall capable_to() is the container-aware version of capable().
> >
> > int capable_to(int cap, enum ns_type type, void *src, void *dest);
> >
> > meaning a task which owns 'src' wants 'cap' access to an object
> > in namespace 'dest'.
> >
> > In a case like setting hostname, there is no way to try to set the
> > hostname in another container, so the check is converted in this patch to
> >
> >         capable_to(CAP_SYS_ADMIN, NS_TYPE_NONE, NULL, NULL);
> >
> > capable_to() will act like the old capable(), meaning grant permission
> > if CAP_SYS_ADMIN is in pE.
> >
> > The check for sending a signal depends on a user namespace, so I
> > converted an instance to
> >
> >         capable_to(CAP_KILL, NS_TYPE_USERNS, current_userns(),
> >                         target->user_ns);
> >
> > The NS_TYPE_USERNS check checks whether target->userns is the same
> > as or a descendent of target->user_ns.  If not, then -EPERM is
> > returned even if the task has CAP_KILL.
> >
> > To test, compile a program (call it 'containerize_cap') that does
> >
> > 	prctl(PR_SET_SECUREBITS, 1 << 6 | 1 << 7);
> > 	execl("/bin/bash", "bash", NULL);
> >
> > Run that in a container (say, do 'ns_exec -cmpuU /bin/bash' and
> > run screen there).  Notice you can set hostname, but you can't
> > for instance read user's directories which don't have world write
> > perms, and can't mount.  You can also kill processes which are
> > either in your own or a child user namespace, but not in a parent
> > user namespace.
> >
> > Purely for discussion.  Comments?
> 
> This looks like a good start of discussion, and you have
> choosen two good examples.
> 
> I believe your check for ancestor user namespaces is actually
> too liberal, I can't quite follow it but it looks like any
> process in an ancestor user namespace has all rights over
> a child, which would let fred kill joe's processes..

But that's only if fred has CAP_KILL in a user namespace which is
ancestor to joe's process.  Only fred's processes in a child
userns should have CAP_KILL.

> I think we can use a much simpler definition, based on the core
> concept that we are making the capabilities namespace relative,
> thus we need to pass in which namespace we want the capability for.
> 
> 	/* Put in kernel/capability.c */
> 	int capable(int cap)
> 	{
> 	        return capable_to(&init_user_ns, cap);
> 	}
> 	
> 	int capable_to(struct user_namespace *ns, int cap)
> 	{
> 	        if (unlikely(!cap_valid(cap))) {
> 			printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
> 			BUG();
> 	        }
> 	        
> 	        if (security_capable(ns, cap) == 0) {
> 	        	current->flags |= PF_SUPERPRIV;
> 	                return 1;
> 	        }
> 	        return 0;
> 	}
> 	
> 	/* Put in security/common_cap.c */
> 	int cap_capable(struct task_struct *tsk, const cred *cred,
> 	    		struct user_namespace *targ_ns, int targ_cap, int audit)
> 	{
> 	        struct user_namespace *curr_ns = cred->user->user_ns
> 	
> 	        for (;;) {
> 	        	/* Do we have the necessary capabilities? */
> 		        if (targ_ns == curr_ns)
> 				return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> 	
> 			/* The creator of the user namespace has all caps. */
> 			if (targ_ns->creator == cred->user)
> 				return 0;
> 	
> 			/* Have we tried all of the parent namespaces? */
> 			if (targ_ns == &init_user_ns)
> 				return -EPERM;
> 	
> 			/* If you have the capability in a parent user ns you have it
> 	                 * in the over all children user namespaces as well, so see
> 	                 * if this process has the capability in the parent user
> 	                 * namespace.
> 	                 */
> 			targ_ns = targ_ns->creator->user_ns;
> 		}
> 	
> 	        /* We never get here */
> 		return -EPERM;                
> 	}
> 
> 
> The example in check_kill_permission simply becomes:
> 	capable_to(tcred->user->user_ns, CAP_KILL);
> 
> While the check in hostname remains unchanged until we convert teach
> the userns to unshare without privilege.  At which point the check should
> become.
> 	capable_to(utsname()->creator->user_ns, CAP_SYS_ADMIN);
> 
> Which matters because we can set the hostname through /proc/sys....

Oh, right.  However, utsname doesn't have a creator, and we won't always
want to use user namespaces to authorize.  For instance, for CAP_NET_ADMIN
we'll want to compare the net_ns.  That's why I had the switch inside
capable_to() based on ns type.

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]     ` <m17hrv18ad.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2010-01-06 17:35       ` Serge E. Hallyn
       [not found]         ` <20100106173536.GD15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-01-06 20:17       ` Serge E. Hallyn
  2010-02-15  4:05       ` Serge E. Hallyn
  2 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-01-06 17:35 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > So i was thinking about how to safely but incrementally introduce
> > targeted capabilities - which we decided was a prereq to making VFS
> > handle user namespaces - and the following seemed doable.  My main
> > motivations were (in order):
> >
> >         1. don't make any unconverted capable() checks unsafe
> >         2. minimize performance impact on non-container case
> >         3. minimize performance impact on containers
> 
> My motivation is a bit different.  I would like to get to the
> unprivileged creation of new namespaces.  It looks like this gets us
> 90% of the way there, with only potential uid confusion issues left.

Yup, that was actually what I was thinking about last night when I decided
to give it a shot :)  IMO, my patch + a dummy version of user_namespaces
for vfs (done in a clean way that can be an incremental step toward full
vfs userns support - which I haven't yet thought through) is enough to
give you safe fully unprivileged containers.  Now with the API I have,
you'd have a program with either setuid-root or cap_sys_admin,cap_setpcap=pe
which does the prctl and the unshares, but it would theoretically be safe
to hand that program to unprivileged users.

> I still need to handle getting all caps after creation but otherwise I
> think I have a good starter patch that achieves all of your goals.

Well in my patch we don't need to clear out the bounding set, or set
SETUID_NOROOT - so running a setuid root program or becoming root should
still give you capabilities!  They'll just be targeted at your container.

I really think this is what you need.

> Of course kill_permission needs the checks you have suggested as well.

Ok, I can't look at your patch in detail right now and don't quite get
where you're going with a quick glance, so will look in closer detail
later.   Will also think about a way to get "just-enough" vfs userns
support to completely give you what you need for privileged users in
unprivileged containers.

-serge

> >From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
> From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> Date: Wed, 6 Jan 2010 08:26:21 -0800
> Subject: [PATCH] security:  Make capabilities relative to the user namespace.
> 
> - Introduce ns_capable to test for a capability in a non-default
>   user namespace.
> - Teach cap_capable to handle capabilities in a non-default
>   user namespace.
> 
> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> ---
>  include/linux/capability.h |    6 ++++--
>  include/linux/security.h   |   12 +++++++-----
>  kernel/capability.c        |   22 ++++++++++++++++++++--
>  security/commoncap.c       |   40 +++++++++++++++++++++++++++++++++-------
>  security/security.c        |   12 ++++++------
>  security/selinux/hooks.c   |   14 +++++++++-----
>  6 files changed, 79 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 39e5ff5..89572b2 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -544,7 +544,7 @@ extern const kernel_cap_t __cap_init_eff_set;
>   *
>   * Note that this does not set PF_SUPERPRIV on the task.
>   */
> -#define has_capability(t, cap) (security_real_capable((t), (cap)) == 0)
> +#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
> 
>  /**
>   * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
> @@ -558,9 +558,11 @@ extern const kernel_cap_t __cap_init_eff_set;
>   * Note that this does not set PF_SUPERPRIV on the task.
>   */
>  #define has_capability_noaudit(t, cap) \
> -	(security_real_capable_noaudit((t), (cap)) == 0)
> +	(security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0)
> 
> +struct user_namespace;
>  extern int capable(int cap);
> +extern int ns_capable(struct user_namespace *ns, int cap);
> 
>  /* audit system wants to get cap info from files as well */
>  struct dentry;
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 2c627d3..f44932f 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -45,13 +45,14 @@
> 
>  struct ctl_table;
>  struct audit_krule;
> +struct user_namespace;
> 
>  /*
>   * These functions are in security/capability.c and are used
>   * as the default capabilities functions
>   */
>  extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
> -		       int cap, int audit);
> +		       struct user_namespace *ns, int cap, int audit);
>  extern int cap_settime(struct timespec *ts, struct timezone *tz);
>  extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
>  extern int cap_ptrace_traceme(struct task_struct *parent);
> @@ -1327,6 +1328,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
>   *	credentials.
>   *	@tsk contains the task_struct for the process.
>   *	@cred contains the credentials to use.
> + *      @ns contains the user namespace we want the capability in
>   *	@cap contains the capability <include/linux/capability.h>.
>   *	@audit: Whether to write an audit message or not
>   *	Return 0 if the capability is granted for @tsk.
> @@ -1457,7 +1459,7 @@ struct security_operations {
>  		       const kernel_cap_t *inheritable,
>  		       const kernel_cap_t *permitted);
>  	int (*capable) (struct task_struct *tsk, const struct cred *cred,
> -			int cap, int audit);
> +			struct user_namespace *ns, int cap, int audit);
>  	int (*acct) (struct file *file);
>  	int (*sysctl) (struct ctl_table *table, int op);
>  	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
> @@ -1754,9 +1756,9 @@ int security_capset(struct cred *new, const struct cred *old,
>  		    const kernel_cap_t *effective,
>  		    const kernel_cap_t *inheritable,
>  		    const kernel_cap_t *permitted);
> -int security_capable(int cap);
> -int security_real_capable(struct task_struct *tsk, int cap);
> -int security_real_capable_noaudit(struct task_struct *tsk, int cap);
> +int security_capable(struct user_namespace *ns, int cap);
> +int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap);
> +int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap);
>  int security_acct(struct file *file);
>  int security_sysctl(struct ctl_table *table, int op);
>  int security_quotactl(int cmds, int type, int id, struct super_block *sb);
> diff --git a/kernel/capability.c b/kernel/capability.c
> index 7f876e6..63dcf53 100644
> --- a/kernel/capability.c
> +++ b/kernel/capability.c
> @@ -14,6 +14,7 @@
>  #include <linux/security.h>
>  #include <linux/syscalls.h>
>  #include <linux/pid_namespace.h>
> +#include <linux/user_namespace.h>
>  #include <asm/uaccess.h>
>  #include "cred-internals.h"
> 
> @@ -302,15 +303,32 @@ error:
>   */
>  int capable(int cap)
>  {
> +	return ns_capable(&init_user_ns, cap);
> +}
> +EXPORT_SYMBOL(capable);
> +
> +/**
> + * ns_capable - Determine if the current task has a superior capability in effect
> + * @ns:  The usernamespace we want the capability in
> + * @cap: The capability to be tested for
> + *
> + * Return true if the current task has the given superior capability currently
> + * available for use, false if not.
> + *
> + * This sets PF_SUPERPRIV on the task if the capability is available on the
> + * assumption that it's about to be used.
> + */
> +int ns_capable(struct user_namespace *ns, int cap)
> +{
>  	if (unlikely(!cap_valid(cap))) {
>  		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
>  		BUG();
>  	}
> 
> -	if (security_capable(cap) == 0) {
> +	if (security_capable(ns, cap) == 0) {
>  		current->flags |= PF_SUPERPRIV;
>  		return 1;
>  	}
>  	return 0;
>  }
> -EXPORT_SYMBOL(capable);
> +EXPORT_SYMBOL(ns_capable);
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 34500e3..ffde5be 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -27,6 +27,7 @@
>  #include <linux/sched.h>
>  #include <linux/prctl.h>
>  #include <linux/securebits.h>
> +#include <linux/user_namespace.h>
> 
>  /*
>   * If a non-root user executes a setuid-root binary in
> @@ -68,6 +69,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
>   * cap_capable - Determine whether a task has a particular effective capability
>   * @tsk: The task to query
>   * @cred: The credentials to use
> + * @ns:  The user namespace in which we need the capability
>   * @cap: The capability to check for
>   * @audit: Whether to write an audit message or not
>   *
> @@ -79,10 +81,32 @@ EXPORT_SYMBOL(cap_netlink_recv);
>   * cap_has_capability() returns 0 when a task has a capability, but the
>   * kernel's capable() and has_capability() returns 1 for this case.
>   */
> -int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
> -		int audit)
> +int cap_capable(struct task_struct *tsk, const struct cred *cred,
> +		struct user_namespace *targ_ns, int cap, int audit)
>  {
> -	return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> +	for (;;) {
> +		/* Do we have the necessary capabilities? */
> +		if (targ_ns == cred->user->user_ns)
> +			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> +	
> +		/* The creator of the user namespace has all caps. */
> +		if (targ_ns->creator == cred->user)
> +			return 0;
> +	
> +		/* Have we tried all of the parent namespaces? */
> +		if (targ_ns == &init_user_ns)
> +			return -EPERM;
> +	
> +		/* If you have the capability in a parent user ns you have it
> +		 * in the over all children user namespaces as well, so see
> +		 * if this process has the capability in the parent user
> +		 * namespace.
> +		 */
> +		targ_ns = targ_ns->creator->user_ns;
> +	}
> +	
> +	/* We never get here */
> +	return -EPERM;                
>  }
> 
>  /**
> @@ -177,7 +201,8 @@ static inline int cap_inh_is_capped(void)
>  	/* they are so limited unless the current task has the CAP_SETPCAP
>  	 * capability
>  	 */
> -	if (cap_capable(current, current_cred(), CAP_SETPCAP,
> +	if (cap_capable(current, current_cred(),
> +			current_cred()->user->user_ns, CAP_SETPCAP,
>  			SECURITY_CAP_AUDIT) == 0)
>  		return 0;
>  	return 1;
> @@ -832,7 +857,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
>  		     & (new->securebits ^ arg2))			/*[1]*/
>  		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
>  		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
> -		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
> +		    || (cap_capable(current, current_cred(),
> +				    current_cred()->user->user_ns, CAP_SETPCAP,
>  				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
>  			/*
>  			 * [1] no changing of bits that are locked
> @@ -910,7 +936,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
>  {
>  	int cap_sys_admin = 0;
> 
> -	if (cap_capable(current, current_cred(), CAP_SYS_ADMIN,
> +	if (cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_ADMIN,
>  			SECURITY_CAP_NOAUDIT) == 0)
>  		cap_sys_admin = 1;
>  	return __vm_enough_memory(mm, pages, cap_sys_admin);
> @@ -937,7 +963,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
>  	int ret = 0;
> 
>  	if (addr < dac_mmap_min_addr) {
> -		ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO,
> +		ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO,
>  				  SECURITY_CAP_AUDIT);
>  		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
>  		if (ret == 0)
> diff --git a/security/security.c b/security/security.c
> index 24e060b..ad75427 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -155,30 +155,30 @@ int security_capset(struct cred *new, const struct cred *old,
>  				    effective, inheritable, permitted);
>  }
> 
> -int security_capable(int cap)
> +int security_capable(struct user_namespace *ns, int cap)
>  {
> -	return security_ops->capable(current, current_cred(), cap,
> +	return security_ops->capable(current, current_cred(), ns, cap,
>  				     SECURITY_CAP_AUDIT);
>  }
> 
> -int security_real_capable(struct task_struct *tsk, int cap)
> +int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
>  {
>  	const struct cred *cred;
>  	int ret;
> 
>  	cred = get_task_cred(tsk);
> -	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_AUDIT);
> +	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_AUDIT);
>  	put_cred(cred);
>  	return ret;
>  }
> 
> -int security_real_capable_noaudit(struct task_struct *tsk, int cap)
> +int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
>  {
>  	const struct cred *cred;
>  	int ret;
> 
>  	cred = get_task_cred(tsk);
> -	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_NOAUDIT);
> +	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_NOAUDIT);
>  	put_cred(cred);
>  	return ret;
>  }
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index bd77a2b..a69f97d 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -76,6 +76,7 @@
>  #include <linux/selinux.h>
>  #include <linux/mutex.h>
>  #include <linux/posix-timers.h>
> +#include <linux/user_namespace.h>
> 
>  #include "avc.h"
>  #include "objsec.h"
> @@ -1480,6 +1481,7 @@ static int current_has_perm(const struct task_struct *tsk,
>  /* Check whether a task is allowed to use a capability. */
>  static int task_has_capability(struct task_struct *tsk,
>  			       const struct cred *cred,
> +			       struct user_namespace *ns,
>  			       int cap, int audit)
>  {
>  	struct common_audit_data ad;
> @@ -1927,15 +1929,15 @@ static int selinux_capset(struct cred *new, const struct cred *old,
>   */
> 
>  static int selinux_capable(struct task_struct *tsk, const struct cred *cred,
> -			   int cap, int audit)
> +			   struct user_namespace *ns, int cap, int audit)
>  {
>  	int rc;
> 
> -	rc = cap_capable(tsk, cred, cap, audit);
> +	rc = cap_capable(tsk, cred, ns, cap, audit);
>  	if (rc)
>  		return rc;
> 
> -	return task_has_capability(tsk, cred, cap, audit);
> +	return task_has_capability(tsk, cred, ns, cap, audit);
>  }
> 
>  static int selinux_sysctl_get_sid(ctl_table *table, u16 tclass, u32 *sid)
> @@ -2091,7 +2093,8 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
>  {
>  	int rc, cap_sys_admin = 0;
> 
> -	rc = selinux_capable(current, current_cred(), CAP_SYS_ADMIN,
> +	rc = selinux_capable(current, current_cred(),
> +			     &init_user_ns, CAP_SYS_ADMIN,
>  			     SECURITY_CAP_NOAUDIT);
>  	if (rc == 0)
>  		cap_sys_admin = 1;
> @@ -2889,7 +2892,8 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
>  	 * and lack of permission just means that we fall back to the
>  	 * in-core context value, not a denial.
>  	 */
> -	error = selinux_capable(current, current_cred(), CAP_MAC_ADMIN,
> +	error = selinux_capable(current, current_cred(),
> +				&init_user_ns, CAP_MAC_ADMIN,
>  				SECURITY_CAP_NOAUDIT);
>  	if (!error)
>  		error = security_sid_to_context_force(isec->sid, &context,
> -- 
> 1.6.5.2.143.g8cc62

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]     ` <m17hrv18ad.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  2010-01-06 17:35       ` Serge E. Hallyn
@ 2010-01-06 20:17       ` Serge E. Hallyn
       [not found]         ` <20100106201725.GA24242-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-02-15  4:05       ` Serge E. Hallyn
  2 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-01-06 20:17 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > So i was thinking about how to safely but incrementally introduce
> > targeted capabilities - which we decided was a prereq to making VFS
> > handle user namespaces - and the following seemed doable.  My main
> > motivations were (in order):
> >
> >         1. don't make any unconverted capable() checks unsafe
> >         2. minimize performance impact on non-container case
> >         3. minimize performance impact on containers
> 
> My motivation is a bit different.  I would like to get to the
> unprivileged creation of new namespaces.  It looks like this gets us
> 90% of the way there, with only potential uid confusion issues left.
> 
> I still need to handle getting all caps after creation but otherwise I
> think I have a good starter patch that achieves all of your goals.
> 
> Of course kill_permission needs the checks you have suggested as well.
>
> Eric
> 
> 
> >From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
> From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> Date: Wed, 6 Jan 2010 08:26:21 -0800
> Subject: [PATCH] security:  Make capabilities relative to the user namespace.
> 
> - Introduce ns_capable to test for a capability in a non-default
>   user namespace.
> - Teach cap_capable to handle capabilities in a non-default
>   user namespace.

So yeah, I didn't address the whole has_capability junk.  Feh.

So do you intend to tag all namespaces with the userns which
created it?  So sys_hostname() can check utsname->uts_ns->creator,
and net ioctl SIOCSIFNAME checks struct net->creator?

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]         ` <20100106173056.GC15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-01-06 20:43           ` Eric W. Biederman
  0 siblings, 0 replies; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-06 20:43 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> But that's only if fred has CAP_KILL in a user namespace which is
> ancestor to joe's process.  Only fred's processes in a child
> userns should have CAP_KILL.

Got it.  What I don't see in your implementation is how you can kill a
child that is in its own user namespace if you don't have CAP_KILL.

>> Which matters because we can set the hostname through /proc/sys....
>
> Oh, right.  However, utsname doesn't have a creator, and we won't always
> want to use user namespaces to authorize.  For instance, for CAP_NET_ADMIN
> we'll want to compare the net_ns.  That's why I had the switch inside
> capable_to() based on ns type.

I disagree.  For CAP_NET_ADMIN we will want to do:
ns_capable(net->userns, CAP_NET_ADMIN);

Network namespaces do not have a hierarchy so I don't see how they
would be useful in this context.

When we add an unprivileged unshare it is trivial to capture either
the creator or at least the creator's user namespace, giving us a
user namespace to compare against.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]         ` <20100106173536.GD15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-01-06 20:57           ` Eric W. Biederman
  0 siblings, 0 replies; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-06 20:57 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
>> 
>> > So i was thinking about how to safely but incrementally introduce
>> > targeted capabilities - which we decided was a prereq to making VFS
>> > handle user namespaces - and the following seemed doable.  My main
>> > motivations were (in order):
>> >
>> >         1. don't make any unconverted capable() checks unsafe
>> >         2. minimize performance impact on non-container case
>> >         3. minimize performance impact on containers
>> 
>> My motivation is a bit different.  I would like to get to the
>> unprivileged creation of new namespaces.  It looks like this gets us
>> 90% of the way there, with only potential uid confusion issues left.
>
> Yup, that was actually what I was thinking about last night when I decided
> to give it a shot :)  IMO, my patch + a dummy version of user_namespaces
> for vfs (done in a clean way that can be an incremental step toward full
> vfs userns support - which I haven't yet thought through) is enough to
> give you safe fully unprivileged containers.  Now with the API I have,
> you'd have a program with either setuid-root or cap_sys_admin,cap_setpcap=pe
> which does the prctl and the unshares, but it would theoretically be safe
> to hand that program to unprivileged users.

Yes.

>> I still need to handle getting all caps after creation but otherwise I
>> think I have a good starter patch that achieves all of your goals.
>
> Well in my patch we don't need to clear out the bounding set, or set
> SETUID_NOROOT - so running a setuid root program or becoming root should
> still give you capabilities!  They'll just be targeted at your container.
>
> I really think this is what you need.

Yes.  So far things don't look too hard.  What I meant is that after
CLONE_USERNS you should become uid 0 with a full set of capabilities in
a new user namespace.  Those capabilities aren't good for anything because
they are user namespace relative.

I believe we have a bug today where the new uid 0 does not have a full set
of capabilities, but that it is hidden because only uid 0 can unshare
the user namespace.

>> Of course kill_permission needs the checks you have suggested as well.
>
> Ok, I can't look at your patch in detail right now and don't quite get
> where you're going with a quick glance, so will look in closer detail
> later.   Will also think about a way to get "just-enough" vfs userns
> support to completely give you what you need for privileged users in
> unprivileged containers.

Sounds good.  That uid 0 problem is particularly interesting, because half
the world is owned by uid 0.

As for my patch.  The heart of it is the cap_capable implementation.
The rest is just the obvious consequences of adding a user_namespace parameter
to a security->capable().

/*
 * cap_capable - check whether @cred holds @cap relative to @targ_ns.
 *
 * Starting at @targ_ns, walk upward through the chain of user namespaces
 * (each step goes to the namespace of the current one's creator) and
 * decide at each level:
 *
 *   - the level is the credential's own user namespace: the answer is
 *     whatever @cred->cap_effective says about @cap;
 *   - the level was created by the credential's user: granted;
 *   - the level is init_user_ns and neither of the above matched: denied.
 *
 * @tsk and @audit are accepted for the hook signature but are not
 * consulted by this function.
 *
 * Returns 0 when the capability is granted, -EPERM otherwise.
 */
int cap_capable(struct task_struct *tsk, const struct cred *cred,
		struct user_namespace *targ_ns, int cap, int audit)
{
	struct user_namespace *ns;

	/* The step expression only runs after all three checks have failed. */
	for (ns = targ_ns; ; ns = ns->creator->user_ns) {
		/* Reached our own namespace: the effective set decides. */
		if (ns == cred->user->user_ns) {
			if (cap_raised(cred->cap_effective, cap))
				return 0;
			return -EPERM;
		}

		/* Whoever created this namespace holds every cap in it. */
		if (ns->creator == cred->user)
			return 0;

		/* Top of the hierarchy and still no match: give up. */
		if (ns == &init_user_ns)
			return -EPERM;

		/*
		 * A capability held in a parent user namespace also applies
		 * to its children, so retry one level up.
		 */
	}
}

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]         ` <20100106201725.GA24242-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-01-06 21:11           ` Eric W. Biederman
       [not found]             ` <m1skajszuw.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  2010-02-15 14:27           ` Matt Helsley
  1 sibling, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-06 21:11 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

>> - Introduce ns_capable to test for a capability in a non-default
>>   user namespace.
>> - Teach cap_capable to handle capabilities in a non-default
>>   user namespace.
>
> So yeah, I didn't address the whole has_capability junk.  Feh.

That just fell out...

> So do you intend to tag all namespaces with the userns which
> created it?  So sys_hostname() can check utsname->uts_ns->creator,
> and net ioctl SIOCSIFNAME checks struct net->creator?

That is the plan.  Add a creator/usernamespace as part of the patches
to support creating a new namespace without the global CAP_SYS_ADMIN.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]             ` <m1skajszuw.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2010-01-06 21:57               ` Serge E. Hallyn
       [not found]                 ` <20100106215721.GA5823-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-01-06 21:57 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> >> - Introduce ns_capable to test for a capability in a non-default
> >>   user namespace.
> >> - Teach cap_capable to handle capabilities in a non-default
> >>   user namespace.
> >
> > So yeah, I didn't address the whole has_capability junk.  Feh.
> 
> That just fell out...
> 
> > So do you intend to tag all namespaces with the userns which
> > created it?  So sys_hostname() can check utsname->uts_ns->creator,
> > and net ioctl SIOCSIFNAME checks struct net->creator?
> 
> That is the plan.  Add a creator/usernamespace as part of the patches
> to support creating a new namespace without the global CAP_SYS_ADMIN.

Cool - are you working on that right now, or should I start it myself
if i'm bored and restless tonight?

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]                 ` <20100106215721.GA5823-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-01-07  0:16                   ` Eric W. Biederman
  0 siblings, 0 replies; 18+ messages in thread
From: Eric W. Biederman @ 2010-01-07  0:16 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
>> 
>> >> - Introduce ns_capable to test for a capability in a non-default
>> >>   user namespace.
>> >> - Teach cap_capable to handle capabilities in a non-default
>> >>   user namespace.
>> >
>> > So yeah, I didn't address the whole has_capability junk.  Feh.
>> 
>> That just fell out...
>> 
>> > So do you intend to tag all namespaces with the userns which
>> > created it?  So sys_hostname() can check utsname->uts_ns->creator,
>> > and net ioctl SIOCSIFNAME checks struct net->creator?
>> 
>> That is the plan.  Add a creator/usernamespace as part of the patches
>> to support creating a new namespace without the global CAP_SYS_ADMIN.
>
> Cool - are you working on that right now, or should I start it myself
> if i'm bored and restless tonight?

Go for it.  I have some things to wrap up before I can do much with this,
despite the fact I want to.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]     ` <m17hrv18ad.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  2010-01-06 17:35       ` Serge E. Hallyn
  2010-01-06 20:17       ` Serge E. Hallyn
@ 2010-02-15  4:05       ` Serge E. Hallyn
       [not found]         ` <20100215040529.GA20519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2010-02-15  4:05 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > So i was thinking about how to safely but incrementally introduce
> > targeted capabilities - which we decided was a prereq to making VFS
> > handle user namespaces - and the following seemed doable.  My main
> > motivations were (in order):
> >
> >         1. don't make any unconverted capable() checks unsafe
> >         2. minimize performance impact on non-container case
> >         3. minimize performance impact on containers
> 
> My motivation is a bit different.  I would like to get to the
> unprivileged creation of new namespaces.  It looks like this gets us
> 90% of the way there, with only potential uid confusion issues left.

Just a pair of instances of uid comparison are now addressed in

	http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/sergeh/linux-cr.git;a=shortlog;h=refs/heads/feb13.userns.uid_equivs

which has your patch "taking a crack at targeted capabilities" at its
core.  Talk about your baby steps...  But I need to go back and re-read
what we'd discussed over the last few years about how we wanted to
tag superblocks/mounts->inodes before I go on.

Anyway now uid equivalence checks are ns-aware for basic vfs_permission
and task kill at least.  It's a start.

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]         ` <20100215040529.GA20519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-02-15 11:06           ` Eric W. Biederman
       [not found]             ` <m1ocjqep25.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2010-02-15 11:06 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
>> 
>> > So i was thinking about how to safely but incrementally introduce
>> > targeted capabilities - which we decided was a prereq to making VFS
>> > handle user namespaces - and the following seemed doable.  My main
>> > motivations were (in order):
>> >
>> >         1. don't make any unconverted capable() checks unsafe
>> >         2. minimize performance impact on non-container case
>> >         3. minimize performance impact on containers
>> 
>> My motivation is a bit different.  I would like to get to the
>> unprivileged creation of new namespaces.  It looks like this gets us
>> 90% of the way there, with only potential uid confusion issues left.
>
> Just a pair of instances of uid comparison are now addressed in
>
> 	http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/sergeh/linux-cr.git;a=shortlog;h=refs/heads/feb13.userns.uid_equivs
>
> which has your patch "taking a crack at targeted capabilities" at its
> core.  Talk about your baby steps...  But I need to go back and re-read
> what we'd discussed over the last few years about how we wanted to
> tag superblocks/mounts->inodes before I go on.
>
> Anyway now uid equivalence checks are ns-aware for basic vfs_permission
> and task kill at least.  It's a start.

Thanks for keeping this alive.

I took a quick skim through your patches and things look a little rough
(you are patching your patches) but it looks like you are wrapping your
head around the ideas pretty well, and the ns_capable etc seem to be working.
Hooray!

The big idea was that the generic filesystem interface would speak multiple
uid namespaces, and the generic default would do something simple and pick
a single namespace for all of the comparisons to be against.  Then we would
have a generic library for filesystems to implement mount options describing
how they wanted to map uids in different namespaces into what they could
store on the filesystem.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]         ` <20100106201725.GA24242-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-01-06 21:11           ` Eric W. Biederman
@ 2010-02-15 14:27           ` Matt Helsley
       [not found]             ` <20100215142746.GD3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Matt Helsley @ 2010-02-15 14:27 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, Eric W. Biederman

On Wed, Jan 06, 2010 at 02:17:25PM -0600, Serge E. Hallyn wrote:
> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> > "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

<snip>

> > >From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
> > From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> > Date: Wed, 6 Jan 2010 08:26:21 -0800
> > Subject: [PATCH] security:  Make capabilities relative to the user namespace.
> > 
> > - Introduce ns_capable to test for a capability in a non-default
> >   user namespace.
> > - Teach cap_capable to handle capabilities in a non-default
> >   user namespace.
> 
> So yeah, I didn't address the whole has_capability junk.  Feh.
> 
> So do you intend to tag all namespaces with the userns which
> created it?  So sys_hostname() can check utsname->uts_ns->creator,
> and net ioctl SIOCSIFNAME checks struct net->creator?

That makes sense but I'm getting worried about the way those extra
namespace references are popping up in other namespace structs. Seems
like it would be easy to write code that could create reference
cycles and thus leak memory. Perhaps it will require splitting the
references sort of like struct mm_struct?

The other example of that idea was keeping a syslog_ns reference in
the netns for the iptables printks in ipt_LOG.c. What happens when
one of the CONFIG_*NS options isn't selected? Suddenly we're littering
the struct definitions with #ifdefs and making the code a lot more
complicated to test (I suspect). Perhaps it's time to merge all
the CONFIG_*NS options into CONFIG_NAMESPACES?

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]             ` <20100215142746.GD3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2010-02-15 16:16               ` Eric W. Biederman
       [not found]                 ` <m13a12bhjq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2010-02-15 16:16 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Linux Containers

Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> On Wed, Jan 06, 2010 at 02:17:25PM -0600, Serge E. Hallyn wrote:
>> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> > "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
>
> <snip>
>
>> > >From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
>> > From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> > Date: Wed, 6 Jan 2010 08:26:21 -0800
>> > Subject: [PATCH] security:  Make capabilities relative to the user namespace.
>> > 
>> > - Introduce ns_capable to test for a capability in a non-default
>> >   user namespace.
>> > - Teach cap_capable to handle capabilities in a non-default
>> >   user namespace.
>> 
>> So yeah, I didn't address the whole has_capability junk.  Feh.
>> 
>> So do you intend to tag all namespaces with the userns which
>> created it?  So sys_hostname() can check utsname->uts_ns->creator,
>> and net ioctl SIOCSIFNAME checks struct net->creator?
>
> That makes sense but I'm getting worried about the way those extra
> namespace references are popping up in other namespace structs. Seems
> like it would be easy to write code that could create reference
> cycles and thus leak memory. Perhaps it will require splitting the
> references sort of like struct mm_struct?

Not yet.  If we only grab references at namespace creation time
reference cycles are impossible, at least reference cycles outside
of the initial namespaces.

> The other example of that idea was keeping a syslog_ns reference in
> the netns for the iptables printks in ipt_LOG.c. What happens when
> one of the CONFIG_*NS options isn't selected? Suddenly we're littering
> the struct definitions with #ifdefs and making the code a lot more
> complicated to test (I suspect). Perhaps it's time to merge all
> the CONFIG_*NS options into CONFIG_NAMESPACES?

Truthfully I am dubious about the syslog namespace.  Certainly the
implementations I have seen so far seem half thought out.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]                 ` <m13a12bhjq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2010-02-15 16:37                   ` Matt Helsley
       [not found]                     ` <20100215163708.GG3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Matt Helsley @ 2010-02-15 16:37 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

On Mon, Feb 15, 2010 at 08:16:41AM -0800, Eric W. Biederman wrote:
> Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > On Wed, Jan 06, 2010 at 02:17:25PM -0600, Serge E. Hallyn wrote:
> >> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> >> > "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> >
> > <snip>
> >
> >> > >From db104af741b5f0a2f128688905498cae68fbbde2 Mon Sep 17 00:00:00 2001
> >> > From: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> >> > Date: Wed, 6 Jan 2010 08:26:21 -0800
> >> > Subject: [PATCH] security:  Make capabilities relative to the user namespace.
> >> > 
> >> > - Introduce ns_capable to test for a capability in a non-default
> >> >   user namespace.
> >> > - Teach cap_capable to handle capabilities in a non-default
> >> >   user namespace.
> >> 
> >> So yeah, I didn't address the whole has_capability junk.  Feh.
> >> 
> >> So do you intend to tag all namespaces with the userns which
> >> created it?  So sys_hostname() can check utsname->uts_ns->creator,
> >> and net ioctl SIOCSIFNAME checks struct net->creator?
> >
> > That makes sense but I'm getting worried about the way those extra
> > namespace references are popping up in other namespace structs. Seems
> > like it would be easy to write code that could create reference
> > cycles and thus leak memory. Perhaps it will require splitting the
> > references sort of like struct mm_struct?
> 
> Not yet.  If we only grab references at namespace creation time
> reference cycles are impossible, at least reference cycles outside
> of the initial namespaces.

Ahh, good point.

It occurs to me that one nice thing about sticking those references in
is the code will roughly document the fact that these types of namespaces
are related.

> > The other example of that idea was keeping a syslog_ns reference in
> > the netns for the iptables printks in ipt_LOG.c. What happens when
> > one of the CONFIG_*NS options isn't selected? Suddenly we're littering
> > the struct definitions with #ifdefs and making the code a lot more
> > complicated to test (I suspect). Perhaps it's time to merge all
> > the CONFIG_*NS options into CONFIG_NAMESPACES?
> 
> Truthfully I am dubious about the syslog namespace.  Certainly the
> implementations I have seen so far seem half thought out.

Yeah. I agree that it's not quite clear what's needed. But that doesn't
really address my points in the second paragraph above (the first
paragraph was answered nicely, thanks!).

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]                     ` <20100215163708.GG3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2010-02-15 16:48                       ` Eric W. Biederman
  0 siblings, 0 replies; 18+ messages in thread
From: Eric W. Biederman @ 2010-02-15 16:48 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Linux Containers

Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

>> > The other example of that idea was keeping a syslog_ns reference in
>> > the netns for the iptables printks in ipt_LOG.c. What happens when
>> > one of the CONFIG_*NS options isn't selected? Suddenly we're littering
>> > the struct definitions with #ifdefs and making the code a lot more
>> > complicated to test (I suspect). Perhaps it's time to merge all
>> > the CONFIG_*NS options into CONFIG_NAMESPACES?

In general the plan has been to support disabling the creation of namespaces
but that is about it.  The disables are there to prevent sysadmins from
dealing with under-construction code, as in general we can not remove the code
without having lots of weird paths.

Distros will enable these, and the incremental cost of having them enabled
is small, at least if they are built properly.  If the incremental cost
of enabling a namespace is not small we probably need to go back to the
drawing board because maintainability will be affected.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] RFC: taking a crack at targeted capabilities
       [not found]             ` <m1ocjqep25.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2010-02-16 22:07               ` Serge E. Hallyn
  0 siblings, 0 replies; 18+ messages in thread
From: Serge E. Hallyn @ 2010-02-16 22:07 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> >> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> >> 
> >> > So i was thinking about how to safely but incrementally introduce
> >> > targeted capabilities - which we decided was a prereq to making VFS
> >> > handle user namespaces - and the following seemed doable.  My main
> >> > motivations were (in order):
> >> >
> >> >         1. don't make any unconverted capable() checks unsafe
> >> >         2. minimize performance impact on non-container case
> >> >         3. minimize performance impact on containers
> >> 
> >> My motivation is a bit different.  I would like to get to the
> >> unprivileged creation of new namespaces.  It looks like this gets us
> >> 90% of the way there, with only potential uid confusion issues left.
> >
> > Just a pair of instances of uid comparison are now addressed in
> >
> > 	http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/sergeh/linux-cr.git;a=shortlog;h=refs/heads/feb13.userns.uid_equivs
> >
> > which has your patch "taking a crack at targeted capabilities" at its
> > core.  Talk about your baby steps...  But I need to go back and re-read
> > what we'd discussed over the last few years about how we wanted to
> > tag superblocks/mounts->inodes before I go on.
> >
> > Anyway now uid equivalence checks are ns-aware for basic vfs_permission
> > and task kill at least.  It's a start.
> 
> Thanks for keeping this alive.
> 
> I took a quick skim through your patches and things look a little rough
> (you are patching your patches) but it looks like you are wrapping your

Oh!  I see what happened.  I had two patches sitting on top of my local
master branch, switched to an experimental branch and did the same
patches plus others plus fixes, then rebased on top of my messed-up
local master instead of origin/master.  So 
	"check user namespace for task->file uid equivalence."
shows up twice, once messed-up, and once as just a fix on top of the
messed up one.

Wow.

> head around the ideas pretty well, and the ns_capable etc seem to be working.
> Hooray!
> 
> The big idea was that the generic filesystem interface would speak multiple
> uid namespaces, and the generic default would do something simple and pick
> a single namespace for all of the comparisons to be against.  Then we would
> have a generic library for filesystem to implement mount options describing
> how they wanted to map uids in different namespaces into what they could
> store on the filesystem.
> 
> Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2010-02-16 22:07 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-06  6:28 [PATCH 1/1] RFC: taking a crack at targeted capabilities Serge E. Hallyn
     [not found] ` <20100106062809.GA17064-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-06 15:44   ` Eric W. Biederman
     [not found]     ` <m13a2j2q7c.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2010-01-06 17:30       ` Serge E. Hallyn
     [not found]         ` <20100106173056.GC15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-06 20:43           ` Eric W. Biederman
2010-01-06 16:56   ` Eric W. Biederman
     [not found]     ` <m17hrv18ad.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2010-01-06 17:35       ` Serge E. Hallyn
     [not found]         ` <20100106173536.GD15784-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-06 20:57           ` Eric W. Biederman
2010-01-06 20:17       ` Serge E. Hallyn
     [not found]         ` <20100106201725.GA24242-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-06 21:11           ` Eric W. Biederman
     [not found]             ` <m1skajszuw.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2010-01-06 21:57               ` Serge E. Hallyn
     [not found]                 ` <20100106215721.GA5823-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-01-07  0:16                   ` Eric W. Biederman
2010-02-15 14:27           ` Matt Helsley
     [not found]             ` <20100215142746.GD3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2010-02-15 16:16               ` Eric W. Biederman
     [not found]                 ` <m13a12bhjq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2010-02-15 16:37                   ` Matt Helsley
     [not found]                     ` <20100215163708.GG3714-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2010-02-15 16:48                       ` Eric W. Biederman
2010-02-15  4:05       ` Serge E. Hallyn
     [not found]         ` <20100215040529.GA20519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-02-15 11:06           ` Eric W. Biederman
     [not found]             ` <m1ocjqep25.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2010-02-16 22:07               ` Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.