All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] cr: credentials
@ 2009-05-19  1:44 Serge E. Hallyn
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
       [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 2 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:44 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Here is my latest version of the task credentials c/r patchset.
The last patch isn't meant to go upstream - it just helped me
to straighten out the refcounting, to the point where nested
user namespace c/r now appears to be robust.

thanks,
-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/6] cr: break out new_user_ns()
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-19  1:45   ` Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 2/6] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Break out the core function which checks privilege and (if
allowed) creates a new user namespace, with the passed-in
creating user_struct.  Note that a user_namespace, unlike
other namespace pointers, is not stored in the nsproxy.
Rather it is purely a property of user_structs.

This will let us keep the task restore code simpler.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/user_namespace.h |    8 ++++++
 kernel/user_namespace.c        |   53 ++++++++++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index cc4f453..a2b82d5 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -20,6 +20,8 @@ extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
 
+struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot);
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -38,6 +40,12 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #else
 
+static inline struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
+{
+	return -EINVAL;
+}
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8..e624b0f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,15 +11,8 @@
 #include <linux/user_namespace.h>
 #include <linux/cred.h>
 
-/*
- * Create a new user namespace, deriving the creator from the user in the
- * passed credentials, and replacing that user with the new root user for the
- * new namespace.
- *
- * This is called by copy_creds(), which will finish setting the target task's
- * credentials.
- */
-int create_user_ns(struct cred *new)
+static struct user_namespace *_new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
 {
 	struct user_namespace *ns;
 	struct user_struct *root_user;
@@ -27,7 +20,7 @@ int create_user_ns(struct cred *new)
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
 	if (!ns)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	kref_init(&ns->kref);
 
@@ -38,12 +31,43 @@ int create_user_ns(struct cred *new)
 	root_user = alloc_uid(ns, 0);
 	if (!root_user) {
 		kfree(ns);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	/* set the new root user in the credentials under preparation */
-	ns->creator = new->user;
-	new->user = root_user;
+	ns->creator = creator;
+
+	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
+	kref_set(&ns->kref, 1);
+
+	*newroot = root_user;
+	return ns;
+}
+
+struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	return _new_user_ns(creator, newroot);
+}
+
+/*
+ * Create a new user namespace, deriving the creator from the user in the
+ * passed credentials, and replacing that user with the new root user for the
+ * new namespace.
+ *
+ * This is called by copy_creds(), which will finish setting the target task's
+ * credentials.
+ */
+int create_user_ns(struct cred *new)
+{
+	struct user_namespace *ns;
+
+	ns = new_user_ns(new->user, &new->user);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
+
 	new->uid = new->euid = new->suid = new->fsuid = 0;
 	new->gid = new->egid = new->sgid = new->fsgid = 0;
 	put_group_info(new->group_info);
@@ -54,9 +78,6 @@ int create_user_ns(struct cred *new)
 #endif
 	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
 
-	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
-	kref_set(&ns->kref, 1);
-
 	return 0;
 }
 
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/6] cr: split core function out of some set*{u,g}id functions
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-19  1:45   ` [PATCH 1/6] cr: break out new_user_ns() Serge E. Hallyn
@ 2009-05-19  1:45   ` Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 3/6] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

When restarting tasks, we want to be able to change xuid and
xgid in a struct cred, and do so with security checks.  Break
the core functionality of set{fs,res}{u,g}id into cred_setX
which performs the access checks based on current_cred(),
but performs the requested change on a passed-in cred.

This will allow us to securely construct struct creds based
on a checkpoint image, constrained by the caller's permissions,
and apply them to the caller at the end of sys_restart().

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/cred.h |    8 +++
 kernel/cred.c        |  114 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c         |  134 ++++++++------------------------------------------
 3 files changed, 143 insertions(+), 113 deletions(-)

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 3282ee4..bc5ffc2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -20,6 +20,9 @@ struct user_struct;
 struct cred;
 struct inode;
 
+/* defined in sys.c, used in cred_setresuid */
+extern int set_user(struct cred *new);
+
 /*
  * COW Supplementary groups list
  */
@@ -343,4 +346,9 @@ do {						\
 	*(_fsgid) = __cred->fsgid;		\
 } while(0)
 
+int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid);
+int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, gid_t sgid);
+int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid);
+int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid);
+
 #endif /* _LINUX_CRED_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a03918..a017399 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -589,3 +589,117 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
+
+int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid)
+{
+	int retval;
+	const struct cred *old;
+
+	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+	old = current_cred();
+
+	if (!capable(CAP_SETUID)) {
+		if (ruid != (uid_t) -1 && ruid != old->uid &&
+		    ruid != old->euid  && ruid != old->suid)
+			return -EPERM;
+		if (euid != (uid_t) -1 && euid != old->uid &&
+		    euid != old->euid  && euid != old->suid)
+			return -EPERM;
+		if (suid != (uid_t) -1 && suid != old->uid &&
+		    suid != old->euid  && suid != old->suid)
+			return -EPERM;
+	}
+
+	if (ruid != (uid_t) -1) {
+		new->uid = ruid;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				return retval;
+		}
+	}
+	if (euid != (uid_t) -1)
+		new->euid = euid;
+	if (suid != (uid_t) -1)
+		new->suid = suid;
+	new->fsuid = new->euid;
+
+	return security_task_fix_setuid(new, old, LSM_SETID_RES);
+}
+
+int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid,
+			gid_t sgid)
+{
+	const struct cred *old = current_cred();
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETGID)) {
+		if (rgid != (gid_t) -1 && rgid != old->gid &&
+		    rgid != old->egid  && rgid != old->sgid)
+			return -EPERM;
+		if (egid != (gid_t) -1 && egid != old->gid &&
+		    egid != old->egid  && egid != old->sgid)
+			return -EPERM;
+		if (sgid != (gid_t) -1 && sgid != old->gid &&
+		    sgid != old->egid  && sgid != old->sgid)
+			return -EPERM;
+	}
+
+	if (rgid != (gid_t) -1)
+		new->gid = rgid;
+	if (egid != (gid_t) -1)
+		new->egid = egid;
+	if (sgid != (gid_t) -1)
+		new->sgid = sgid;
+	new->fsgid = new->egid;
+	return 0;
+}
+
+int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid)
+{
+	const struct cred *old;
+
+	old = current_cred();
+	*old_fsuid = old->fsuid;
+
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+		return -EPERM;
+
+	if (uid == old->uid  || uid == old->euid  ||
+	    uid == old->suid || uid == old->fsuid ||
+	    capable(CAP_SETUID)) {
+		if (uid != *old_fsuid) {
+			new->fsuid = uid;
+			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+				return 0;
+		}
+	}
+	return -EPERM;
+}
+
+int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid)
+{
+	const struct cred *old;
+
+	old = current_cred();
+	*old_fsgid = old->fsgid;
+
+	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
+		return -EPERM;
+
+	if (gid == old->gid  || gid == old->egid  ||
+	    gid == old->sgid || gid == old->fsgid ||
+	    capable(CAP_SETGID)) {
+		if (gid != *old_fsgid) {
+			new->fsgid = gid;
+			return 0;
+		}
+	}
+	return -EPERM;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf..fe5dcfe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -558,11 +558,12 @@ error:
 /*
  * change the user struct in a credentials set to match the new UID
  */
-static int set_user(struct cred *new)
+int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current_user_ns(), new->uid);
+	/* is this ok? */
+	new_user = alloc_uid(new->user->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
@@ -703,14 +704,12 @@ error:
 	return retval;
 }
 
-
 /*
  * This function implements a generic ability to update ruid, euid,
  * and suid.  This allows you to implement the 4.4 compatible seteuid().
  */
 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 {
-	const struct cred *old;
 	struct cred *new;
 	int retval;
 
@@ -718,45 +717,10 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 	if (!new)
 		return -ENOMEM;
 
-	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
-	if (retval)
-		goto error;
-	old = current_cred();
-
-	retval = -EPERM;
-	if (!capable(CAP_SETUID)) {
-		if (ruid != (uid_t) -1 && ruid != old->uid &&
-		    ruid != old->euid  && ruid != old->suid)
-			goto error;
-		if (euid != (uid_t) -1 && euid != old->uid &&
-		    euid != old->euid  && euid != old->suid)
-			goto error;
-		if (suid != (uid_t) -1 && suid != old->uid &&
-		    suid != old->euid  && suid != old->suid)
-			goto error;
-	}
+	retval = cred_setresuid(new, ruid, euid, suid);
+	if (retval == 0)
+		return commit_creds(new);
 
-	if (ruid != (uid_t) -1) {
-		new->uid = ruid;
-		if (ruid != old->uid) {
-			retval = set_user(new);
-			if (retval < 0)
-				goto error;
-		}
-	}
-	if (euid != (uid_t) -1)
-		new->euid = euid;
-	if (suid != (uid_t) -1)
-		new->suid = suid;
-	new->fsuid = new->euid;
-
-	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
-	if (retval < 0)
-		goto error;
-
-	return commit_creds(new);
-
-error:
 	abort_creds(new);
 	return retval;
 }
@@ -778,43 +742,17 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
  */
 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 {
-	const struct cred *old;
 	struct cred *new;
 	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
-	old = current_cred();
 
-	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
-	if (retval)
-		goto error;
+	retval = cred_setresgid(new, rgid, egid, sgid);
+	if (retval == 0)
+		return commit_creds(new);
 
-	retval = -EPERM;
-	if (!capable(CAP_SETGID)) {
-		if (rgid != (gid_t) -1 && rgid != old->gid &&
-		    rgid != old->egid  && rgid != old->sgid)
-			goto error;
-		if (egid != (gid_t) -1 && egid != old->gid &&
-		    egid != old->egid  && egid != old->sgid)
-			goto error;
-		if (sgid != (gid_t) -1 && sgid != old->gid &&
-		    sgid != old->egid  && sgid != old->sgid)
-			goto error;
-	}
-
-	if (rgid != (gid_t) -1)
-		new->gid = rgid;
-	if (egid != (gid_t) -1)
-		new->egid = egid;
-	if (sgid != (gid_t) -1)
-		new->sgid = sgid;
-	new->fsgid = new->egid;
-
-	return commit_creds(new);
-
-error:
 	abort_creds(new);
 	return retval;
 }
@@ -831,7 +769,6 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u
 	return retval;
 }
 
-
 /*
  * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
  * is used for "access()" and for the NFS daemon (letting nfsd stay at
@@ -840,35 +777,20 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u
  */
 SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 {
-	const struct cred *old;
 	struct cred *new;
 	uid_t old_fsuid;
+	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return current_fsuid();
-	old = current_cred();
-	old_fsuid = old->fsuid;
-
-	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
-		goto error;
 
-	if (uid == old->uid  || uid == old->euid  ||
-	    uid == old->suid || uid == old->fsuid ||
-	    capable(CAP_SETUID)) {
-		if (uid != old_fsuid) {
-			new->fsuid = uid;
-			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
-				goto change_okay;
-		}
-	}
-
-error:
-	abort_creds(new);
-	return old_fsuid;
+	retval = cred_setfsuid(new, uid, &old_fsuid);
+	if (retval == 0)
+		commit_creds(new);
+	else
+		abort_creds(new);
 
-change_okay:
-	commit_creds(new);
 	return old_fsuid;
 }
 
@@ -877,34 +799,20 @@ change_okay:
  */
 SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 {
-	const struct cred *old;
 	struct cred *new;
 	gid_t old_fsgid;
+	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return current_fsgid();
-	old = current_cred();
-	old_fsgid = old->fsgid;
-
-	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
-		goto error;
 
-	if (gid == old->gid  || gid == old->egid  ||
-	    gid == old->sgid || gid == old->fsgid ||
-	    capable(CAP_SETGID)) {
-		if (gid != old_fsgid) {
-			new->fsgid = gid;
-			goto change_okay;
-		}
-	}
-
-error:
-	abort_creds(new);
-	return old_fsgid;
+	retval = cred_setfsgid(new, gid, &old_fsgid);
+	if (retval == 0)
+		commit_creds(new);
+	else
+		abort_creds(new);
 
-change_okay:
-	commit_creds(new);
 	return old_fsgid;
 }
 
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 3/6] cr: capabilities: define checkpoint and restore fns
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-19  1:45   ` [PATCH 1/6] cr: break out new_user_ns() Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 2/6] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
@ 2009-05-19  1:45   ` Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 4/6] cr: checkpoint and restore task credentials Serge E. Hallyn
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

An application checkpoint image will store capability sets
(and the bounding set) as __u64s.  Define checkpoint and
restart functions to translate between those and kernel_cap_t's.

Define a common function do_capset_tocred() which applies capability
set changes to a passed-in struct cred.

The restore function uses do_capset_tocred() to apply the restored
capabilities to the struct cred being crafted, subject to the
current task's (task executing sys_restart()) permissions.

TODO: one day we'll want to c/r the securebits as well.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/capability.h |    5 ++
 kernel/capability.c        |   94 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c302110..572b5a0 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -536,6 +536,11 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+extern void checkpoint_save_cap(__u64 *dest, kernel_cap_t src);
+struct cred;
+extern int checkpoint_restore_cap(__u64 e, __u64 i, __u64 p, __u64 x,
+				struct cred *cred);
+
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041..d2c9bb3 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -217,6 +217,45 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	return ret;
 }
 
+static int do_capset_tocred(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted, struct cred *new)
+{
+	int ret;
+
+	ret = security_capset(new, current_cred(),
+			      effective, inheritable, permitted);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * for checkpoint-restart, do we want to wait until end of restart?
+	 * not sure we care */
+	audit_log_capset(current->pid, new, current_cred());
+
+	return 0;
+}
+
+static int do_capset(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted)
+{
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = do_capset_tocred(effective, inheritable, permitted, new);
+	if (ret < 0)
+		goto error;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return ret;
+}
+
 /**
  * sys_capset - set capabilities for a process or (*) a group of processes
  * @header: pointer to struct that contains capability version and
@@ -240,7 +279,6 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -271,22 +309,52 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
+	return do_capset(&effective, &inheritable, &permitted);
 
-	ret = security_capset(new, current_cred(),
-			      &effective, &inheritable, &permitted);
-	if (ret < 0)
-		goto error;
+}
 
-	audit_log_capset(pid, new, current_cred());
 
-	return commit_creds(new);
+void checkpoint_save_cap(__u64 *dest, kernel_cap_t src)
+{
+	*dest = src.cap[0] | (src.cap[1] << sizeof(__u32));
+}
 
-error:
-	abort_creds(new);
-	return ret;
+static void do_capbset_drop(struct cred *cred, int cap)
+{
+	cap_lower(cred->cap_bset, cap);
+}
+
+int checkpoint_restore_cap(__u64 newe, __u64 newi, __u64 newp, __u64 newx,
+			struct cred *cred)
+{
+	kernel_cap_t effective, inheritable, permitted, bset;
+	int may_dropbcap = capable(CAP_SETPCAP);
+	int ret, i;
+
+	effective.cap[0] = newe;
+	effective.cap[1] = (newe >> sizeof(__u32));
+	inheritable.cap[0] = newi;
+	inheritable.cap[1] = (newi >> sizeof(__u32));
+	permitted.cap[0] = newp;
+	permitted.cap[1] = (newp >> sizeof(__u32));
+	bset.cap[0] = newx;
+	bset.cap[1] = (newx >> sizeof(__u32));
+
+	ret = do_capset_tocred(&effective, &inheritable, &permitted, cred);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < CAP_LAST_CAP; i++) {
+		if (cap_raised(bset, i))
+			continue;
+		if (!cap_raised(current_cred()->cap_bset, i))
+			continue;
+		if (!may_dropbcap)
+			return -EPERM;
+		do_capbset_drop(cred, i);
+	}
+
+	return 0;
 }
 
 /**
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-05-19  1:45   ` [PATCH 3/6] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
@ 2009-05-19  1:45   ` Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 5/6] cr: restore file->f_cred Serge E. Hallyn
  2009-05-19  1:45   ` [PATCH 6/6] user namespaces: debug refcounts Serge E. Hallyn
  5 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

This patch adds the checkpointing and restart of credentials
(uids, gids, and capabilities) to Oren's c/r patchset (on top
of v14).  It goes to great pains to re-use (and define when
needed) common helpers, in order to make sure that as security
code is modified, the cr code will be updated.  Some of the
helpers should still be moved (i.e. _creds() functions should
be in kernel/cred.c).

When building the credentials for the restarted process, I
1. create a new struct cred as a copy of the running task's
cred (using prepare_cred())
2. always authorize any changes to the new struct cred
based on the permissions of current_cred() (not the current
transient state of the new cred).

While this may mean that certain transient_cred1->transient_cred2
states are allowed which otherwise wouldn't be allowed, the
fact remains that current_cred() is allowed to transition to
transient_cred2.

The reconstructed creds are applied to the task at the very
end of the sys_restart call.  This ensures that any objects which
need to be re-created (file, socket, etc) are re-created using
the creds of the task calling sys_restart - preventing an unpriv
user from creating a privileged object, and ensuring that a
root task can restart a process which had started out privileged,
created some privileged objects, then dropped its privilege.

With these patches, the root user can restart checkpoint images
(created by either hallyn or root) of user hallyn's tasks,
resulting in a program owned by hallyn.

Plenty of bugs to be found, no doubt.

Changelog:
	May 18: fix more refcounting: if (userns 5, uid 0) had
		no active tasks or child user_namespaces, then
		it shouldn't exist at restart or it, its namespace,
		and its whole chain of creators will be leaked.
	May 14: fix some refcounting:
		1. a new user_ns needs a ref to remain pinned
		   by its root user
		2. current_user_ns needs an extra ref bc objhash
		   drops two on restart
		3. cred needs a ref for the real credentials bc
		   commit_creds eats one ref.
	May 13: folded in fix to userns refcounting.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/objhash.c             |  119 ++++++++++-
 checkpoint/process.c             |  457 +++++++++++++++++++++++++++++++++++++-
 include/linux/checkpoint.h       |   11 +
 include/linux/checkpoint_hdr.h   |   57 +++++
 include/linux/checkpoint_types.h |    1 +
 5 files changed, 642 insertions(+), 3 deletions(-)

diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 5618fff..d2268c2 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/sched.h>
 #include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -155,6 +156,71 @@ static int obj_ipc_ns_users(void *ptr)
 	return atomic_read(&((struct ipc_namespace *) ptr)->count);
 }
 
+static int obj_cred_grab(void *ptr)
+{
+	get_cred((struct cred *) ptr);
+	return 0;
+}
+
+static void obj_cred_drop(void *ptr)
+{
+	put_cred((struct cred *) ptr);
+}
+
+static int obj_cred_users(void *ptr)
+{
+	return atomic_read(&((struct cred *) ptr)->usage);
+}
+
+static int obj_user_grab(void *ptr)
+{
+	struct user_struct *u = ptr;
+	(void) get_uid(u);
+	return 0;
+}
+
+static void obj_user_drop(void *ptr)
+{
+	free_uid((struct user_struct *) ptr);
+}
+
+static int obj_user_users(void *ptr)
+{
+	return atomic_read(&((struct user_struct *) ptr)->__count);
+}
+
+static int obj_userns_grab(void *ptr)
+{
+	get_user_ns((struct user_namespace *) ptr);
+	return 0;
+}
+
+static void obj_userns_drop(void *ptr)
+{
+	put_user_ns((struct user_namespace *) ptr);
+}
+
+static int obj_user_ns_users(void *ptr)
+{
+	return atomic_read(&((struct user_namespace *) ptr)->kref.refcount);
+}
+
+static int obj_groupinfo_grab(void *ptr)
+{
+	get_group_info((struct group_info *) ptr);
+	return 0;
+}
+
+static void obj_groupinfo_drop(void *ptr)
+{
+	put_group_info((struct group_info *) ptr);
+}
+
+static int obj_groupinfo_users(void *ptr)
+{
+	return atomic_read(&((struct group_info *) ptr)->usage);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -221,6 +287,46 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_bad,
 		.restore = restore_bad,
 	},
+	/* user_ns object */
+	{
+		.obj_name = "USER_NS",
+		.obj_type = CKPT_OBJ_USER_NS,
+		.ref_drop = obj_userns_drop,
+		.ref_grab = obj_userns_grab,
+		.ref_users = obj_user_ns_users,
+		.checkpoint = checkpoint_userns,
+		.restore = restore_userns,
+	},
+	/* struct cred */
+	{
+		.obj_name = "CRED",
+		.obj_type = CKPT_OBJ_CRED,
+		.ref_drop = obj_cred_drop,
+		.ref_grab = obj_cred_grab,
+		.ref_users = obj_cred_users,
+		.checkpoint = checkpoint_cred,
+		.restore = restore_cred,
+	},
+	/* user object */
+	{
+		.obj_name = "USER",
+		.obj_type = CKPT_OBJ_USER,
+		.ref_drop = obj_user_drop,
+		.ref_grab = obj_user_grab,
+		.ref_users = obj_user_users,
+		.checkpoint = checkpoint_user,
+		.restore = restore_user,
+	},
+	/* struct groupinfo */
+	{
+		.obj_name = "GROUPINFO",
+		.obj_type = CKPT_OBJ_GROUPINFO,
+		.ref_drop = obj_groupinfo_drop,
+		.ref_grab = obj_groupinfo_grab,
+		.ref_users = obj_groupinfo_users,
+		.checkpoint = checkpoint_groupinfo,
+		.restore = restore_groupinfo,
+	},
 };
 
 
@@ -290,6 +396,18 @@ static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
 	return NULL;
 }
 
+/*
+ * look up an obj and return objref if in hash, else
+ * return 0.  Used during checkpoint.
+ */
+int obj_lookup_dontadd(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_obj *obj = obj_find_by_ptr(ctx, ptr);
+	if (obj)
+		return obj->objref;
+	return 0;
+}
+
 static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
 {
 	struct hlist_head *h;
@@ -389,7 +507,6 @@ int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 		*first = 0;
 	}
 
-	ckpt_debug("%s objref %d first %d\n", ops->obj_name, objref, *first);
 	return objref;
 }
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index d8d346b..74b411a 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -17,6 +17,7 @@
 #include <linux/poll.h>
 #include <linux/nsproxy.h>
 #include <linux/utsname.h>
+#include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 #include <linux/syscalls.h>
@@ -27,16 +28,210 @@
  * Checkpoint
  */
 
+#define CKPT_MAXGROUPS 15
+#define MAX_GROUPINFO_SIZE (sizeof(*h)+CKPT_MAXGROUPS*sizeof(gid_t))
+/* move this fn into kernel/sys.c next to group functions? */
+static int checkpoint_write_groupinfo(struct ckpt_ctx *ctx,
+					struct group_info *g)
+{
+	int ret, i, size;
+	struct ckpt_hdr_groupinfo *h;
+
+	if (g->ngroups > CKPT_MAXGROUPS) {
+		ckpt_debug("Too many groups: %d  (max is %d)\n",
+			g->ngroups, CKPT_MAXGROUPS);
+		return -E2BIG;
+	}
+	size = sizeof(*h) + g->ngroups * sizeof(__u32);
+	h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_GROUPINFO);
+	if (!h)
+		return -ENOMEM;
+
+	h->ngroups = g->ngroups;
+	for (i = 0; i < g->ngroups; i++)
+		h->groups[i] = GROUP_AT(g, i);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+int checkpoint_groupinfo(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_groupinfo(ctx, (struct group_info *)ptr);
+}
+
+static int checkpoint_write_userns(struct ckpt_ctx *ctx,
+				   struct user_namespace *ns)
+{
+	struct ckpt_hdr_user_ns *h;
+	int creator_ref = 0;
+	unsigned int flags = 0;
+	struct user_namespace *root_ns;
+	int ret;
+
+	root_ns = task_cred_xxx(ctx->root_task, user)->user_ns;
+	if (ns == root_ns)
+		flags = CKPT_USERNS_INIT;
+	else
+		creator_ref = obj_lookup_dontadd(ctx, ns->creator);
+	if (!flags && !creator_ref)
+		return -EINVAL;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_USER_NS);
+	if (!h)
+		return -ENOMEM;
+	h->creator_ref = creator_ref;
+	h->flags = flags;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_userns(ctx, (struct user_namespace *) ptr);
+}
+
+/*
+ * write the user struct
+ * TODO keyring will need to be dumped
+ */
+#define UNSAVED_NS_MAX 5
+static int checkpoint_write_user(struct ckpt_ctx *ctx, struct user_struct *u)
+{
+	struct user_namespace *ns, *root_ns;
+	struct ckpt_hdr_user_struct *h;
+	int ns_objref;
+	int ret, i, unsaved_ns_nr = 0;
+	struct user_struct *save_u;
+	struct user_struct *unsaved_creators[UNSAVED_NS_MAX+1];
+
+	/* if we've already saved the userns, then life is good */
+	ns_objref = obj_lookup_dontadd(ctx, u->user_ns);
+	if (ns_objref)
+		goto write_user;
+
+	root_ns = task_cred_xxx(ctx->root_task, user)->user_ns;
+
+	if (u->user_ns == root_ns)
+		goto save_last_ns;
+
+	save_u = u;
+	do {
+		ns = save_u->user_ns;
+		save_u = ns->creator;
+		if (obj_lookup_dontadd(ctx, save_u))
+			goto found;
+		unsaved_creators[unsaved_ns_nr++] = save_u;
+	} while (ns != root_ns && unsaved_ns_nr < UNSAVED_NS_MAX);
+
+	if (unsaved_ns_nr == UNSAVED_NS_MAX)
+		return -E2BIG;
+found:
+	for (i = unsaved_ns_nr-1; i >= 0; i--) {
+		ret = checkpoint_obj(ctx, unsaved_creators[i], CKPT_OBJ_USER);
+		if (ret < 0)
+			return ret;
+	}
+
+save_last_ns:
+	ns_objref = checkpoint_obj(ctx, u->user_ns, CKPT_OBJ_USER_NS);
+	if (ns_objref < 0)
+		return ns_objref;
+
+write_user:
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_USER);
+	if (!h)
+		return -ENOMEM;
+
+	h->uid = u->uid;
+	h->userns_ref = ns_objref;
+
+	/* write out the user_struct */
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+int checkpoint_user(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_user(ctx, (struct user_struct *)ptr);
+}
+
+/* This probably should go into kernel/cred.c */
+static int checkpoint_write_cred(struct ckpt_ctx *ctx, const struct cred *cred)
+{
+	int ret;
+	int groupinfo_ref, user_ref;
+	struct ckpt_hdr_cred *h;
+
+	groupinfo_ref = checkpoint_obj(ctx, cred->group_info,
+					CKPT_OBJ_GROUPINFO);
+	if (groupinfo_ref < 0)
+		return groupinfo_ref;
+	user_ref = checkpoint_obj(ctx, cred->user, CKPT_OBJ_USER);
+	if (user_ref < 0)
+		return user_ref;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CRED);
+	if (!h)
+		return -ENOMEM;
+
+	h->version = 1;
+	h->uid = cred->uid;
+	h->suid = cred->suid;
+	h->euid = cred->euid;
+	h->fsuid = cred->fsuid;
+
+	h->gid = cred->gid;
+	h->sgid = cred->sgid;
+	h->egid = cred->egid;
+	h->fsgid = cred->fsgid;
+
+	checkpoint_save_cap(&h->cap_i, cred->cap_inheritable);
+	checkpoint_save_cap(&h->cap_p, cred->cap_permitted);
+	checkpoint_save_cap(&h->cap_e, cred->cap_effective);
+	checkpoint_save_cap(&h->cap_x, cred->cap_bset);
+
+	h->user_ref = user_ref;
+	h->groupinfo_ref = groupinfo_ref;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+int checkpoint_cred(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_cred(ctx, (struct cred *) ptr);
+}
+
 /* dump the task_struct of a given task */
 static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 {
 	struct ckpt_hdr_task *h;
 	int ret;
+	int realcred_ref, ecred_ref;
+
+	realcred_ref = checkpoint_obj(ctx, t->real_cred, CKPT_OBJ_CRED);
+	if (realcred_ref < 0)
+		return realcred_ref;
+
+	ecred_ref = checkpoint_obj(ctx, t->cred, CKPT_OBJ_CRED);
+	if (ecred_ref < 0)
+		return ecred_ref;
 
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
 	if (!h)
 		return -ENOMEM;
 
+	h->cred_ref = realcred_ref;
+	h->ecred_ref = ecred_ref;
 	h->state = t->state;
 	h->exit_state = t->exit_state;
 	h->exit_code = t->exit_code;
@@ -348,8 +543,226 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
  * Restart
  */
 
+static struct group_info *restore_read_groupinfo(struct ckpt_ctx *ctx)
+{
+	struct group_info *g;
+	struct ckpt_hdr_groupinfo *h;
+	int i;
+
+	h = ckpt_read_buf_type(ctx, MAX_GROUPINFO_SIZE, CKPT_HDR_GROUPINFO);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+	if (h->ngroups > CKPT_MAXGROUPS) {
+		g = ERR_PTR(-EINVAL);
+		goto out;
+	}
+	g = groups_alloc(h->ngroups);
+	if (!g) {
+		g = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	for (i = 0; i < h->ngroups; i++)
+		GROUP_AT(g, i) = h->groups[i];
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return g;
+}
+
+void *restore_groupinfo(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_groupinfo(ctx);
+}
+
+static struct user_namespace *restore_read_userns(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_user_ns *h;
+	struct user_namespace *ns;
+	struct user_struct *new_root, *creator;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_USER_NS);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+	if (h->flags & CKPT_USERNS_INIT) {
+		ckpt_hdr_put(ctx, h);
+		/* grab an extra ref bc objhash will drop an extra */
+		return get_user_ns(current_user_ns());
+	}
+	creator = ckpt_obj_fetch(ctx, h->creator_ref, CKPT_OBJ_USER);
+	ckpt_hdr_put(ctx, h);
+
+	if (IS_ERR(creator))
+		return ERR_PTR(-EINVAL);
+	ns = new_user_ns(creator, &new_root);
+
+	if (IS_ERR(ns))
+		return ns;
+
+	/* new_user_ns() doesn't bump creator's refcount */
+	get_uid(creator);
+
+	/* objhash will drop new_ns refcount, but new_root
+	 * should hold a ref */
+	get_user_ns(ns);
+
+	/*
+	 * Free the new root user.  If we actually needed it,
+	 * then it will show up later in the checkpoint image
+	 * The objhash will keep the userns pinned until then.
+	 */
+	free_uid(new_root);
+
+	return ns;
+}
+
+void *restore_userns(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_userns(ctx);
+}
+
+static int may_setuid(struct user_namespace *ns, uid_t uid)
+{
+	/*
+	 * this next check will one day become
+	 * if capable(CAP_SETUID, ns) return 1;
+	 * followed by uid_equiv(current_userns, current_uid, ns, uid)
+	 * instead of just uids.
+	 */
+	if (capable(CAP_SETUID))
+		return 1;
+
+	/*
+	 * this may be overly strict, but since we might end up
+	 * restarting a privileged program here, we do not want
+	 * someone with only CAP_SYS_ADMIN but no CAP_SETUID to
+	 * be able to create random userids even in a userns he
+	 * created.
+	 */
+	if (current_user()->user_ns != ns)
+		return 0;
+	if (current_uid() == uid ||
+		current_euid() == uid ||
+		current_suid() == uid)
+		return 1;
+	return 0;
+}
+
+static struct user_struct *restore_read_user(struct ckpt_ctx *ctx)
+{
+	struct user_struct *u;
+	struct user_namespace *ns;
+	struct ckpt_hdr_user_struct *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_USER);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	ns = ckpt_obj_fetch(ctx, h->userns_ref, CKPT_OBJ_USER_NS);
+	if (IS_ERR(ns)) {
+		u = ERR_PTR(PTR_ERR(ns));
+		goto out;
+	}
+
+	if (!may_setuid(ns, h->uid)) {
+		u = ERR_PTR(-EPERM);
+		goto out;
+	}
+	u = alloc_uid(ns, h->uid);
+	if (!u)
+		u = ERR_PTR(-EINVAL);
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return u;
+}
+
+void *restore_user(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_user(ctx);
+}
+
+/* move this code into kernel/cred.c and do proper perms checking of course */
+struct cred *restore_read_cred(struct ckpt_ctx *ctx)
+{
+	struct cred *cred;
+	struct ckpt_hdr_cred *h;
+	struct user_struct *user;
+	struct group_info *groupinfo;
+	int ret = -EINVAL;
+	uid_t olduid;
+	gid_t oldgid;
+	int i;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CRED);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+	if (h->version != 1)
+		goto error;
+
+	cred = prepare_creds();
+	if (!cred)
+		goto error;
+
+
+	/* Do we care if the target user and target group were compatible?
+	 * Probably.  But then, we can't do any setuid without CAP_SETUID,
+	 * so we must have been privileged to abuse it... */
+	groupinfo = ckpt_obj_fetch(ctx, h->groupinfo_ref, CKPT_OBJ_GROUPINFO);
+	if (IS_ERR(groupinfo))
+		goto err_putcred;
+	user = ckpt_obj_fetch(ctx, h->user_ref, CKPT_OBJ_USER);
+	if (IS_ERR(user))
+		goto err_putcred;
+
+	/*
+	 * TODO: this check should  go into the common helper in
+	 * kernel/sys.c, and should account for user namespaces
+	 */
+	if (!capable(CAP_SETGID))
+		for (i = 0; i < groupinfo->ngroups; i++) {
+			if (!in_egroup_p(GROUP_AT(groupinfo, i)))
+				goto err_putcred;
+		}
+	ret = set_groups(cred, groupinfo);
+	if (ret < 0)
+		goto err_putcred;
+	free_uid(cred->user);
+	cred->user = get_uid(user);
+	ret = cred_setresuid(cred, h->uid, h->euid, h->suid);
+	if (ret < 0)
+		goto err_putcred;
+	ret = cred_setfsuid(cred, h->fsuid, &olduid);
+	if (olduid != h->fsuid && ret < 0)
+		goto err_putcred;
+	ret = cred_setresgid(cred, h->gid, h->egid, h->sgid);
+	if (ret < 0)
+		goto err_putcred;
+	ret = cred_setfsgid(cred, h->fsgid, &oldgid);
+	if (oldgid != h->fsgid && ret < 0)
+		goto err_putcred;
+	ret = checkpoint_restore_cap(h->cap_e, h->cap_i, h->cap_p, h->cap_x,
+				cred);
+	if (ret)
+		goto err_putcred;
+
+	ckpt_hdr_put(ctx, h);
+	return cred;
+
+err_putcred:
+	abort_creds(cred);
+error:
+	ckpt_hdr_put(ctx, h);
+	return ERR_PTR(ret);
+}
+
+void *restore_cred(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_cred(ctx);
+}
+
 /* read the task_struct into the current task */
-static int restore_task_struct(struct ckpt_ctx *ctx)
+static int restore_task_struct(struct ckpt_ctx *ctx, struct cred **realcredp,
+				struct cred **ecredp)
 {
 	struct ckpt_hdr_task *h;
 	struct task_struct *t = current;
@@ -365,8 +778,21 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 
 	memset(t->comm, 0, TASK_COMM_LEN);
 	ret = _ckpt_read_string(ctx, t->comm, h->task_comm_len);
+	if (ret < 0)
+		goto out;
 
 	/* FIXME: restore remaining relevant task_struct fields */
+
+	ret = 0;
+	*realcredp = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(*realcredp)) {
+		ret = PTR_ERR(*realcredp);
+		goto out;
+	}
+	*ecredp = ckpt_obj_fetch(ctx, h->ecred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(*ecredp))
+		ret = PTR_ERR(*ecredp);
+
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
@@ -648,12 +1074,35 @@ static int restore_task_shared(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int restore_creds(struct ckpt_ctx *ctx, struct cred *rcred,
+			struct cred *ecred)
+{
+	int ret;
+	const struct cred *old;
+
+	/* commit_creds will take one ref for the eff creds, but
+	 * expects us to hold a ref for the obj creds, so take a
+	 * ref here */
+	get_cred(rcred);
+	ret = commit_creds(rcred);
+	if (ret)
+		return ret;
+
+	if (ecred == rcred)
+		return 0;
+
+	old =  override_creds(ecred); /* override_creds otoh takes new ref */
+	put_cred(old);
+	return 0;
+}
+
 /* read the entire state of the current task */
 int restore_task(struct ckpt_ctx *ctx)
 {
 	int ret;
+	struct cred *realcred, *ecred;
 
-	ret = restore_task_struct(ctx);
+	ret = restore_task_struct(ctx, &realcred, &ecred);
 	ckpt_debug("ret %d\n", ret);
 	if (ret < 0)
 		goto out;
@@ -671,6 +1120,10 @@ int restore_task(struct ckpt_ctx *ctx)
 		goto out;
 	ret = restore_cpu(ctx);
 	ckpt_debug("cpu: ret %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_creds(ctx, realcred, ecred);
+	ckpt_debug("creds: ret %d\n", ret);
  out:
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d966f19..1b1326d 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -53,6 +53,7 @@ extern void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref,
 			    enum obj_type type);
 extern int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 			       enum obj_type type, int *first);
+extern int obj_lookup_dontadd(struct ckpt_ctx *ctx, void *ptr);
 extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
 			   enum obj_type type);
 
@@ -90,6 +91,16 @@ static inline int restore_ipc_ns(struct ckpt_ctx *ctx)
 extern int checkpoint_ipcns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns);
 extern int restore_ipcns(struct ckpt_ctx *ctx);
 
+/* credentials */
+int checkpoint_groupinfo(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_user(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_cred(struct ckpt_ctx *ctx, void *ptr);
+void *restore_groupinfo(struct ckpt_ctx *ctx);
+void *restore_userns(struct ckpt_ctx *ctx);
+void *restore_user(struct ckpt_ctx *ctx);
+void *restore_cred(struct ckpt_ctx *ctx);
+
 /* memory */
 extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 90e036d..98fc0f7 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -57,6 +57,10 @@ enum {
 	CKPT_HDR_NS,
 	CKPT_HDR_UTS_NS,
 	CKPT_HDR_IPC_NS,
+	CKPT_HDR_USER_NS,
+	CKPT_HDR_CRED,
+	CKPT_HDR_USER,
+	CKPT_HDR_GROUPINFO,
 
 	CKPT_HDR_MM = 201,
 	CKPT_HDR_VMA,
@@ -101,6 +105,10 @@ enum obj_type {
 	CKPT_OBJ_NS,
 	CKPT_OBJ_UTS_NS,
 	CKPT_OBJ_IPC_NS,
+	CKPT_OBJ_USER_NS,
+	CKPT_OBJ_CRED,
+	CKPT_OBJ_USER,
+	CKPT_OBJ_GROUPINFO,
 	CKPT_OBJ_MAX
 };
 
@@ -158,10 +166,59 @@ struct ckpt_hdr_task {
 	__u32 exit_state;
 	__u32 exit_code;
 	__u32 exit_signal;
+	__s32 cred_ref;
+	__s32 ecred_ref;
+
+#ifdef CONFIG_AUDITSYSCALL
+	/* would audit want to track the checkpointed ids,
+	   or (more likely) who actually restarted? */
+#endif
 
 	__u32 task_comm_len;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_cred {
+	struct ckpt_hdr h;
+	__u32 version; /* especially since capability sets might grow */
+	__u32 uid, suid, euid, fsuid;
+	__u32 gid, sgid, egid, fsgid;
+	__u64 cap_i, cap_p, cap_e;
+	__u64 cap_x;  /* bounding set ('X') */
+	__s32 user_ref;
+	__s32 groupinfo_ref;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_groupinfo {
+	struct ckpt_hdr h;
+	__u32 ngroups;
+	/*
+	 * This is followed by ngroups __u32s
+	 */
+	__u32 groups[0];
+} __attribute__((aligned(8)));
+
+/*
+ * todo - keyrings and LSM
+ * These may be better done with userspace help though
+ */
+struct ckpt_hdr_user_struct {
+	struct ckpt_hdr h;
+	__u32 uid;
+	__s32 userns_ref;
+} __attribute__((aligned(8)));
+
+/*
+ * The user-struct mostly tracks system resource usage.
+ * Most of it's contents therefore will simply be set
+ * correctly as restart opens resources
+ */
+#define CKPT_USERNS_INIT 1
+struct ckpt_hdr_user_ns {
+	struct ckpt_hdr h;
+	__u32 flags;
+	__s32 creator_ref;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_task_ns {
 	struct ckpt_hdr h;
 	__s32 ns_objref;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 85ee304..2c4a9f0 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -13,6 +13,7 @@
 #define CKPT_VERSION  1
 
 #define CHECKPOINT_SUBTREE	0x4
+#define RESTORE_CREATE_USERNS	0x8
 
 
 #ifdef __KERNEL__
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 5/6] cr: restore file->f_cred
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-05-19  1:45   ` [PATCH 4/6] cr: checkpoint and restore task credentials Serge E. Hallyn
@ 2009-05-19  1:45   ` Serge E. Hallyn
       [not found]     ` <20090519014546.GE28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-19  1:45   ` [PATCH 6/6] user namespaces: debug refcounts Serge E. Hallyn
  5 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Ony seems useful if you're using coda or hppfs, but go ahead and
restore a file's f_cred.  This is set to the cred of the task doing
the open, so often it will be the same as that of the restarted task.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c             |   16 ++++++++++++++--
 include/linux/checkpoint_hdr.h |    1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/checkpoint/files.c b/checkpoint/files.c
index 6107361..edf81ce 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -151,7 +151,11 @@ int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
 	h->f_pos = file->f_pos;
 	h->f_version = file->f_version;
 
-	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
+	h->f_credref = checkpoint_obj(ctx, file->f_cred, CKPT_OBJ_CRED);
+	if (h->f_credref < 0)
+		return h->f_credref;
+
+	/* FIX: need also file->f_owner, etc */
 
 	return 0;
 }
@@ -359,8 +363,16 @@ int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
 			struct ckpt_hdr_file *h)
 {
 	int ret;
+	struct cred *cred;
+
+	/* FIX: need to restore owner etc */
 
-	/* FIX: need to restore uid, gid, owner etc */
+	/* restore the cred */
+	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+	put_cred(file->f_cred);
+	file->f_cred = get_cred(cred);
 
 	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
 	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 98fc0f7..2693a72 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -341,6 +341,7 @@ struct ckpt_hdr_file {
 	__u32 _padding;
 	__u64 f_pos;
 	__u64 f_version;
+	__s32 f_credref;
 } __attribute__((aligned(8)));
 
 struct ckpt_hdr_file_generic {
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 6/6] user namespaces: debug refcounts
       [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-05-19  1:45   ` [PATCH 5/6] cr: restore file->f_cred Serge E. Hallyn
@ 2009-05-19  1:45   ` Serge E. Hallyn
  5 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19  1:45 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Create /proc/userns, which prints out all user namespaces.  It
prints the address of the user_ns itself, the uid and userns address
of the user who created it, and the reference count.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/user_namespace.h |    2 +
 kernel/user.c                  |    1 +
 kernel/user_namespace.c        |   84 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index a2b82d5..e64d602 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -14,8 +14,10 @@ struct user_namespace {
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*creator;
 	struct work_struct	destroyer;
+	struct list_head	list;
 };
 
+extern spinlock_t usernslist_lock;
 extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba..6474422 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -23,6 +23,7 @@ struct user_namespace init_user_ns = {
 		.refcount	= ATOMIC_INIT(2),
 	},
 	.creator = &root_user,
+	.list = LIST_HEAD_INIT(init_user_ns.list),
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e624b0f..8d7c725 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -10,6 +10,11 @@
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/cred.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+DEFINE_SPINLOCK(usernslist_lock);
 
 static struct user_namespace *_new_user_ns(struct user_struct *creator,
 				   struct user_struct **newroot)
@@ -40,6 +45,9 @@ static struct user_namespace *_new_user_ns(struct user_struct *creator,
 	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
 	kref_set(&ns->kref, 1);
 
+	spin_lock(&usernslist_lock);
+	list_add_tail(&ns->list, &init_user_ns.list);
+	spin_unlock(&usernslist_lock);
 	*newroot = root_user;
 	return ns;
 }
@@ -90,6 +98,9 @@ static void free_user_ns_work(struct work_struct *work)
 {
 	struct user_namespace *ns =
 		container_of(work, struct user_namespace, destroyer);
+	spin_lock(&usernslist_lock);
+	list_del(&ns->list);
+	spin_unlock(&usernslist_lock);
 	free_uid(ns->creator);
 	kfree(ns);
 }
@@ -103,3 +114,76 @@ void free_user_ns(struct kref *kref)
 	schedule_work(&ns->destroyer);
 }
 EXPORT_SYMBOL(free_user_ns);
+
+#ifdef CONFIG_PROC_FS
+static int proc_userns_show(struct seq_file *m, void *v)
+{
+	struct user_namespace *ns = v;
+	seq_printf(m, "userns %p creator (uid %d ns %p) count %d\n",
+		(void *)ns, ns->creator->uid, (void *) ns->creator->user_ns,
+		atomic_read(&ns->kref.refcount));
+	return 0;
+}
+
+static void *proc_userns_start(struct seq_file *p, loff_t *_pos)
+{
+	loff_t pos = *_pos;
+	struct user_namespace *ns = &init_user_ns;
+	spin_lock(&usernslist_lock);
+	while (pos) {
+		pos--;
+		ns = list_entry(ns->list.next, struct user_namespace, list);
+		if (ns  == &init_user_ns)
+			return NULL;
+	}
+	return ns;
+}
+
+static void *proc_userns_next(struct seq_file *p, void *v, loff_t *_pos)
+{
+	struct user_namespace *ns = v;
+	(*_pos)++;
+	ns = list_entry(ns->list.next, struct user_namespace, list);
+	if (ns == &init_user_ns)
+		return NULL;
+	return ns;
+}
+
+static void proc_userns_stop(struct seq_file *p, void *v)
+{
+	spin_unlock(&usernslist_lock);
+}
+
+static const struct seq_operations proc_userns_ops;
+
+static int proc_userns_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &proc_userns_ops);
+}
+
+static const struct seq_operations proc_userns_ops = {
+	.start	= proc_userns_start,
+	.next	= proc_userns_next,
+	.stop	= proc_userns_stop,
+	.show	= proc_userns_show,
+};
+
+const struct file_operations proc_userns_fops = {
+	.open		= proc_userns_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static __init int user_ns_debug(void)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create("userns", 0, NULL, &proc_userns_fops);
+	if (!p)
+		panic("cannot create /proc/userns\n");
+	return 0;
+}
+
+__initcall(user_ns_debug);
+#endif
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-19  8:26   ` David Howells
       [not found]     ` <16258.1242721606-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
       [not found]     ` <20090519133526.GB32685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-20 15:35   ` Oren Laadan
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 27+ messages in thread
From: David Howells @ 2009-05-19  8:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: dhowells-H+wXaHxf7aLQT0dZR+AlfA, Linux Containers,
	alexey-r/Jw6+rmf7HQT0dZR+AlfA

Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:

> +/* move this code into kernel/cred.c and do proper perms checking of course */
> +struct cred *restore_read_cred(struct ckpt_ctx *ctx)
> +{

This function needs to fix up cred->security.

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <16258.1242721606-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2009-05-19 13:35       ` Serge E. Hallyn
  0 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19 13:35 UTC (permalink / raw)
  To: David Howells; +Cc: Linux Containers, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Quoting David Howells (dhowells-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org):
> Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:
> 
> > +/* move this code into kernel/cred.c and do proper perms checking of course */
> > +struct cred *restore_read_cred(struct ckpt_ctx *ctx)
> > +{
> 
> This function needs to fix up cred->security.

Yup -it's not at all clear to me yet how to go about that, so I'll
need to have a discussion on the LSM list about whether a pair
of new security_ops hook is called for.  One to authorize restart,
based on the current domain and the type of the mm->exe_file being
executed (and maybe the type of the checkpoint image file), and
one to calculate the new domain to enter at the end of restart.

Or did you mean something else by 'fix up' cred->security?

thanks,
-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <20090519133526.GB32685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-19 14:26       ` David Howells
       [not found]         ` <19394.1242743199-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: David Howells @ 2009-05-19 14:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: dhowells-H+wXaHxf7aLQT0dZR+AlfA, Linux Containers,
	alexey-r/Jw6+rmf7HQT0dZR+AlfA

Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:

> Or did you mean something else by 'fix up' cred->security?

cred->security is inherited from the current process by virtue of calling
prepare_creds() - as such, it is almost certainly going to be wrong.  Can you
just ask the LSM for a set of textual security labels when saving, and then
set those back when restoring?

David

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]         ` <19394.1242743199-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2009-05-19 14:46           ` Serge E. Hallyn
  0 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-19 14:46 UTC (permalink / raw)
  To: David Howells; +Cc: Linux Containers, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Quoting David Howells (dhowells-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org):
> Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:
> 
> > Or did you mean something else by 'fix up' cred->security?
> 
> cred->security is inherited from the current process by virtue of calling
> prepare_creds() - as such, it is almost certainly going to be wrong.  Can you
> just ask the LSM for a set of textual security labels when saving, and then
> set those back when restoring?

That would be too easy a way for users (even privileged root users but
constrained by selinux) to bypass selinux restrictions.  All they'd have
to do is checkpoint their shell, and rewrite the ->security field in the
checkpoint image with 'shadow_t', to get a shell that can write to the
shadow file, for instance.

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] cr: restore file->f_cred
       [not found]     ` <20090519014546.GE28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-20 15:08       ` Oren Laadan
       [not found]         ` <4A141CEE.2080100-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 15:08 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA



Serge E. Hallyn wrote:
> Ony seems useful if you're using coda or hppfs, but go ahead and

  ^^^

(It is also used in some place under net/*...)

> restore a file's f_cred.  This is set to the cred of the task doing
> the open, so often it will be the same as that of the restarted task.
> 
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/files.c             |   16 ++++++++++++++--
>  include/linux/checkpoint_hdr.h |    1 +
>  2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 6107361..edf81ce 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -151,7 +151,11 @@ int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
>  	h->f_pos = file->f_pos;
>  	h->f_version = file->f_version;
>  
> -	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
> +	h->f_credref = checkpoint_obj(ctx, file->f_cred, CKPT_OBJ_CRED);
> +	if (h->f_credref < 0)
> +		return h->f_credref;
> +
> +	/* FIX: need also file->f_owner, etc */
>  
>  	return 0;
>  }
> @@ -359,8 +363,16 @@ int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
>  			struct ckpt_hdr_file *h)
>  {
>  	int ret;
> +	struct cred *cred;
> +
> +	/* FIX: need to restore owner etc */
>  
> -	/* FIX: need to restore uid, gid, owner etc */
> +	/* restore the cred */
> +	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
> +	if (IS_ERR(cred))
> +		return PTR_ERR(cred);
> +	put_cred(file->f_cred);
> +	file->f_cred = get_cred(cred);
>  
>  	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
>  	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 98fc0f7..2693a72 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -341,6 +341,7 @@ struct ckpt_hdr_file {
>  	__u32 _padding;
>  	__u64 f_pos;
>  	__u64 f_version;
> +	__s32 f_credref;

This can replace the unused _padding field above ?

>  } __attribute__((aligned(8)));
>  
>  struct ckpt_hdr_file_generic {

Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] cr: restore file->f_cred
       [not found]         ` <4A141CEE.2080100-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-20 15:25           ` Serge E. Hallyn
       [not found]             ` <20090520152527.GA28585-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 15:25 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Ony seems useful if you're using coda or hppfs, but go ahead and
> 
>   ^^^
> 
> (It is also used in some place under net/*...)

ok

> > diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> > index 98fc0f7..2693a72 100644
> > --- a/include/linux/checkpoint_hdr.h
> > +++ b/include/linux/checkpoint_hdr.h
> > @@ -341,6 +341,7 @@ struct ckpt_hdr_file {
> >  	__u32 _padding;
> >  	__u64 f_pos;
> >  	__u64 f_version;
> > +	__s32 f_credref;
> 
> This can replace the unused _padding field above ?

Oh, of course.

> >  } __attribute__((aligned(8)));
> >  
> >  struct ckpt_hdr_file_generic {
> 
> Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Thanks - did you want me to send a new version, or will you apply the
updated patch?

thanks,
-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] cr: restore file->f_cred
       [not found]             ` <20090520152527.GA28585-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-20 15:26               ` Oren Laadan
  0 siblings, 0 replies; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 15:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>
>> Serge E. Hallyn wrote:
>>> Ony seems useful if you're using coda or hppfs, but go ahead and
>>   ^^^
>>
>> (It is also used in some place under net/*...)
> 
> ok
> 
>>> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
>>> index 98fc0f7..2693a72 100644
>>> --- a/include/linux/checkpoint_hdr.h
>>> +++ b/include/linux/checkpoint_hdr.h
>>> @@ -341,6 +341,7 @@ struct ckpt_hdr_file {
>>>  	__u32 _padding;
>>>  	__u64 f_pos;
>>>  	__u64 f_version;
>>> +	__s32 f_credref;
>> This can replace the unused _padding field above ?
> 
> Oh, of course.
> 
>>>  } __attribute__((aligned(8)));
>>>  
>>>  struct ckpt_hdr_file_generic {
>> Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> 
> Thanks - did you want me to send a new version, or will you apply the
> updated patch?

I just assumed there will be a next version of the series :p

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-19  8:26   ` [PATCH 4/6] cr: checkpoint and restore task credentials David Howells
@ 2009-05-20 15:35   ` Oren Laadan
       [not found]     ` <4A142350.1060308-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-20 16:54   ` Oren Laadan
  2009-05-20 16:56   ` Oren Laadan
  3 siblings, 1 reply; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 15:35 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, David Howells


Serge -

The 'creator' makes the 'struct user' recursive because to save
an object you need to first save its creator etc. However the
implementation may not call checkpoint_obj() recursively, if
the depth isn't bound a-priory. You probably need to convert
checkpoint_write_user() to do an iterative (loop) implementation
of the recursion...

Oren.


Serge E. Hallyn wrote:
> This patch adds the checkpointing and restart of credentials
> (uids, gids, and capabilities) to Oren's c/r patchset (on top
> of v14).  It goes to great pains to re-use (and define when
> needed) common helpers, in order to make sure that as security
> code is modified, the cr code will be updated.  Some of the
> helpers should still be moved (i.e. _creds() functions should
> be in kernel/cred.c).
> 
> When building the credentials for the restarted process, I
> 1. create a new struct cred as a copy of the running task's
> cred (using prepare_cred())
> 2. always authorize any changes to the new struct cred
> based on the permissions of current_cred() (not the current
> transient state of the new cred).
> 
> While this may mean that certain transient_cred1->transient_cred2
> states are allowed which otherwise wouldn't be allowed, the
> fact remains that current_cred() is allowed to transition to
> transient_cred2.
> 
> The reconstructed creds are applied to the task at the very
> end of the sys_restart call.  This ensures that any objects which
> need to be re-created (file, socket, etc) are re-created using
> the creds of the task calling sys_restart - preventing an unpriv
> user from creating a privileged object, and ensuring that a
> root task can restart a process which had started out privileged,
> created some privileged objects, then dropped its privilege.
> 
> With these patches, the root user can restart checkpoint images
> (created by either hallyn or root) of user hallyn's tasks,
> resulting in a program owned by hallyn.
> 
> Plenty of bugs to be found, no doubt.
> 
> Changelog:
> 	May 18: fix more refcounting: if (userns 5, uid 0) had
> 		no active tasks or child user_namespaces, then
> 		it shouldn't exist at restart or it, its namespace,
> 		and its whole chain of creators will be leaked.
> 	May 14: fix some refcounting:
> 		1. a new user_ns needs a ref to remain pinned
> 		   by its root user
> 		2. current_user_ns needs an extra ref bc objhash
> 		   drops two on restart
> 		3. cred needs a ref for the real credentials bc
> 		   commit_creds eats one ref.
> 	May 13: folded in fix to userns refcounting.
> 
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

[...]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <4A142350.1060308-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-20 15:53       ` Serge E. Hallyn
       [not found]         ` <20090520155332.GA28999-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 15:53 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, David Howells

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> Serge -
> 
> The 'creator' makes the 'struct user' recursive because to save
> an object you need to first save its creator etc. However the
> implementation may not call checkpoint_obj() recursively, if
> the depth isn't bound a-priory. You probably need to convert
> checkpoint_write_user() to do an iterative (loop) implementation
> of the recursion...

It's not done recursively, though.  checkpoint_write_userns() will
only be called by checkpoint_write_user().   checkpoint_write_user()
will make sure there are only 5 levels deep of unwritten creator
user namespace (else bail), then start at the oldest creator, and
write it.  That checkpoint_write_user will cause a write of its
user_ns, then write the user, then return to the original
checkpoint_write_user() which will now write the next-oldest
user_struct

So the max recursion depth we get is:

	checkpoint_task_struct -> checkpoint_write_cred ->
	checkpoint_write_user -> checkpoint_write_user ->
	checkpoint_write_userns (END)

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]         ` <20090520155332.GA28999-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-20 16:08           ` Oren Laadan
       [not found]             ` <4A142B05.4040907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 16:08 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, David Howells



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>> Serge -
>>
>> The 'creator' makes the 'struct user' recursive because to save
>> an object you need to first save its creator etc. However the
>> implementation may not call checkpoint_obj() recursively, if
>> the depth isn't bound a-priory. You probably need to convert
>> checkpoint_write_user() to do an iterative (loop) implementation
>> of the recursion...
> 
> It's not done recursively, though.  checkpoint_write_userns() will
> only be called by checkpoint_write_user().   checkpoint_write_user()
> will make sure there are only 5 levels deep of unwritten creator
> user namespace (else bail), then start at the oldest creator, and
> write it.  That checkpoint_write_user will cause a write of its
> user_ns, then write the user, then return to the original
> checkpoint_write_user() which will now write the next-oldest
> user_struct
> 
> So the max recursion depth we get is:
> 
> 	checkpoint_task_struct -> checkpoint_write_cred ->
> 	checkpoint_write_user -> checkpoint_write_user ->
> 	checkpoint_write_userns (END)

Ok. Let's not restrict to 5 levels :)

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]             ` <4A142B05.4040907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-20 16:13               ` Serge E. Hallyn
  0 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 16:13 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, David Howells

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >> Serge -
> >>
> >> The 'creator' makes the 'struct user' recursive because to save
> >> an object you need to first save its creator etc. However the
> >> implementation may not call checkpoint_obj() recursively, if
> >> the depth isn't bound a-priory. You probably need to convert
> >> checkpoint_write_user() to do an iterative (loop) implementation
> >> of the recursion...
> > 
> > It's not done recursively, though.  checkpoint_write_userns() will
> > only be called by checkpoint_write_user().   checkpoint_write_user()
> > will make sure there are only 5 levels deep of unwritten creator
> > user namespace (else bail), then start at the oldest creator, and
> > write it.  That checkpoint_write_user will cause a write of its
> > user_ns, then write the user, then return to the original
> > checkpoint_write_user() which will now write the next-oldest
> > user_struct
> > 
> > So the max recursion depth we get is:
> > 
> > 	checkpoint_task_struct -> checkpoint_write_cred ->
> > 	checkpoint_write_user -> checkpoint_write_user ->
> > 	checkpoint_write_userns (END)
> 
> Ok. Let's not restrict to 5 levels :)

Will change that, and resend the set hopefully tonight.

thanks,
-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-19  8:26   ` [PATCH 4/6] cr: checkpoint and restore task credentials David Howells
  2009-05-20 15:35   ` Oren Laadan
@ 2009-05-20 16:54   ` Oren Laadan
       [not found]     ` <4A1435E0.3010306-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-20 16:56   ` Oren Laadan
  3 siblings, 1 reply; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 16:54 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, David Howells



Serge E. Hallyn wrote:
> This patch adds the checkpointing and restart of credentials
> (uids, gids, and capabilities) to Oren's c/r patchset (on top
> of v14).  It goes to great pains to re-use (and define when
> needed) common helpers, in order to make sure that as security
> code is modified, the cr code will be updated.  Some of the
> helpers should still be moved (i.e. _creds() functions should
> be in kernel/cred.c).
> 
> When building the credentials for the restarted process, I
> 1. create a new struct cred as a copy of the running task's
> cred (using prepare_cred())
> 2. always authorize any changes to the new struct cred
> based on the permissions of current_cred() (not the current
> transient state of the new cred).
> 
> While this may mean that certain transient_cred1->transient_cred2
> states are allowed which otherwise wouldn't be allowed, the
> fact remains that current_cred() is allowed to transition to
> transient_cred2.
> 
> The reconstructed creds are applied to the task at the very
> end of the sys_restart call.  This ensures that any objects which
> need to be re-created (file, socket, etc) are re-created using
> the creds of the task calling sys_restart - preventing an unpriv
> user from creating a privileged object, and ensuring that a
> root task can restart a process which had started out privileged,
> created some privileged objects, then dropped its privilege.
> 
> With these patches, the root user can restart checkpoint images
> (created by either hallyn or root) of user hallyn's tasks,
> resulting in a program owned by hallyn.

In this case, it is necessary to ensure that all resources created
during the restart are properly owned. You do take care of open
files (next patch); can you also add a patch for IPC ?

> 
> Plenty of bugs to be found, no doubt.
> 
> Changelog:
> 	May 18: fix more refcounting: if (userns 5, uid 0) had
> 		no active tasks or child user_namespaces, then
> 		it shouldn't exist at restart or it, its namespace,
> 		and its whole chain of creators will be leaked.
> 	May 14: fix some refcounting:
> 		1. a new user_ns needs a ref to remain pinned
> 		   by its root user
> 		2. current_user_ns needs an extra ref bc objhash
> 		   drops two on restart
> 		3. cred needs a ref for the real credentials bc
> 		   commit_creds eats one ref.
> 	May 13: folded in fix to userns refcounting.
> 
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---

[...]

> @@ -290,6 +396,18 @@ static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
>  	return NULL;
>  }
>  
> +/*
> + * look up an obj and return objref if in hash, else
> + * return 0.  Used during checkpoint.
> + */
> +int obj_lookup_dontadd(struct ckpt_ctx *ctx, void *ptr)

Do you mind calling it simply obj_lookup() ?

> +{
> +	struct ckpt_obj *obj = obj_find_by_ptr(ctx, ptr);
> +	if (obj)
> +		return obj->objref;
> +	return 0;
> +}
> +
>  static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
>  {
>  	struct hlist_head *h;
> @@ -389,7 +507,6 @@ int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
>  		*first = 0;
>  	}
>  
> -	ckpt_debug("%s objref %d first %d\n", ops->obj_name, objref, *first);

Please leave the debug messages inside... In fact, a debug message
above in obj_lookup() may prove useful as well.

>  	return objref;
>  }
>  
> diff --git a/checkpoint/process.c b/checkpoint/process.c
> index d8d346b..74b411a 100644
> --- a/checkpoint/process.c
> +++ b/checkpoint/process.c
> @@ -17,6 +17,7 @@
>  #include <linux/poll.h>
>  #include <linux/nsproxy.h>
>  #include <linux/utsname.h>
> +#include <linux/user_namespace.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
>  #include <linux/syscalls.h>
> @@ -27,16 +28,210 @@
>   * Checkpoint
>   */
>  
> +#define CKPT_MAXGROUPS 15
> +#define MAX_GROUPINFO_SIZE (sizeof(*h)+CKPT_MAXGROUPS*sizeof(gid_t))

Is this really necessary ?  If you are worried about DoS by alloc'ing
a lot of memory on checkpoint, you can write it out in chunks (e.g.
like the pids array), or set a sysctl tunable.  Likewise for restart.

> +/* move this fn into kernel/sys.c next to group functions? */
> +static int checkpoint_write_groupinfo(struct ckpt_ctx *ctx,
> +					struct group_info *g)

[...]

> +
>  /* dump the task_struct of a given task */
>  static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
>  {
>  	struct ckpt_hdr_task *h;
>  	int ret;
> +	int realcred_ref, ecred_ref;
> +
> +	realcred_ref = checkpoint_obj(ctx, t->real_cred, CKPT_OBJ_CRED);
> +	if (realcred_ref < 0)
> +		return realcred_ref;
> +
> +	ecred_ref = checkpoint_obj(ctx, t->cred, CKPT_OBJ_CRED);
> +	if (ecred_ref < 0)
> +		return ecred_ref;

Is this safe even if the checkpointed task's state changes ?
(e.g. unfrozen - yes, I know there is a patch in the works to
prevent this; but if we ever want to checkpoint STOPPED tasks...
for instance).

Would incrementing the refcount on t->{cred,real_cred} help ?

[...]

> +}
> +
> +/* move this code into kernel/cred.c and do proper perms checking of course */
> +struct cred *restore_read_cred(struct ckpt_ctx *ctx)
> +{
> +	struct cred *cred;
> +	struct ckpt_hdr_cred *h;
> +	struct user_struct *user;
> +	struct group_info *groupinfo;
> +	int ret = -EINVAL;
> +	uid_t olduid;
> +	gid_t oldgid;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CRED);
> +	if (IS_ERR(h))
> +		return ERR_PTR(PTR_ERR(h));
> +	if (h->version != 1)
			 ^^^
This probably deserves some constant defined somewhere ... (heh,
I must have missed it in the checkpoint code above).

[...]

>  /* read the task_struct into the current task */
> -static int restore_task_struct(struct ckpt_ctx *ctx)
> +static int restore_task_struct(struct ckpt_ctx *ctx, struct cred **realcredp,
> +				struct cred **ecredp)

				^^^^^^^^^^^^^^^^^^^^^    ^^^^^^^^^^^^^^^^^^^^^^^
				
This probably belongs to restore_task_shared() ?

>  {
>  	struct ckpt_hdr_task *h;
>  	struct task_struct *t = current;
> @@ -365,8 +778,21 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
>  
>  	memset(t->comm, 0, TASK_COMM_LEN);
>  	ret = _ckpt_read_string(ctx, t->comm, h->task_comm_len);
> +	if (ret < 0)
> +		goto out;
>  
>  	/* FIXME: restore remaining relevant task_struct fields */
> +
> +	ret = 0;
> +	*realcredp = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
> +	if (IS_ERR(*realcredp)) {
> +		ret = PTR_ERR(*realcredp);
> +		goto out;
> +	}
> +	*ecredp = ckpt_obj_fetch(ctx, h->ecred_ref, CKPT_OBJ_CRED);
> +	if (IS_ERR(*ecredp))
> +		ret = PTR_ERR(*ecredp);
> +
>   out:
>  	ckpt_hdr_put(ctx, h);
>  	return ret;
> @@ -648,12 +1074,35 @@ static int restore_task_shared(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +static int restore_creds(struct ckpt_ctx *ctx, struct cred *rcred,
> +			struct cred *ecred)
> +{
> +	int ret;
> +	const struct cred *old;
> +
> +	/* commit_creds will take one ref for the eff creds, but
> +	 * expects us to hold a ref for the obj creds, so take a
> +	 * ref here */
> +	get_cred(rcred);
> +	ret = commit_creds(rcred);
> +	if (ret)
> +		return ret;
> +
> +	if (ecred == rcred)
> +		return 0;
> +
> +	old =  override_creds(ecred); /* override_creds otoh takes new ref */
> +	put_cred(old);
> +	return 0;
> +}
> +
>  /* read the entire state of the current task */
>  int restore_task(struct ckpt_ctx *ctx)
>  {
>  	int ret;
> +	struct cred *realcred, *ecred;
>  
> -	ret = restore_task_struct(ctx);
> +	ret = restore_task_struct(ctx, &realcred, &ecred);

Actually, this is one of several cases where we need to restore some
resources but only apply it to a process at the end of its restart.

Another example would be restoring pending signals and the blocked
signal mask in the future.

I suggest that we keep a pointer on the task_struct to a structure
that will hold all that do-later work. The structure can encapsulate
the pending work either explicitly - e.g. a struct with fields like
realcred, ecred, signal mask, etc... - or implicitly, by reusing the
deferqueue framework, per task.

Actually, that pointer can be kept on the ckpt_ctx structure, to be
used by the current-restarting-task only.

>  	ckpt_debug("ret %d\n", ret);
>  	if (ret < 0)
>  		goto out;
> @@ -671,6 +1120,10 @@ int restore_task(struct ckpt_ctx *ctx)
>  		goto out;
>  	ret = restore_cpu(ctx);
>  	ckpt_debug("cpu: ret %d\n", ret);
> +	if (ret < 0)
> +		goto out;
> +	ret = restore_creds(ctx, realcred, ecred);

... and this would then be called from a restore_task_finalize()
function explicitly or implicitly by deferqueue_run().

> +	ckpt_debug("creds: ret %d\n", ret);
>   out:
>  	return ret;
>  }

[...]

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-05-20 16:54   ` Oren Laadan
@ 2009-05-20 16:56   ` Oren Laadan
  3 siblings, 0 replies; 27+ messages in thread
From: Oren Laadan @ 2009-05-20 16:56 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Linux Containers, David Howells, alexey-r/Jw6+rmf7HQT0dZR+AlfA



Serge E. Hallyn wrote:

> +static struct user_namespace *restore_read_userns(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_user_ns *h;
> +	struct user_namespace *ns;
> +	struct user_struct *new_root, *creator;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_USER_NS);
> +	if (IS_ERR(h))
> +		return ERR_PTR(PTR_ERR(h));
> +	if (h->flags & CKPT_USERNS_INIT) {

Perhaps make it future-safe by disallowing other flags ?

> +		ckpt_hdr_put(ctx, h);
> +		/* grab an extra ref bc objhash will drop an extra */
> +		return get_user_ns(current_user_ns());
> +	}

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <4A1435E0.3010306-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-20 21:40       ` Serge E. Hallyn
       [not found]         ` <20090520214027.GA3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-20 21:52       ` Serge E. Hallyn
  2009-05-20 22:16       ` Serge E. Hallyn
  2 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 21:40 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, David Howells

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >  /* dump the task_struct of a given task */
> >  static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
> >  {
> >  	struct ckpt_hdr_task *h;
> >  	int ret;
> > +	int realcred_ref, ecred_ref;
> > +
> > +	realcred_ref = checkpoint_obj(ctx, t->real_cred, CKPT_OBJ_CRED);
> > +	if (realcred_ref < 0)
> > +		return realcred_ref;
> > +
> > +	ecred_ref = checkpoint_obj(ctx, t->cred, CKPT_OBJ_CRED);
> > +	if (ecred_ref < 0)
> > +		return ecred_ref;
> 
> Is this safe even if the checkpointed task's state changes ?
> (e.g. unfrozen - yes, I know there is a patch in the works to
> prevent this; but if we ever want to checkpoint STOPPED tasks...
> for instance).
> 
> Would incrementing the refcount on t->{cred,real_cred} help ?

Doesn't checkpoint_obj already do that through obj_new?

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <4A1435E0.3010306-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-20 21:40       ` Serge E. Hallyn
@ 2009-05-20 21:52       ` Serge E. Hallyn
       [not found]         ` <20090520215250.GB3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-20 22:16       ` Serge E. Hallyn
  2 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 21:52 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, David Howells

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >  /* read the task_struct into the current task */
> > -static int restore_task_struct(struct ckpt_ctx *ctx)
> > +static int restore_task_struct(struct ckpt_ctx *ctx, struct cred **realcredp,
> > +				struct cred **ecredp)
> 
> 				^^^^^^^^^^^^^^^^^^^^^    ^^^^^^^^^^^^^^^^^^^^^^^
> 				
> This probably belongs to restore_task_shared() ?

Why?  The task->cred and task->ecred are task properties, so
their reference should be stored in the ckpt_hdr_task->cred_ref
and ckpt_hdr_task->ecred_ref, no?

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]     ` <4A1435E0.3010306-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-20 21:40       ` Serge E. Hallyn
  2009-05-20 21:52       ` Serge E. Hallyn
@ 2009-05-20 22:16       ` Serge E. Hallyn
       [not found]         ` <20090520221600.GA3925-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-20 22:16 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >  /* read the entire state of the current task */
> >  int restore_task(struct ckpt_ctx *ctx)
> >  {
> >  	int ret;
> > +	struct cred *realcred, *ecred;
> >  
> > -	ret = restore_task_struct(ctx);
> > +	ret = restore_task_struct(ctx, &realcred, &ecred);
> 
> Actually, this is one of several cases where we need to restore some
> resources but only apply it to a process at the end of its restart.
> 
> Another example would be restoring pending signals and the blocked
> signal mask in the future.
> 
> I suggest that we keep a pointer on the task_struct to a structure
> that will hold all that do-later work. The structure can encapsulate
> the pending work either explicitly - e.g. a struct with fields like
> realcred, ecred, signal mask, etc... - or implicitly, by reusing the
> deferqueue framework, per task.
> 
> Actually, that pointer can be kept on the ckpt_ctx structure, to be
> used by the current-restarting-task only.
> 
> >  	ckpt_debug("ret %d\n", ret);
> >  	if (ret < 0)
> >  		goto out;
> > @@ -671,6 +1120,10 @@ int restore_task(struct ckpt_ctx *ctx)
> >  		goto out;
> >  	ret = restore_cpu(ctx);
> >  	ckpt_debug("cpu: ret %d\n", ret);
> > +	if (ret < 0)
> > +		goto out;
> > +	ret = restore_creds(ctx, realcred, ecred);
> 
> ... and this would then be called from a restore_task_finalize()
> function explicitly or implicitly by deferqueue_run().

deferqueue_run() won't do, since that's done only once for the
whole container, and we (as you say above) want to reuse one
set of fields in the ckpt_ctx for each task's sys_restart() run.

I'll go ahead and put fields in the ckpt_ctx this time around
and use those, but won't go further right now as I'd be
overgeneralizing before we have the signals and such work
done.  When we do that, we can move the restore_creds() fn
if appropriate.

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]         ` <20090520221600.GA3925-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-21  6:03           ` Oren Laadan
  0 siblings, 0 replies; 27+ messages in thread
From: Oren Laadan @ 2009-05-21  6:03 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>  /* read the entire state of the current task */
>>>  int restore_task(struct ckpt_ctx *ctx)
>>>  {
>>>  	int ret;
>>> +	struct cred *realcred, *ecred;
>>>  
>>> -	ret = restore_task_struct(ctx);
>>> +	ret = restore_task_struct(ctx, &realcred, &ecred);
>> Actually, this is one of several cases where we need to restore some
>> resources but only apply it to a process at the end of its restart.
>>
>> Another example would be restoring pending signals and the blocked
>> signal mask in the future.
>>
>> I suggest that we keep a pointer on the task_struct to a structure
>> that will hold all that do-later work. The structure can encapsulate
>> the pending work either explicitly - e.g. a struct with fields like
>> realcred, ecred, signal mask, etc... - or implicitly, by reusing the
>> deferqueue framework, per task.
>>
>> Actually, that pointer can be kept on the ckpt_ctx structure, to be
>> used by the current-restarting-task only.
>>
>>>  	ckpt_debug("ret %d\n", ret);
>>>  	if (ret < 0)
>>>  		goto out;
>>> @@ -671,6 +1120,10 @@ int restore_task(struct ckpt_ctx *ctx)
>>>  		goto out;
>>>  	ret = restore_cpu(ctx);
>>>  	ckpt_debug("cpu: ret %d\n", ret);
>>> +	if (ret < 0)
>>> +		goto out;
>>> +	ret = restore_creds(ctx, realcred, ecred);
>> ... and this would then be called from a restore_task_finalize()
>> function explicitly or implicitly by deferqueue_run().
> 
> deferqueue_run() won't do, since that's done only once for the
> whole container, and we (as you say above) want to reuse one
> set of fields in the ckpt_ctx for each task's sys_restart() run.

I meant to add another deferqueue (either per task or on the
ckpt_ctx), for this specific purpose.

Oren.

> 
> I'll go ahead and put fields in the ckpt_ctx this time around
> and use those, but won't go further right now as I'd be
> overgeneralizing before we have the signals and such work
> done.  When we do that, we can move the restore_creds() fn
> if appropriate.
> 
> -serge
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]         ` <20090520214027.GA3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-21 14:02           ` Oren Laadan
       [not found]             ` <4A155EEC.9070509-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Oren Laadan @ 2009-05-21 14:02 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, David Howells



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>  /* dump the task_struct of a given task */
>>>  static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
>>>  {
>>>  	struct ckpt_hdr_task *h;
>>>  	int ret;
>>> +	int realcred_ref, ecred_ref;
>>> +
>>> +	realcred_ref = checkpoint_obj(ctx, t->real_cred, CKPT_OBJ_CRED);
>>> +	if (realcred_ref < 0)
>>> +		return realcred_ref;
>>> +
>>> +	ecred_ref = checkpoint_obj(ctx, t->cred, CKPT_OBJ_CRED);
>>> +	if (ecred_ref < 0)
>>> +		return ecred_ref;
>> Is this safe even if the checkpointed task's state changes ?
>> (e.g. unfrozen - yes, I know there is a patch in the works to
>> prevent this; but if we ever want to checkpoint STOPPED tasks...
>> for instance).
>>
>> Would incrementing the refcount on t->{cred,real_cred} help ?
> 
> Doesn't checkpoint_obj already do that through obj_new?
> 

No, it does not. There is a (potentially long) window of opportunity
between the callback invoked from checkpoint_obj() - where the pointer
is used, and when checkpoint_obj() later takes the extra reference.

See for comparison checkpoint_mm_obj(), it safely grabs the task->mm
(with a reference) around the invocation of checkpoint_obj().

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]         ` <20090520215250.GB3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-21 14:13           ` Oren Laadan
  0 siblings, 0 replies; 27+ messages in thread
From: Oren Laadan @ 2009-05-21 14:13 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, David Howells



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>  /* read the task_struct into the current task */
>>> -static int restore_task_struct(struct ckpt_ctx *ctx)
>>> +static int restore_task_struct(struct ckpt_ctx *ctx, struct cred **realcredp,
>>> +				struct cred **ecredp)
>> 				^^^^^^^^^^^^^^^^^^^^^    ^^^^^^^^^^^^^^^^^^^^^^^
>> 				
>> This probably belongs to restore_task_shared() ?
> 
> Why?  The task->cred and task->ecred are task properties, so
> their reference should be stored in the ckpt_hdr_task->cred_ref
> and ckpt_hdr_task->ecred_ref, no?
> 

Because they are shared objects; task->mm is also a task property...

checkpoint,restore}_task_struct() handles c/r of task properties
that are "private" to the task.

{checkpoint,restore}_task_shared() handles c/r of task properties
that are shared.

This division makes it easier to see the dependencies between
shared resources (and themselves, or others), by explicitly
reordering if necessary. For example, ns precedes mm because
sysvipc shm depends on that of ns.

In this case, for similar reasons, I'd place the user ns right
there before the rest of the ns, because sysvipc of ns depends
on credentials from user-ns.

Oren.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] cr: checkpoint and restore task credentials
       [not found]             ` <4A155EEC.9070509-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-21 14:14               ` Serge E. Hallyn
  0 siblings, 0 replies; 27+ messages in thread
From: Serge E. Hallyn @ 2009-05-21 14:14 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, David Howells

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >>>  /* dump the task_struct of a given task */
> >>>  static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
> >>>  {
> >>>  	struct ckpt_hdr_task *h;
> >>>  	int ret;
> >>> +	int realcred_ref, ecred_ref;
> >>> +
> >>> +	realcred_ref = checkpoint_obj(ctx, t->real_cred, CKPT_OBJ_CRED);
> >>> +	if (realcred_ref < 0)
> >>> +		return realcred_ref;
> >>> +
> >>> +	ecred_ref = checkpoint_obj(ctx, t->cred, CKPT_OBJ_CRED);
> >>> +	if (ecred_ref < 0)
> >>> +		return ecred_ref;
> >> Is this safe even if the checkpointed task's state changes ?
> >> (e.g. unfrozen - yes, I know there is a patch in the works to
> >> prevent this; but if we ever want to checkpoint STOPPED tasks...
> >> for instance).
> >>
> >> Would incrementing the refcount on t->{cred,real_cred} help ?
> > 
> > Doesn't checkpoint_obj already do that through obj_new?
> > 
> 
> No, it does not. There is a (potentially long) window of opportunity
> between the callback invoked from checkpoint_obj() - where the pointer
> is used, and when checkpoint_obj() later takes the extra reference.
> 
> See for comparison checkpoint_mm_obj(), it safely grabs the task->mm
> (with a reference) around the invocation of checkpoint_obj().

Hmm, ok.  Will do.

-serge

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2009-05-21 14:14 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-19  1:44 [PATCH 0/6] cr: credentials Serge E. Hallyn
     [not found] ` <20090519014446.GA28277-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-19  1:45   ` [PATCH 1/6] cr: break out new_user_ns() Serge E. Hallyn
2009-05-19  1:45   ` [PATCH 2/6] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
2009-05-19  1:45   ` [PATCH 3/6] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
2009-05-19  1:45   ` [PATCH 4/6] cr: checkpoint and restore task credentials Serge E. Hallyn
2009-05-19  1:45   ` [PATCH 5/6] cr: restore file->f_cred Serge E. Hallyn
     [not found]     ` <20090519014546.GE28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-20 15:08       ` Oren Laadan
     [not found]         ` <4A141CEE.2080100-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-05-20 15:25           ` Serge E. Hallyn
     [not found]             ` <20090520152527.GA28585-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-20 15:26               ` Oren Laadan
2009-05-19  1:45   ` [PATCH 6/6] user namespaces: debug refcounts Serge E. Hallyn
     [not found] ` <20090519014538.GD28312-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-19  8:26   ` [PATCH 4/6] cr: checkpoint and restore task credentials David Howells
     [not found]     ` <16258.1242721606-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2009-05-19 13:35       ` Serge E. Hallyn
     [not found]     ` <20090519133526.GB32685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-19 14:26       ` David Howells
     [not found]         ` <19394.1242743199-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2009-05-19 14:46           ` Serge E. Hallyn
2009-05-20 15:35   ` Oren Laadan
     [not found]     ` <4A142350.1060308-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-05-20 15:53       ` Serge E. Hallyn
     [not found]         ` <20090520155332.GA28999-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-20 16:08           ` Oren Laadan
     [not found]             ` <4A142B05.4040907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-05-20 16:13               ` Serge E. Hallyn
2009-05-20 16:54   ` Oren Laadan
     [not found]     ` <4A1435E0.3010306-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-05-20 21:40       ` Serge E. Hallyn
     [not found]         ` <20090520214027.GA3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-21 14:02           ` Oren Laadan
     [not found]             ` <4A155EEC.9070509-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-05-21 14:14               ` Serge E. Hallyn
2009-05-20 21:52       ` Serge E. Hallyn
     [not found]         ` <20090520215250.GB3517-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-21 14:13           ` Oren Laadan
2009-05-20 22:16       ` Serge E. Hallyn
     [not found]         ` <20090520221600.GA3925-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-21  6:03           ` Oren Laadan
2009-05-20 16:56   ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.