All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] [RFC] c/r: Add UTS support
@ 2009-03-12 17:56 Dan Smith
       [not found] ` <1236880612-15316-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-12 17:56 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

(apologies if you see this multiple times...IBM mail server troubles)

This patch adds a "phase" of c/r that saves out information about any
namespaces the task(s) may have.  Do this by tracking the nsproxy of the
first task and making sure that the tasks that follow get hooked back to
share the same one on restart.  On restart, we also explicitly create new
namespaces for any that we support at the first task.

I tested this with single and multiple task restore, on top of Oren's
v13 tree.

This is intended to get some discussion going about how to c/r the various
namespaces.  So, if this approach isn't favorable, speak up with some
suggestions for how to do it better.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/checkpoint.c        |   57 ++++++++++++++++++++++++++
 checkpoint/objhash.c           |    7 +++
 checkpoint/restart.c           |   87 ++++++++++++++++++++++++++++++++++++++++
 include/linux/checkpoint.h     |    1 +
 include/linux/checkpoint_hdr.h |   15 +++++++
 5 files changed, 167 insertions(+), 0 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 64155de..a613f2d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -193,6 +193,59 @@ static int cr_write_tail(struct cr_ctx *ctx)
 	return ret;
 }
 
+static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct new_utsname *n = &t->nsproxy->uts_ns->name;
+	int ret;
+
+	h.type = CR_HDR_UTSNS;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	memcpy(hh->nodename, n->nodename, sizeof(n->nodename));
+	memcpy(hh->domainname, n->domainname, sizeof(n->domainname));
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
+static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_nsproxy *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct nsproxy *nsp = t->nsproxy;
+	int ret;
+	int new;
+
+	h.type = CR_HDR_NSP;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	new = cr_obj_add_ptr(ctx, nsp, &hh->objref, CR_OBJ_NSP, 0);
+	if (new)
+		hh->types = CR_NSP_UTS; /* Record types we support */
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret)
+		goto out;
+
+	if (new) {
+		ret = cr_write_ns_uts(ctx, t);
+		if (ret < 0)
+			goto out;
+
+		/* FIXME: Write other namespaces here */
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
 /* dump the task_struct of a given task */
 static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
 {
@@ -230,6 +283,10 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = cr_write_namespaces(ctx, t);
+	pr_debug("namespaces: ret %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = cr_write_mm(ctx, t);
 	pr_debug("memory: ret %d\n", ret);
 	if (ret < 0)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index ee31b38..aaaf583 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -12,6 +12,7 @@
 #include <linux/file.h>
 #include <linux/hash.h>
 #include <linux/checkpoint.h>
+#include <linux/utsname.h>
 
 struct cr_objref {
 	int objref;
@@ -35,6 +36,9 @@ static void cr_obj_ref_drop(struct cr_objref *obj)
 	case CR_OBJ_FILE:
 		fput((struct file *) obj->ptr);
 		break;
+	case CR_OBJ_NSP:
+		put_nsproxy((struct nsproxy *) obj->ptr);
+		break;
 	default:
 		BUG();
 	}
@@ -46,6 +50,9 @@ static void cr_obj_ref_grab(struct cr_objref *obj)
 	case CR_OBJ_FILE:
 		get_file((struct file *) obj->ptr);
 		break;
+	case CR_OBJ_NSP:
+		get_nsproxy((struct nsproxy *) obj->ptr);
+		break;
 	default:
 		BUG();
 	}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 7ec4de4..513d772 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -15,6 +15,7 @@
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/utsname.h>
 
 #include "checkpoint_arch.h"
 
@@ -213,6 +214,88 @@ static int cr_read_tail(struct cr_ctx *ctx)
 	return ret;
 }
 
+/* Read the new UTS namespace record and adjust uts_ns accordingly */
+static int cr_read_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct new_utsname *n = &t->nsproxy->uts_ns->name;
+	int ret = 0;
+	int parent;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_UTSNS);
+	if (parent != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	memset(n->nodename, 0, sizeof(n->nodename));
+	memcpy(n->nodename, hh->nodename, sizeof(n->nodename)-1);
+
+	memset(n->domainname, 0, sizeof(n->nodename));
+	memcpy(n->domainname, hh->domainname, sizeof(n->domainname)-1);
+
+	pr_debug("read uts_ns: nn=%s dn=%s\n", n->nodename, n->domainname);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
+/* Convert our type flags to CLONE flags */
+static unsigned long cr_ns_clone_flags(unsigned long types)
+{
+	unsigned long flags = 0;
+
+	if (types & CR_NSP_UTS)
+		flags |= CLONE_NEWUTS;
+
+	return flags;
+}
+
+/* Read namespace records for this task */
+static int cr_read_namespaces(struct cr_ctx *ctx)
+{
+	struct cr_hdr_nsproxy *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct task_struct *t = current;
+	struct nsproxy *nsp;
+	int parent;
+	int ret = 0;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_NSP);
+	if (parent != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nsp = cr_obj_get_by_ref(ctx, hh->objref, CR_OBJ_NSP);
+	if (nsp != NULL) {
+		t->nsproxy = nsp;
+		goto out;
+	}
+
+	if (hh->types == 0)
+		goto out; /* No namespaces to read */
+
+	/* Get the obligatory new set of namespaces */
+	ret = copy_namespaces(cr_ns_clone_flags(hh->types), t);
+	if (ret)
+		goto out;
+
+	ret = cr_obj_add_ref(ctx, t->nsproxy,
+			     hh->objref, CR_OBJ_NSP, 0);
+	if (ret)
+		goto out;
+
+	if (hh->types & CR_NSP_UTS)
+		ret = cr_read_ns_uts(ctx, t);
+
+	/* FIXME: Read other namespaces here */
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
 /* read the task_struct into the current task */
 static int cr_read_task_struct(struct cr_ctx *ctx)
 {
@@ -258,6 +341,10 @@ static int cr_read_task(struct cr_ctx *ctx)
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = cr_read_namespaces(ctx);
+	pr_debug("namespaces: ret %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = cr_read_mm(ctx);
 	pr_debug("memory: ret %d\n", ret);
 	if (ret < 0)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 217cf6e..5966275 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -75,6 +75,7 @@ extern void cr_ctx_put(struct cr_ctx *ctx);
 
 enum {
 	CR_OBJ_FILE = 1,
+	CR_OBJ_NSP,
 	CR_OBJ_MAX
 };
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 6dc739f..1413572 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -49,6 +49,8 @@ enum {
 	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
+	CR_HDR_NSP,
+	CR_HDR_UTSNS,
 
 	CR_HDR_MM = 201,
 	CR_HDR_VMA,
@@ -156,4 +158,17 @@ struct cr_hdr_fd_data {
 	__u64 f_version;
 } __attribute__((aligned(8)));
 
+#define CR_NSP_UTS 1
+
+struct cr_hdr_nsproxy {
+	__u32 objref;
+	__u32 types;
+};
+
+struct cr_hdr_utsns {
+	/* Both of these fields are defined as 65-chars long */
+	char nodename[65];
+	char domainname[65];
+};
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found] ` <1236880612-15316-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12 21:29   ` Nathan Lynch
  2009-03-12 21:56     ` Dan Smith
  2009-03-18  8:35   ` Oren Laadan
  1 sibling, 1 reply; 41+ messages in thread
From: Nathan Lynch @ 2009-03-12 21:29 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Hi Dan.

Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:

> (apologies if you see this multiple times...IBM mail server troubles)
> 
> This patch adds a "phase" of c/r that saves out information about any
> namespaces the task(s) may have.  Do this by tracking the nsproxy of the
> first task and making sure that the tasks that follow get hooked back to
> share the same one on restart.  On restart, we also explicitly create new
> namespaces for any that we support at the first task.

I'd like there to be some discussion about this, because namespace
creation seems like a significant addition to the semantics of restart
as I understand it.  Is namespace creation during restart unavoidable,
or merely desirable?  Is there a case for requiring the user to provide
a suitable namespace environment before attempting restart?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
  2009-03-12 21:29   ` Nathan Lynch
@ 2009-03-12 21:56     ` Dan Smith
       [not found]       ` <87fxhipfrh.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-12 21:56 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

NL> I'd like there to be some discussion about this, because namespace
NL> creation seems like a significant addition to the semantics of
NL> restart as I understand it.

Indeed.

NL> Is namespace creation during restart unavoidable, or merely
NL> desirable?  Is there a case for requiring the user to provide a
NL> suitable namespace environment before attempting restart?

Information about the namespaces has to be saved at checkpoint time no
matter what, right?  I guess I don't see any compelling reason to not
have the restart operation replicate the environment of the original
process.  Otherwise we require userspace to read and interpret the
checkpoint stream and selectively feed the bits that the kernel is
responsible for to the kernel and process the rest itself (or have the
kernel ignore those records).

What's the argument for depending on userspace to set this up?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]       ` <87fxhipfrh.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-12 22:48         ` Serge E. Hallyn
       [not found]           ` <20090312224820.GA12723-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
  2009-03-12 22:48         ` Daniel Lezcano
  1 sibling, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-12 22:48 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> NL> I'd like there to be some discussion about this, because namespace
> NL> creation seems like a significant addition to the semantics of
> NL> restart as I understand it.
> 
> Indeed.
> 
> NL> Is namespace creation during restart unavoidable, or merely
> NL> desirable?  Is there a case for requiring the user to provide a
> NL> suitable namespace environment before attempting restart?
> 
> Information about the namespaces has to be saved at checkpoint time no
> matter what, right?  I guess I don't see any compelling reason to not
> have the restart operation replicate the environment of the original
> process.  Otherwise we require userspace to read and interpret the
> checkpoint stream and selectively feed the bits that the kernel is
> responsible for to the kernel and process the rest itself (or have the
> kernel ignore those records).

Well we haven't yet settled the Oren-vs-Alexey argument of who sets
up the new process tree.  Oren would have userspace parse the checkpoint
file, do a bunch of clones to recreate the process tree, then have each
task call sys_restart().  Alexey would have one task call sys_restart()
where the system call then recreates the process tree.

You seem to be doing half of each :)  I think if we go with Oren's
route then we should use CLONE_NEWPID etc flags from userspace to set
up the proper relationships.  If we go with Alexey's, then the kernel
can just hand-create the nsproxies somewhat the way you are doing using
copy_namespaces().

> What's the argument for depending on userspace to set this up?

Well it forces restart to go through the established userspace API's
when creating resources (in this case, tasks and namespaces) which
means any existing security guarantees are leveraged.

If we go with your patch, we suddenly have to worry about whether
restart is a way to get around the CAP_SYS_ADMIN requirements for
cloning a new namespace.  Just as an example.

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]       ` <87fxhipfrh.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-03-12 22:48         ` Serge E. Hallyn
@ 2009-03-12 22:48         ` Daniel Lezcano
       [not found]           ` <49B99144.9000106-GANU6spQydw@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-12 22:48 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> NL> I'd like there to be some discussion about this, because namespace
> NL> creation seems like a significant addition to the semantics of
> NL> restart as I understand it.
>
> Indeed.
>
> NL> Is namespace creation during restart unavoidable, or merely
> NL> desirable?  Is there a case for requiring the user to provide a
> NL> suitable namespace environment before attempting restart?
>
> Information about the namespaces has to be saved at checkpoint time no
> matter what, right?  I guess I don't see any compelling reason to not
> have the restart operation replicate the environment of the original
> process.  Otherwise we require userspace to read and interpret the
> checkpoint stream and selectively feed the bits that the kernel is
> responsible for to the kernel and process the rest itself (or have the
> kernel ignore those records).
>   

Assuming you have a process and this one unshared the network 100 times 
and each time opens a socket, how do you checkpoint these namespaces ?

> What's the argument for depending on userspace to set this up?
>   
Maybe, CR of the namespaces is more complicate topic than it looks like 
and the CR itself is big enough to not complicate things. IMHO, I would 
recommend as the first step to forbid the unshare inside a container and 
let the container implementation to save the configuration with the 
statefile in order to recreate it at the restart

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]           ` <20090312224820.GA12723-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12 22:56             ` Dan Smith
       [not found]               ` <87bps6pcyf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-12 22:56 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

SH> Well it forces restart to go through the established userspace
SH> API's when creating resources (in this case, tasks and namespaces)
SH> which means any existing security guarantees are leveraged.

That's a very valid point.  However, it still seems unbalanced to make
checkpoint a completely in-kernel process and restart an odd mix of
the two with potentially more confusing semantics and requirements.

SH> If we go with your patch, we suddenly have to worry about whether
SH> restart is a way to get around the CAP_SYS_ADMIN requirements for
SH> cloning a new namespace.  Just as an example.

Why?  The call to copy_namespaces() will do the CAP_SYS_ADMIN check,
right?  Maybe your point is that in the restart implementation of
other namespace types we could potentially slide in a call to
something else that has already assumed the check has been made?  I
think that doing the obligatory copy_namespaces() during the restart
helps catch that case early and explicitly, no?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]           ` <49B99144.9000106-GANU6spQydw@public.gmane.org>
@ 2009-03-12 22:58             ` Dan Smith
       [not found]               ` <877i2upcvo.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-03-18  8:32             ` Oren Laadan
  1 sibling, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-12 22:58 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

DL> Assuming you have a process and this one unshared the network 100
DL> times and each time opens a socket, how do you checkpoint these
DL> namespaces ?

>> What's the argument for depending on userspace to set this up?
>> 
DL> Maybe, CR of the namespaces is more complicate topic than it looks
DL> like and the CR itself is big enough to not complicate
DL> things. IMHO, I would recommend as the first step to forbid the
DL> unshare inside a container and let the container implementation to
DL> save the configuration with the statefile in order to recreate it
DL> at the restart

I think what you're suggesting here is some sort of check to make sure
we don't allow checkpointing a process with nested namespaces... is
that correct?  If so, I agree.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]               ` <877i2upcvo.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-12 23:11                 ` Daniel Lezcano
       [not found]                   ` <49B996BC.1090908-GANU6spQydw@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-12 23:11 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> DL> Assuming you have a process and this one unshared the network 100
> DL> times and each time opens a socket, how do you checkpoint these
> DL> namespaces ?
>
>   
>>> What's the argument for depending on userspace to set this up?
>>>
>>>       
> DL> Maybe, CR of the namespaces is more complicate topic than it looks
> DL> like and the CR itself is big enough to not complicate
> DL> things. IMHO, I would recommend as the first step to forbid the
> DL> unshare inside a container and let the container implementation to
> DL> save the configuration with the statefile in order to recreate it
> DL> at the restart
>
> I think what you're suggesting here is some sort of check to make sure
> we don't allow checkpointing a process with nested namespaces... is
> that correct?  If so, I agree.
>   
Correct.

I guess it will be esay to implement with a nsproxy level counter.
Each time you unshare, the new nsproxy count is incremented.
Assuming the init_nsproxy is level 0, when the nsproxy counter is > 1, 
the process is uncheckpointable.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                   ` <49B996BC.1090908-GANU6spQydw@public.gmane.org>
@ 2009-03-12 23:13                     ` Dan Smith
       [not found]                       ` <873adipc5l.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-12 23:13 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

DL> I guess it will be esay to implement with a nsproxy level counter.
DL> Each time you unshare, the new nsproxy count is incremented.
DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
DL> > 1, the process is uncheckpointable.

This should also be possible by just making sure that the nsproxy of
the root process being checkpointed is the same as any of the
children, correct?  That way we avoid having to modify the core
nsproxy bits and can still reject any nested namespaces.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                       ` <873adipc5l.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-12 23:24                         ` Daniel Lezcano
       [not found]                           ` <49B999A6.2000005-GANU6spQydw@public.gmane.org>
  2009-03-13 15:59                         ` Cedric Le Goater
  1 sibling, 1 reply; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-12 23:24 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> DL> I guess it will be esay to implement with a nsproxy level counter.
> DL> Each time you unshare, the new nsproxy count is incremented.
> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
> DL> > 1, the process is uncheckpointable.
>
> This should also be possible by just making sure that the nsproxy of
> the root process being checkpointed is the same as any of the
> children, correct?  That way we avoid having to modify the core
> nsproxy bits and can still reject any nested namespaces.
>   
Right, this is another option. The nsproxy counter will allow to flag at 
runtime a process to be uncheckpointable. The nsproxy comparison will 
detect nested nsproxies at checkpoint time.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]               ` <87bps6pcyf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-13  0:12                 ` Serge E. Hallyn
  2009-03-18  8:27                 ` Oren Laadan
  1 sibling, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-13  0:12 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> SH> Well it forces restart to go through the established userspace
> SH> API's when creating resources (in this case, tasks and namespaces)
> SH> which means any existing security guarantees are leveraged.
> 
> That's a very valid point.  However, it still seems unbalanced to make
> checkpoint a completely in-kernel process and restart an odd mix of
> the two with potentially more confusing semantics and requirements.

core_dump vs gdb? :)

> SH> If we go with your patch, we suddenly have to worry about whether
> SH> restart is a way to get around the CAP_SYS_ADMIN requirements for
> SH> cloning a new namespace.  Just as an example.
> 
> Why?  The call to copy_namespaces() will do the CAP_SYS_ADMIN check,

Had to check, but yes you're right.

> right?  Maybe your point is that in the restart implementation of
> other namespace types we could potentially slide in a call to
> something else that has already assumed the check has been made?  I
> think that doing the obligatory copy_namespaces() during the restart
> helps catch that case early and explicitly, no?

Actually we can go a step further and say we expect user-space to
set the hostname, which otherwise (admittedly in a container) the
user, with your patch, can now do unprivileged, right?

In particular, once it comes to setting up network devices for a
container at restart, I think we'll find userspace a far easier
place to work.

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                           ` <49B999A6.2000005-GANU6spQydw@public.gmane.org>
@ 2009-03-13 15:30                             ` Serge E. Hallyn
       [not found]                               ` <20090313153004.GA8317-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-13 15:30 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
> Dan Smith wrote:
> > DL> I guess it will be esay to implement with a nsproxy level counter.
> > DL> Each time you unshare, the new nsproxy count is incremented.
> > DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
> > DL> > 1, the process is uncheckpointable.
> >
> > This should also be possible by just making sure that the nsproxy of
> > the root process being checkpointed is the same as any of the
> > children, correct?  That way we avoid having to modify the core
> > nsproxy bits and can still reject any nested namespaces.
> >   
> Right, this is another option. The nsproxy counter will allow to flag at 
> runtime a process to be uncheckpointable. The nsproxy comparison will 
> detect nested nsproxies at checkpoint time.

Or, to stick more to the resource->may_checkpoint way of doing it, you
setbit(&nsproxy->uts_ns->may_checkpoint, 0) when the uts_ns is
created, and anytime a task does clone(CLONE_NEWUTS) or
unshare(CLONE_NEWUTS), you clear the bit on the parent uts_ns.

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                               ` <20090313153004.GA8317-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-13 15:51                                 ` Daniel Lezcano
       [not found]                                   ` <49BA811C.4070302-GANU6spQydw@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-13 15:51 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Serge E. Hallyn wrote:
> Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
>   
>> Dan Smith wrote:
>>     
>>> DL> I guess it will be esay to implement with a nsproxy level counter.
>>> DL> Each time you unshare, the new nsproxy count is incremented.
>>> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
>>> DL> > 1, the process is uncheckpointable.
>>>
>>> This should also be possible by just making sure that the nsproxy of
>>> the root process being checkpointed is the same as any of the
>>> children, correct?  That way we avoid having to modify the core
>>> nsproxy bits and can still reject any nested namespaces.
>>>   
>>>       
>> Right, this is another option. The nsproxy counter will allow to flag at 
>> runtime a process to be uncheckpointable. The nsproxy comparison will 
>> detect nested nsproxies at checkpoint time.
>>     
>
> Or, to stick more to the resource->may_checkpoint way of doing it, you
> setbit(&nsproxy->uts_ns->may_checkpoint, 0) when the uts_ns is
> created, and anytime a task does clone(CLONE_NEWUTS) or
> unshare(CLONE_NEWUTS), you clear the bit on the parent uts_ns.
>   
Hmm, you will need to add a back pointer for the nsproxy | utsns parent, 
no ?
What I was proposing is a counter directly in the nsproxy. Maybe instead 
of initializing it to zero, it can be initialized to the max supported 
nested level ( only one right now) and decrement each time a clone or a 
unshare is done whatever the namespace.

init nsproxy->may_checkpoint = 2
First clone | unshare => for the new nsproxy the counter may_checkpoint 
becomes 1
Second clone | unshare (forbidden) => may_checkpoint becomes 0

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                       ` <873adipc5l.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-03-12 23:24                         ` Daniel Lezcano
@ 2009-03-13 15:59                         ` Cedric Le Goater
       [not found]                           ` <49BA82CE.4090206-GANU6spQydw@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Cedric Le Goater @ 2009-03-13 15:59 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> DL> I guess it will be esay to implement with a nsproxy level counter.
> DL> Each time you unshare, the new nsproxy count is incremented.
> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
> DL> > 1, the process is uncheckpointable.
> 
> This should also be possible by just making sure that the nsproxy of
> the root process being checkpointed is the same as any of the
> children, correct?  That way we avoid having to modify the core
> nsproxy bits and can still reject any nested namespaces.

Daniel L, could we cleanup the patch we have on ns_group which filters
out the clone() done with the 'wrong' clone flags ? 

Thanks,

C. 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                           ` <49BA82CE.4090206-GANU6spQydw@public.gmane.org>
@ 2009-03-13 16:04                             ` Daniel Lezcano
  0 siblings, 0 replies; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-13 16:04 UTC (permalink / raw)
  To: Cedric Le Goater
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Cedric Le Goater wrote:
> Dan Smith wrote:
>   
>> DL> I guess it will be esay to implement with a nsproxy level counter.
>> DL> Each time you unshare, the new nsproxy count is incremented.
>> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
>> DL> > 1, the process is uncheckpointable.
>>
>> This should also be possible by just making sure that the nsproxy of
>> the root process being checkpointed is the same as any of the
>> children, correct?  That way we avoid having to modify the core
>> nsproxy bits and can still reject any nested namespaces.
>>     
>
> Daniel L, could we cleanup the patch we have on ns_group which filters
> out the clone() done with the 'wrong' clone flags ? 
>
> Thanks,
>   
Sure.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                   ` <49BA811C.4070302-GANU6spQydw@public.gmane.org>
@ 2009-03-13 17:15                                     ` Serge E. Hallyn
       [not found]                                       ` <20090313171556.GB10685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-13 17:15 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
> Serge E. Hallyn wrote:
>> Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
>>   
>>> Dan Smith wrote:
>>>     
>>>> DL> I guess it will be esay to implement with a nsproxy level counter.
>>>> DL> Each time you unshare, the new nsproxy count is incremented.
>>>> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
>>>> DL> > 1, the process is uncheckpointable.
>>>>
>>>> This should also be possible by just making sure that the nsproxy of
>>>> the root process being checkpointed is the same as any of the
>>>> children, correct?  That way we avoid having to modify the core
>>>> nsproxy bits and can still reject any nested namespaces.
>>>>         
>>> Right, this is another option. The nsproxy counter will allow to flag 
>>> at runtime a process to be uncheckpointable. The nsproxy comparison 
>>> will detect nested nsproxies at checkpoint time.
>>>     
>>
>> Or, to stick more to the resource->may_checkpoint way of doing it, you
>> setbit(&nsproxy->uts_ns->may_checkpoint, 0) when the uts_ns is
>> created, and anytime a task does clone(CLONE_NEWUTS) or
>> unshare(CLONE_NEWUTS), you clear the bit on the parent uts_ns.
>>   
> Hmm, you will need to add a back pointer for the nsproxy | utsns parent,  
> no ?

Why?

> What I was proposing is a counter directly in the nsproxy. Maybe instead  
> of initializing it to zero, it can be initialized to the max supported  
> nested level ( only one right now) and decrement each time a clone or a  
> unshare is done whatever the namespace.
>
> init nsproxy->may_checkpoint = 2
> First clone | unshare => for the new nsproxy the counter may_checkpoint  
> becomes 1

I don't understand why it gets decremented twice before not being
checkpointable - or do you mean that by the time the nsproxy is
useful it will be 1?   2 is basically an init-only unused phase?

> Second clone | unshare (forbidden) => may_checkpoint becomes 0

Ok but if I

	unshare(CLONE_NEWNS)

I'll get a new nsproxy with an old uts_ns.  So we'll need
some (potentially complicated) logic at nsproxy creation to
decide whether the namespaces being cloned or not being cloned
impact the checkpointability of the new nsproxy...

Hmm.  I think I prefer making sure that the uts_ns is the
same for all checkpointed tasks :)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                       ` <20090313171556.GB10685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-13 17:53                                         ` Daniel Lezcano
       [not found]                                           ` <49BA9D9C.2030208-GANU6spQydw@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Daniel Lezcano @ 2009-03-13 17:53 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Serge E. Hallyn wrote:
> Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
>   
>> Serge E. Hallyn wrote:
>>     
>>> Quoting Daniel Lezcano (daniel.lezcano-GANU6spQydw@public.gmane.org):
>>>   
>>>       
>>>> Dan Smith wrote:
>>>>     
>>>>         
>>>>> DL> I guess it will be esay to implement with a nsproxy level counter.
>>>>> DL> Each time you unshare, the new nsproxy count is incremented.
>>>>> DL> Assuming the init_nsproxy is level 0, when the nsproxy counter is
>>>>> DL> > 1, the process is uncheckpointable.
>>>>>
>>>>> This should also be possible by just making sure that the nsproxy of
>>>>> the root process being checkpointed is the same as any of the
>>>>> children, correct?  That way we avoid having to modify the core
>>>>> nsproxy bits and can still reject any nested namespaces.
>>>>>         
>>>>>           
>>>> Right, this is another option. The nsproxy counter will allow to flag 
>>>> at runtime a process to be uncheckpointable. The nsproxy comparison 
>>>> will detect nested nsproxies at checkpoint time.
>>>>     
>>>>         
>>> Or, to stick more to the resource->may_checkpoint way of doing it, you
>>> setbit(&nsproxy->uts_ns->may_checkpoint, 0) when the uts_ns is
>>> created, and anytime a task does clone(CLONE_NEWUTS) or
>>> unshare(CLONE_NEWUTS), you clear the bit on the parent uts_ns.
>>>   
>>>       
>> Hmm, you will need to add a back pointer for the nsproxy | utsns parent,  
>> no ?
>>     
>
> Why?
>   
Never mind, I talked too fast :) we have both parent and child namespace 
in the clone and unshare functions.
>> What I was proposing is a counter directly in the nsproxy. Maybe instead  
>> of initializing it to zero, it can be initialized to the max supported  
>> nested level ( only one right now) and decrement each time a clone or a  
>> unshare is done whatever the namespace.
>>
>> init nsproxy->may_checkpoint = 2
>> First clone | unshare => for the new nsproxy the counter may_checkpoint  
>> becomes 1
>>     
>
> I don't understand why it gets decremented twice before not being
> checkpointable - or do you mean that by the time the nsproxy is
> useful it will be 1?   2 is basically an init-only unused phase?
>   
The counter is initialized on the system to 2, which is (max level + 1), 
it is for the init namespaces.

>> Second clone | unshare (forbidden) => may_checkpoint becomes 0
>>     
>
> Ok but if I
>
> 	unshare(CLONE_NEWNS)
>
> I'll get a new nsproxy with an old uts_ns.  So we'll need
> some (potentially complicated) logic at nsproxy creation to
> decide whether the namespaces being cloned or not being cloned
> impact the checkpointability of the new nsproxy...
>
> Hmm.  I think I prefer making sure that the uts_ns is the
> same for all checkpointed tasks :)
>   
Yes, this is an alternative. Maybe, I will say something stupid but IMO 
the "maycheckpoint" will depends on what you assume you have for the CR:
    1) the container is instantiated in one step, that is 
clone(mycloneflags) and that's all, any other clone/unshare is 
forbidden. In this case, you can concentrate the code in the nsproxy 
structure.
    2) the container can be instantiated in several steps, that is 
several clone/unshare but with different namespaces. In this case, you 
have to take care of all the namespaces and do a "maycheckpoint" for 
each of them.

IMHO, both solutions are valid. Of course, we are talking about short 
term solution :)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]               ` <87bps6pcyf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-03-13  0:12                 ` Serge E. Hallyn
@ 2009-03-18  8:27                 ` Oren Laadan
       [not found]                   ` <49C0B069.6060300-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Oren Laadan @ 2009-03-18  8:27 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch



Dan Smith wrote:
> SH> Well it forces restart to go through the established userspace
> SH> API's when creating resources (in this case, tasks and namespaces)
> SH> which means any existing security guarantees are leveraged.
> 
> That's a very valid point.  However, it still seems unbalanced to make
> checkpoint a completely in-kernel process and restart an odd mix of
> the two with potentially more confusing semantics and requirements.
> 

There are other reasons to allow restart to be not fully symmetric
with respect to checkpoint. For example, if you have a smart(er) user
space application that wants to provide the restart some of the resources
pre-constructed, allowing much flexibility (already requested by people)
for the restart provdure (E.g., when doing distributed checkpoint, or
when restarting a special device whose).

See my post:
https://lists.linux-foundation.org/pipermail/containers/2009-March/016234.html

Oren.

> SH> If we go with your patch, we suddenly have to worry about whether
> SH> restart is a way to get around the CAP_SYS_ADMIN requirements for
> SH> cloning a new namespace.  Just as an example.
> 
> Why?  The call to copy_namespaces() will do the CAP_SYS_ADMIN check,
> right?  Maybe your point is that in the restart implementation of
> other namespace types we could potentially slide in a call to
> something else that has already assumed the check has been made?  I
> think that doing the obligatory copy_namespaces() during the restart
> helps catch that case early and explicitly, no?
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]           ` <49B99144.9000106-GANU6spQydw@public.gmane.org>
  2009-03-12 22:58             ` Dan Smith
@ 2009-03-18  8:32             ` Oren Laadan
  1 sibling, 0 replies; 41+ messages in thread
From: Oren Laadan @ 2009-03-18  8:32 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch



Daniel Lezcano wrote:
> Dan Smith wrote:
>> NL> I'd like there to be some discussion about this, because namespace
>> NL> creation seems like a significant addition to the semantics of
>> NL> restart as I understand it.
>>
>> Indeed.
>>
>> NL> Is namespace creation during restart unavoidable, or merely
>> NL> desirable?  Is there a case for requiring the user to provide a
>> NL> suitable namespace environment before attempting restart?
>>
>> Information about the namespaces has to be saved at checkpoint time no
>> matter what, right?  I guess I don't see any compelling reason to not
>> have the restart operation replicate the environment of the original
>> process.  Otherwise we require userspace to read and interpret the
>> checkpoint stream and selectively feed the bits that the kernel is
>> responsible for to the kernel and process the rest itself (or have the
>> kernel ignore those records).
>>   
> 
> Assuming you have a process and this one unshared the network 100 times 
> and each time opens a socket, how do you checkpoint these namespaces ?
> 
>> What's the argument for depending on userspace to set this up?
>>   
> Maybe, CR of the namespaces is more complicate topic than it looks like 

s/Maybe/Surely/ ...

> and the CR itself is big enough to not complicate things. IMHO, I would 
> recommend as the first step to forbid the unshare inside a container and 
> let the container implementation to save the configuration with the 
> statefile in order to recreate it at the restart
> 

Agreed.

Oren.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found] ` <1236880612-15316-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-12 21:29   ` Nathan Lynch
@ 2009-03-18  8:35   ` Oren Laadan
  1 sibling, 0 replies; 41+ messages in thread
From: Oren Laadan @ 2009-03-18  8:35 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> (apologies if you see this multiple times...IBM mail server troubles)
> 
> This patch adds a "phase" of c/r that saves out information about any
> namespaces the task(s) may have.  Do this by tracking the nsproxy of the
> first task and making sure that the tasks that follow get hooked back to
> share the same one on restart.  On restart, we also explicitly create new
> namespaces for any that we support at the first task.
> 
> I tested this with single and multiple task restore, on top of Oren's
> v13 tree.
> 
> This is intended to get some discussion going about how to c/r the various
> namespaces.  So, if this approach isn't favorable, speak up with some
> suggestions for how to do it better.
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/checkpoint.c        |   57 ++++++++++++++++++++++++++
>  checkpoint/objhash.c           |    7 +++
>  checkpoint/restart.c           |   87 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/checkpoint.h     |    1 +
>  include/linux/checkpoint_hdr.h |   15 +++++++
>  5 files changed, 167 insertions(+), 0 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index 64155de..a613f2d 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -193,6 +193,59 @@ static int cr_write_tail(struct cr_ctx *ctx)
>  	return ret;
>  }
>  
> +static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	struct new_utsname *n = &t->nsproxy->uts_ns->name;
> +	int ret;
> +
> +	h.type = CR_HDR_UTSNS;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;
> +
> +	memcpy(hh->nodename, n->nodename, sizeof(n->nodename));
> +	memcpy(hh->domainname, n->domainname, sizeof(n->domainname));

The length of ->nodename etc may change between kernels or in future
versions. Please also save the length of the buffer.
(see for example cr_write_task_struct() handling of ->comm)

[...]

Oren.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                   ` <49C0B069.6060300-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18  9:01                     ` Cedric Le Goater
  2009-03-18 13:49                     ` Serge E. Hallyn
  1 sibling, 0 replies; 41+ messages in thread
From: Cedric Le Goater @ 2009-03-18  9:01 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Oren Laadan wrote:
> 
> Dan Smith wrote:
>> SH> Well it forces restart to go through the established userspace
>> SH> API's when creating resources (in this case, tasks and namespaces)
>> SH> which means any existing security guarantees are leveraged.
>>
>> That's a very valid point.  However, it still seems unbalanced to make
>> checkpoint a completely in-kernel process and restart an odd mix of
>> the two with potentially more confusing semantics and requirements.
>>
> 
> There are other reasons to allow restart to be not fully symmetric
> with respect to checkpoint. For example, if you have a smart(er) user
> space application that wants to provide the restart some of the resources
> pre-constructed, allowing much flexibility (already requested by people)
> for the restart provdure (E.g., when doing distributed checkpoint, or
> when restarting a special device whose).

yes

the arguments you have for restart are also valid for checkpoint in a 
distributed checkpoint scenario. 

you want to be able to easily and rapidly abort the checkpoint of a job 
when one node (among thousands) fails for some reason. a batch manager 
would use a signal.  

you also want fine grain synchronization for network, when migrating only 
one node. 

We've had to solve the above issues on a large HPC project and there are 
plenty of other good reasons to have a mix of kernel and user space for 
restart and for checkpoint. 

C.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                   ` <49C0B069.6060300-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-18  9:01                     ` Cedric Le Goater
@ 2009-03-18 13:49                     ` Serge E. Hallyn
       [not found]                       ` <20090318134932.GC22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-18 13:49 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Dan Smith wrote:
> > SH> Well it forces restart to go through the established userspace
> > SH> API's when creating resources (in this case, tasks and namespaces)
> > SH> which means any existing security guarantees are leveraged.
> > 
> > That's a very valid point.  However, it still seems unbalanced to make
> > checkpoint a completely in-kernel process and restart an odd mix of
> > the two with potentially more confusing semantics and requirements.
> > 
> 
> There are other reasons to allow restart to be not fully symmetric
> with respect to checkpoint. For example, if you have a smart(er) user
> space application that wants to provide the restart some of the resources
> pre-constructed, allowing much flexibility (already requested by people)
> for the restart provdure (E.g., when doing distributed checkpoint, or
> when restarting a special device whose).
> 
> See my post:
> https://lists.linux-foundation.org/pipermail/containers/2009-March/016234.html

(Note that in Dan's next version, he did move unshare into userspace)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                       ` <20090318134932.GC22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18 14:04                         ` Dan Smith
       [not found]                           ` <878wn353mf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-18 14:04 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

SH> (Note that in Dan's next version, he did move unshare into
SH> userspace)

The idealist in me still wants it to be in the kernel.  However, after
seeing it done I agree that it's the right thing to do, at least in
this case.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                           ` <878wn353mf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-18 15:46                             ` Cedric Le Goater
       [not found]                               ` <49C1175F.9060600-GANU6spQydw@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Cedric Le Goater @ 2009-03-18 15:46 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> SH> (Note that in Dan's next version, he did move unshare into
> SH> userspace)
> 
> The idealist in me still wants it to be in the kernel.  However, after
> seeing it done I agree that it's the right thing to do, at least in
> this case.

I would say in all cases.

as you can't unshare(CLONE_NEWPID), you might consider doing a 

	clone(CLONE_NEW*|CLONE_WHATEVER)

to restart 1 and then restart the container process tree under it, block
them, restore the container namespaces and other shared resources in the
right order and then release the processes to let them restart themselves. 
that's how we've implemented restart. You might be doing that already, 
sorry I haven't had time to look at your code. 

1 being an invariant between checkpoint and restart.

C.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                               ` <49C1175F.9060600-GANU6spQydw@public.gmane.org>
@ 2009-03-18 15:55                                 ` Dan Smith
       [not found]                                   ` <874oxq6d1x.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-03-18 19:50                                 ` Mike Waychison
  1 sibling, 1 reply; 41+ messages in thread
From: Dan Smith @ 2009-03-18 15:55 UTC (permalink / raw)
  To: Cedric Le Goater; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

CG> as you can't unshare(CLONE_NEWPID), you might consider doing a 

CG> 	clone(CLONE_NEW*|CLONE_WHATEVER)

CG> to restart 1 and then restart the container process tree under it

Yep, that's how mktree behaves with my changes. :)

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                   ` <874oxq6d1x.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-18 16:02                                     ` Cedric Le Goater
  0 siblings, 0 replies; 41+ messages in thread
From: Cedric Le Goater @ 2009-03-18 16:02 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch

Dan Smith wrote:
> CG> as you can't unshare(CLONE_NEWPID), you might consider doing a 
> 
> CG> 	clone(CLONE_NEW*|CLONE_WHATEVER)
> 
> CG> to restart 1 and then restart the container process tree under it
> 
> Yep, that's how mktree behaves with my changes. :)

cool. it looks like a good way to go. 

I need to find some time to look at the code and see how we can make 
all these code trees converge : mktree, oren's patchset, lxc, mcr, mcr 
patchset.

C. 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                               ` <49C1175F.9060600-GANU6spQydw@public.gmane.org>
  2009-03-18 15:55                                 ` Dan Smith
@ 2009-03-18 19:50                                 ` Mike Waychison
       [not found]                                   ` <49C1506C.1080500-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Mike Waychison @ 2009-03-18 19:50 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

[-- Attachment #1: Type: text/plain, Size: 773 bytes --]

Cedric Le Goater wrote:
> Dan Smith wrote:
>> SH> (Note that in Dan's next version, he did move unshare into
>> SH> userspace)
>>
>> The idealist in me still wants it to be in the kernel.  However, after
>> seeing it done I agree that it's the right thing to do, at least in
>> this case.
> 
> I would say in all cases.
> 
> as you can't unshare(CLONE_NEWPID), 

Eric,

Is there a particular reason the above doesn't work?  I made an attempt 
to implement it a while back, but haven't convinced myself that signals 
and re-attaching a new struct pid to a running task is correct.

This should apply on top of Oren's ckpt v13 (based on 2.6.27-rc8). 
Consider this me floating the idea of adding support and I'll clean it 
up/rebase if you think it's useful.

Mike Waychison

[-- Attachment #2: add-unshare-support --]
[-- Type: text/plain, Size: 6357 bytes --]

Add unshare CLONE_NEWPID support

From: Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Add support for doing CLONE_NEWPID to sys_unshare().  Doing so requires that
the calling thread isn't sharing their signal handlers with anyone, or if they
are, they must also unshare their signal handler config at the same time.

Open issues:
   - I'm not 100% convinced I'm doing the right thing with pending signals.
   - I'm rewriting current's struct pid without any kind of synchronization.
   The lifetimes look alright to me, but it seems a little racy.  I can't think
   of any actual cases where we'd cause problems though: paths where we'd race
   would include cases where we go off and look at a struct pid's level, but
   then index in to get the pid_t out.  This is the same before and after we
   attach the pid to the task however, so maybe it's okay?

Signed-off-by: Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---

 include/linux/pid.h |    2 ++
 kernel/fork.c       |   47 ++++++++++++++++++++++++++++++++++++++-
 kernel/nsproxy.c    |    2 +-
 kernel/pid.c        |   61 +++++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 105 insertions(+), 7 deletions(-)


diff --git a/include/linux/pid.h b/include/linux/pid.h
index d7e98ff..0ff4829 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -120,6 +120,8 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, int last);
 
 extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid_keep(struct pid_namespace *ns,
+				  struct pid *orig_pid);
 extern void free_pid(struct pid *pid);
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe..2db6f38 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1575,7 +1575,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
 				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
-				CLONE_NEWNET))
+				CLONE_NEWNET|CLONE_NEWPID))
+		goto bad_unshare_out;
+	if ((unshare_flags & CLONE_NEWPID) && !(unshare_flags & CLONE_SIGHAND)
+	 && atomic_read(&current->sighand->count) > 1)
 		goto bad_unshare_out;
 
 	/*
@@ -1599,6 +1602,47 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 			new_fs)))
 		goto bad_unshare_cleanup_fd;
 
+	if (unshare_flags & CLONE_NEWPID) {
+		struct pid *new_pid, *old_pid;
+		err = pid_ns_prepare_proc(new_nsproxy->pid_ns);
+		if (err)
+			goto bad_unshare_cleanup_nsproxy;
+		/* Give ourselves a pid. */
+		new_pid = alloc_pid_keep(new_nsproxy->pid_ns,
+					 ask_pid(current));
+		if (!new_pid)
+			goto bad_unshare_cleanup_nsproxy;
+
+		old_pid = task_pid(current);
+
+		write_lock_irq(&tasklist_lock);
+		spin_lock(&current->sighand->siglock);
+
+		/* TODO: Do we have to check if there are signals pending at
+		 * this point? */
+
+		current->pid = pid_nr(new_pid);
+		current->tgid = current->pid;
+		current->group_leader = current;
+		list_del_init(&current->thread_group);
+		new_nsproxy->pid_ns->child_reaper = current;
+		/*
+		 * TODO: Is this the right way to handle the signal updates?
+		 *
+		 * The guard that ensures that we specified CLONE_SIGHAND
+		 * currently ensures that we aren't sharing our sighand with
+		 * anyone else.
+		 */
+		current->signal->leader_pid = new_pid;
+
+		set_task_pgrp(current, pid_nr(new_pid));
+		set_task_session(current, pid_nr(new_pid));
+		detach_pid(current, PIDTYPE_PID);
+		attach_pid(current, PIDTYPE_PID, new_pid);
+		spin_unlock(&current->sighand->siglock);
+		write_unlock_irq(&tasklist_lock);
+	}
+
 	if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
@@ -1638,6 +1682,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		task_unlock(current);
 	}
 
+bad_unshare_cleanup_nsproxy:
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29..23cafe7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -189,7 +189,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWUSER | CLONE_NEWNET)))
+			       CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76a..3919b0d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -239,10 +239,64 @@ void free_pid(struct pid *pid)
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
+static void init_pid(struct pid_namespace *ns, struct pid *pid)
+{
+	enum pid_type type;
+	pid->level = ns->level;
+	atomic_set(&pid->count, 1);
+	for (type = 0; type < PIDTYPE_MAX; ++type)
+		INIT_HLIST_HEAD(&pid->tasks[type]);
+}
+
+struct pid *alloc_pid_keep(struct pid_namespace *ns, struct pid *orig_pid)
+{
+	struct pid *pid;
+	int i;
+	pid_t nr;
+
+	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+	if (!pid)
+		goto out;
+
+	nr = alloc_pidmap(ns);
+	if (nr < 0)
+		goto out_free;
+	BUG_ON(nr != 1);
+
+	pid->numbers[ns->level].nr = nr;
+	pid->numbers[ns->level].ns = ns;
+	for (i = ns->level - 1; i >= 0; i--) {
+		/* Transfer the pid references to the new structure. */
+		pid->numbers[i].nr = orig_pid->numbers[i].nr;
+		orig_pid->numbers[i].nr = 0;
+
+		pid->numbers[i].ns = orig_pid->numbers[i].ns;
+	}
+
+	get_pid_ns(ns);
+	init_pid(ns, pid);
+
+	/* Update the hash tables.. */
+	spin_lock_irq(&pidmap_lock);
+	for (i = ns->level; i >= 0; i--) {
+		struct upid *upid;
+		upid = &pid->numbers[i];
+		/* put_pid will unhash the old upids */
+		hlist_add_head_rcu(&upid->pid_chain,
+				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+	}
+	spin_unlock_irq(&pidmap_lock);
+
+out:
+	return pid;
+out_free:
+	kmem_cache_free(ns->pid_cachep, pid);
+	return NULL;
+}
+
 struct pid *alloc_pid(struct pid_namespace *ns)
 {
 	struct pid *pid;
-	enum pid_type type;
 	int i, nr;
 	struct pid_namespace *tmp;
 	struct upid *upid;
@@ -263,10 +317,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	get_pid_ns(ns);
-	pid->level = ns->level;
-	atomic_set(&pid->count, 1);
-	for (type = 0; type < PIDTYPE_MAX; ++type)
-		INIT_HLIST_HEAD(&pid->tasks[type]);
+	init_pid(ns, pid);
 
 	spin_lock_irq(&pidmap_lock);
 	for (i = ns->level; i >= 0; i--) {

[-- Attachment #3: Type: text/plain, Size: 206 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                   ` <49C1506C.1080500-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2009-03-19  0:10                                     ` Eric W. Biederman
       [not found]                                       ` <m1bprye5io.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-19  0:10 UTC (permalink / raw)
  To: Mike Waychison; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:

> Cedric Le Goater wrote:
>> Dan Smith wrote:
>>> SH> (Note that in Dan's next version, he did move unshare into
>>> SH> userspace)
>>>
>>> The idealist in me still wants it to be in the kernel.  However, after
>>> seeing it done I agree that it's the right thing to do, at least in
>>> this case.
>>
>> I would say in all cases.
>>
>> as you can't unshare(CLONE_NEWPID), 
>
> Eric,
>
> Is there a particular reason the above doesn't work?  I made an attempt to
> implement it a while back, but haven't convinced myself that signals and
> re-attaching a new struct pid to a running task is correct.

Last time I was thinking about this I figured unsharing a pid namespace would
simply place it's children in a different pid namespace, not the originating
process.

Would that semantic be useful?  It would certainly be a lot less effort than
changing the pid on a running process correctly.


Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                       ` <m1bprye5io.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-19  0:46                                         ` Mike Waychison
       [not found]                                           ` <49C195CF.1080506-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Mike Waychison @ 2009-03-19  0:46 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Eric W. Biederman wrote:
> Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
> 
>> Cedric Le Goater wrote:
>>> Dan Smith wrote:
>>>> SH> (Note that in Dan's next version, he did move unshare into
>>>> SH> userspace)
>>>>
>>>> The idealist in me still wants it to be in the kernel.  However, after
>>>> seeing it done I agree that it's the right thing to do, at least in
>>>> this case.
>>> I would say in all cases.
>>>
>>> as you can't unshare(CLONE_NEWPID), 
>> Eric,
>>
>> Is there a particular reason the above doesn't work?  I made an attempt to
>> implement it a while back, but haven't convinced myself that signals and
>> re-attaching a new struct pid to a running task is correct.
> 
> Last time I was thinking about this I figured unsharing a pid namespace would
> simply place it's children in a different pid namespace, not the originating
> process.
> 
> Would that semantic be useful?  It would certainly be a lot less effort than
> changing the pid on a running process correctly.

Hmm, that would be a little odd.  I think getting the unsharing task to 
become pid 1 is a bit easier to understand and it makes it clear which 
task is the reaper for the new namespace.  Otherwise the first child 
becomes pid 1 but it isn't the reaper.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                           ` <49C195CF.1080506-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2009-03-19  1:06                                             ` Eric W. Biederman
       [not found]                                               ` <m1ab7icodl.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-19  1:06 UTC (permalink / raw)
  To: Mike Waychison; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:

> Eric W. Biederman wrote:
>> Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
>>
>>> Cedric Le Goater wrote:
>>>> Dan Smith wrote:
>>>>> SH> (Note that in Dan's next version, he did move unshare into
>>>>> SH> userspace)
>>>>>
>>>>> The idealist in me still wants it to be in the kernel.  However, after
>>>>> seeing it done I agree that it's the right thing to do, at least in
>>>>> this case.
>>>> I would say in all cases.
>>>>
>>>> as you can't unshare(CLONE_NEWPID), 
>>> Eric,
>>>
>>> Is there a particular reason the above doesn't work?  I made an attempt to
>>> implement it a while back, but haven't convinced myself that signals and
>>> re-attaching a new struct pid to a running task is correct.
>>
>> Last time I was thinking about this I figured unsharing a pid namespace would
>> simply place it's children in a different pid namespace, not the originating
>> process.
>>
>> Would that semantic be useful?  It would certainly be a lot less effort than
>> changing the pid on a running process correctly.
>
> Hmm, that would be a little odd.  I think getting the unsharing task to become
> pid 1 is a bit easier to understand and it makes it clear which task is the
> reaper for the new namespace.  Otherwise the first child becomes pid 1 but it
> isn't the reaper.

For very lightweight pid namespaces that may be desirable, and that is where
all of this conversation originally started.   I am happy to set the starting
pid to 2 to avoid confusion on that point.

One of the other problems with changing the pid is that user space in general
glibc in particular can not cope with the pid of a process changing.

My memories are foggy at the moment but I do know that on the several occasions
we have looked at unshare of the pid namespace it has failed due to kernel issues.
I also remember I was close to having resolved the issues of unsharing the pid
namespace if we did not change the pid of processing calling unshare.


You did not answer my question.  I don't quite see how you were envisioning
using unsharing the pid namespace as part of restart so I can't tell if my
proposed semantics would work for that case.

Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                               ` <m1ab7icodl.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-19  1:51                                                 ` Mike Waychison
       [not found]                                                   ` <49C1A52D.4000503-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Mike Waychison @ 2009-03-19  1:51 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Eric W. Biederman wrote:
> Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
> 
>> Eric W. Biederman wrote:
>>> Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:
>>>
>>>> Cedric Le Goater wrote:
>>>>> Dan Smith wrote:
>>>>>> SH> (Note that in Dan's next version, he did move unshare into
>>>>>> SH> userspace)
>>>>>>
>>>>>> The idealist in me still wants it to be in the kernel.  However, after
>>>>>> seeing it done I agree that it's the right thing to do, at least in
>>>>>> this case.
>>>>> I would say in all cases.
>>>>>
>>>>> as you can't unshare(CLONE_NEWPID), 
>>>> Eric,
>>>>
>>>> Is there a particular reason the above doesn't work?  I made an attempt to
>>>> implement it a while back, but haven't convinced myself that signals and
>>>> re-attaching a new struct pid to a running task is correct.
>>> Last time I was thinking about this I figured unsharing a pid namespace would
>>> simply place it's children in a different pid namespace, not the originating
>>> process.
>>>
>>> Would that semantic be useful?  It would certainly be a lot less effort than
>>> changing the pid on a running process correctly.
>> Hmm, that would be a little odd.  I think getting the unsharing task to become
>> pid 1 is a bit easier to understand and it makes it clear which task is the
>> reaper for the new namespace.  Otherwise the first child becomes pid 1 but it
>> isn't the reaper.
> 
> For very lightweight pid namespaces that may be desirable, and that is where
> all of this conversation originally started.   I am happy to set the starting
> pid to 2 to avoid confusion on that point.

I wasn't really taking into consideration the notion of a 'lightweight' 
pid namespace that didn't have a 'container-init' process.

> 
> One of the other problems with changing the pid is that user space in general
> glibc in particular can not cope with the pid of a process changing.
> 
> My memories are foggy at the moment but I do know that on the several occasions
> we have looked at unshare of the pid namespace it has failed due to kernel issues.
> I also remember I was close to having resolved the issues of unsharing the pid
> namespace if we did not change the pid of processing calling unshare.

Do you have pointers to discussions about these issues?

> You did not answer my question.  I don't quite see how you were envisioning
> using unsharing the pid namespace as part of restart so I can't tell if my
> proposed semantics would work for that case.

Well, one way to look at doing restart with nested namespaces would be 
to have userland go off and begin by rebuilding the process tree.  While 
rebuilding, any given process being recreated would need to have the 
same pid in the parenting pid namespace (the outer most namespace in the 
container).  It would need to know if it 'got' the right pid, and if so, 
would then create the new child pid namespace.  Requiring CLONE_NEWPID 
set on each and every clone(2) [*] would certainly be possible, as long 
as we had some way for the task being created to know what it's parent 
namespace pid is.  I guess this could be done by a shared memory segment 
shared between the parent and child of the clone as well, though it 
doesn't seem as clear-cut to me.



[*] Yes, I'm dancing around the clone_with_pid issue..

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                   ` <49C1A52D.4000503-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2009-03-19  3:28                                                     ` Eric W. Biederman
       [not found]                                                       ` <m1iqm6xkc7.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-19  3:28 UTC (permalink / raw)
  To: Mike Waychison; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Mike Waychison <mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes:


>> all of this conversation originally started.   I am happy to set the starting
>> pid to 2 to avoid confusion on that point.
>
> I wasn't really taking into consideration the notion of a 'lightweight' pid
> namespace that didn't have a 'container-init' process.

I know that was something the Vserver guys were do today.  They have something
that shows up as pid 1 but they don't really have a process there.

>> One of the other problems with changing the pid is that user space in general
>> glibc in particular can not cope with the pid of a process changing.
>>
>> My memories are foggy at the moment but I do know that on the several occasions
>> we have looked at unshare of the pid namespace it has failed due to kernel issues.
>> I also remember I was close to having resolved the issues of unsharing the pid
>> namespace if we did not change the pid of processing calling unshare.
>
> Do you have pointers to discussions about these issues?

Not better than the containers list archives.

>> You did not answer my question.  I don't quite see how you were envisioning
>> using unsharing the pid namespace as part of restart so I can't tell if my
>> proposed semantics would work for that case.
>
> Well, one way to look at doing restart with nested namespaces would be to have
> userland go off and begin by rebuilding the process tree.  While rebuilding, any
> given process being recreated would need to have the same pid in the parenting
> pid namespace (the outer most namespace in the container).  It would need to
> know if it 'got' the right pid, and if so, would then create the new child pid
> namespace.  Requiring CLONE_NEWPID set on each and every clone(2) [*] would
> certainly be possible, as long as we had some way for the task being created to
> know what it's parent namespace pid is.  I guess this could be done by a shared
> memory segment shared between the parent and child of the clone as well, though
> it doesn't seem as clear-cut to me.
>
>
>
> [*] Yes, I'm dancing around the clone_with_pid issue..

Ok.  I see what you are trying to accomplish with this and honestly I think it
is silly.

We should start the threads we need in the kernel, and if we need to run clone_pid
fine.  I am not comfortable exporting clone_with_pid to user space.

As for the implementation of allocating a struct pid with a certain set of pid values.
I expect we can do that easily enough by refactoring the pid allocator to be passed
in the min/max pid to allocate from, and have a special case that passes in a different
set of min/max values so we can allocate just the pid we need.

If the primary use for a userspace interface is restart I feel we are doing it wrong.

Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                       ` <m1iqm6xkc7.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-20 17:26                                                         ` Serge E. Hallyn
       [not found]                                                           ` <20090320172616.GA7203-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-20 17:26 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> Ok.  I see what you are trying to accomplish with this and honestly I
> think it is silly.
> 
> We should start the threads we need in the kernel, and if we need to
> run clone_pid fine.  I am not comfortable exporting clone_with_pid to
> user space.

Even if we create the task tree in userspace, I don't see why we
can't have the parent of each nested pid_ns pass CLONE_NEWPID to
clone_with_pid() instead of doing clone first and then unsharing
the pidns?

As for clone_with_pid(), I don't particularly like the semantics,
but as was discussed over IRC, we could have clone_with_pid()
return -EINVAL unless it is called while it is called from a task
inside a restarting container.  (and -EPERM if setting a pid in
a pid_ns which was not created as part of the container)  Eric
do you dislike that any less?

> As for the implementation of allocating a struct pid with a certain
> set of pid values.  I expect we can do that easily enough by
> refactoring the pid allocator to be passed in the min/max pid to
> allocate from, and have a special case that passes in a different set
> of min/max values so we can allocate just the pid we need.

What is wrong with Alexey's patch, which simply passes in the values
themselves?  Do you have another use in mind for the min/max pid
values?

> If the primary use for a userspace interface is restart I feel we are
> doing it wrong.

I think that's a good guideline, bad rule.  Certainly possible
that you're right that this is just pointing to in-kernel
recreation of process tree as the way to go.  I was getting
that feeling myself, but then there are still very good reasons
not to do that, as there are things which each task should do
before completing sys_restart() which are best done in userspace.
These include for instance creating virtual nics, and calling
Oren's suggested 'cr_advise()' system calls.

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                           ` <20090320172616.GA7203-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-20 19:51                                                             ` Mike Waychison
       [not found]                                                               ` <49C3F3C0.30100-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2009-03-20 23:26                                                             ` Eric W. Biederman
  1 sibling, 1 reply; 41+ messages in thread
From: Mike Waychison @ 2009-03-20 19:51 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch,
	Eric W. Biederman

Serge E. Hallyn wrote:
> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> Ok.  I see what you are trying to accomplish with this and honestly I
>> think it is silly.
>>
>> We should start the threads we need in the kernel, and if we need to
>> run clone_pid fine.  I am not comfortable exporting clone_with_pid to
>> user space.
> 
> Even if we create the task tree in userspace, I don't see why we
> can't have the parent of each nested pid_ns pass CLONE_NEWPID to
> clone_with_pid() instead of doing clone first and then unsharing
> the pidns?
> 
> As for clone_with_pid(), I don't particularly like the semantics,
> but as was discussed over IRC, we could have clone_with_pid()
> return -EINVAL unless it is called while it is called from a task
> inside a restarting container.  (and -EPERM if setting a pid in
> a pid_ns which was not created as part of the container)  Eric
> do you dislike that any less?

Wouldn't this mean the kernel would have to track which namespaces are 
part of a restart and which aren't?   Seems a little kludgy to me.

> 
>> As for the implementation of allocating a struct pid with a certain
>> set of pid values.  I expect we can do that easily enough by
>> refactoring the pid allocator to be passed in the min/max pid to
>> allocate from, and have a special case that passes in a different set
>> of min/max values so we can allocate just the pid we need.
> 
> What is wrong with Alexey's patch, which simply passes in the values
> themselves?  Do you have another use in mind for the min/max pid
> values?
> 
>> If the primary use for a userspace interface is restart I feel we are
>> doing it wrong.
> 
> I think that's a good guideline, bad rule.  Certainly possible
> that you're right that this is just pointing to in-kernel
> recreation of process tree as the way to go.  I was getting
> that feeling myself, but then there are still very good reasons
> not to do that, as there are things which each task should do
> before completing sys_restart() which are best done in userspace.
> These include for instance creating virtual nics, and calling
> Oren's suggested 'cr_advise()' system calls.
> 
> -serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                               ` <49C3F3C0.30100-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2009-03-20 20:40                                                                 ` Serge E. Hallyn
  2009-03-20 20:53                                                                 ` Oren Laadan
  1 sibling, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-20 20:40 UTC (permalink / raw)
  To: Mike Waychison
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch,
	Eric W. Biederman

Quoting Mike Waychison (mikew-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org):
> Serge E. Hallyn wrote:
>> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>>> Ok.  I see what you are trying to accomplish with this and honestly I
>>> think it is silly.
>>>
>>> We should start the threads we need in the kernel, and if we need to
>>> run clone_pid fine.  I am not comfortable exporting clone_with_pid to
>>> user space.
>>
>> Even if we create the task tree in userspace, I don't see why we
>> can't have the parent of each nested pid_ns pass CLONE_NEWPID to
>> clone_with_pid() instead of doing clone first and then unsharing
>> the pidns?
>>
>> As for clone_with_pid(), I don't particularly like the semantics,
>> but as was discussed over IRC, we could have clone_with_pid()
>> return -EINVAL unless it is called while it is called from a task
>> inside a restarting container.  (and -EPERM if setting a pid in
>> a pid_ns which was not created as part of the container)  Eric
>> do you dislike that any less?
>
> Wouldn't this mean the kernel would have to track which namespaces are  
> part of a restart and which aren't?   Seems a little kludgy to me.

Well it could do that, which would be trivial since it knows the
hierarchy, and knows which ns the init process of the whole restarted
container is.

Or, we can just, as I suggested before, tag the pid_ns with the
uid of the task which created it.  Then a restart can theoritically
specify a pid_t in a clone_with_pid() for a pid_ns pre-existing to
the restart, but it can still only do it for pid namespaces which it
"owns."

Whatever we do, we just need to make sure that an unprivileged task
can't manipulate a checkpoint image so as to pick an id in the
init_pid_ns, as that can be perceived as potentially exacerbating
vulnerabilities in poorly written userspace programs as Linus
mentioned.

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                               ` <49C3F3C0.30100-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2009-03-20 20:40                                                                 ` Serge E. Hallyn
@ 2009-03-20 20:53                                                                 ` Oren Laadan
  1 sibling, 0 replies; 41+ messages in thread
From: Oren Laadan @ 2009-03-20 20:53 UTC (permalink / raw)
  To: Mike Waychison
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch,
	Eric W. Biederman



Mike Waychison wrote:
> Serge E. Hallyn wrote:
>> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>>> Ok.  I see what you are trying to accomplish with this and honestly I
>>> think it is silly.
>>>
>>> We should start the threads we need in the kernel, and if we need to
>>> run clone_pid fine.  I am not comfortable exporting clone_with_pid to
>>> user space.
>> Even if we create the task tree in userspace, I don't see why we
>> can't have the parent of each nested pid_ns pass CLONE_NEWPID to
>> clone_with_pid() instead of doing clone first and then unsharing
>> the pidns?
>>
>> As for clone_with_pid(), I don't particularly like the semantics,
>> but as was discussed over IRC, we could have clone_with_pid()
>> return -EINVAL unless it is called while it is called from a task
>> inside a restarting container.  (and -EPERM if setting a pid in
>> a pid_ns which was not created as part of the container)  Eric
>> do you dislike that any less?
> 
> Wouldn't this mean the kernel would have to track which namespaces are 
> part of a restart and which aren't?   Seems a little kludgy to me.

We are talking only about pidns, right ?

So we need to keep track of a single pidns - the top level for the
restarting container; that's the container init. We can, e.g., mark
it with a flag.

We can then follow the hierarchy up through pidns until we reach
the one marked, or the topmost, and grant (or deny) permission.

Oren.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                           ` <20090320172616.GA7203-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-20 19:51                                                             ` Mike Waychison
@ 2009-03-20 23:26                                                             ` Eric W. Biederman
       [not found]                                                               ` <m1d4cb3he5.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  1 sibling, 1 reply; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-20 23:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> Ok.  I see what you are trying to accomplish with this and honestly I
>> think it is silly.
>> 
>> We should start the threads we need in the kernel, and if we need to
>> run clone_pid fine.  I am not comfortable exporting clone_with_pid to
>> user space.
>
> Even if we create the task tree in userspace, I don't see why we
> can't have the parent of each nested pid_ns pass CLONE_NEWPID to
> clone_with_pid() instead of doing clone first and then unsharing
> the pidns?
>
> As for clone_with_pid(), I don't particularly like the semantics,
> but as was discussed over IRC, we could have clone_with_pid()
> return -EINVAL unless it is called while it is called from a task
> inside a restarting container.  (and -EPERM if setting a pid in
> a pid_ns which was not created as part of the container)  Eric
> do you dislike that any less?
>
>> As for the implementation of allocating a struct pid with a certain
>> set of pid values.  I expect we can do that easily enough by
>> refactoring the pid allocator to be passed in the min/max pid to
>> allocate from, and have a special case that passes in a different set
>> of min/max values so we can allocate just the pid we need.
>
> What is wrong with Alexey's patch, which simply passes in the values
> themselves?  Do you have another use in mind for the min/max pid
> values?

At an implementation level (and I need to look at Alexey's specific patch)
every patch I have seen to date creates their own version of alloc_pidmap.

alloc_pidmap already implicitly takes min/max and first value to try
as parameters.  RESERVED_PIDS, pid_max, and pid_ns->last_pid.  So
instead of rewriting alloc_pidmap we should just be able to refactor
alloc_pidmap to take the requisite values.  That should be less code
and easier to maintain.

Looking at the current implementation we also have the issue that
pid_max is not per pid namespace.  Where it seems to belong.

>> If the primary use for a userspace interface is restart I feel we are
>> doing it wrong.
>
> I think that's a good guideline, bad rule.  Certainly possible
> that you're right that this is just pointing to in-kernel
> recreation of process tree as the way to go.  I was getting
> that feeling myself, but then there are still very good reasons
> not to do that, as there are things which each task should do
> before completing sys_restart() which are best done in userspace.
> These include for instance creating virtual nics, and calling
> Oren's suggested 'cr_advise()' system calls.

You might be right.   I am behind on that part of the conversation.

My general concern is that dividing up the responsibilities between user space
and kernel space seems harder to maintain, and refactor if we don't get something
right the first time.


Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                               ` <m1d4cb3he5.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-21  2:38                                                                 ` Serge E. Hallyn
       [not found]                                                                   ` <20090321023834.GA21064-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-21  2:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> > What is wrong with Alexey's patch, which simply passes in the values
> > themselves?  Do you have another use in mind for the min/max pid
> > values?
> 
> At an implementation level (and I need to look at Alexey's specific patch)
> every patch I have seen to date creates their own version of alloc_pidmap.

You're right, Alexey's patch creates a new one.

> alloc_pidmap already implicitly takes min/max and first value to try
> as parameters.  RESERVED_PIDS, pid_max, and pid_ns->last_pid.  So
> instead of rewriting alloc_pidmap we should just be able to refactor
> alloc_pidmap to take the requisite values.  That should be less code
> and easier to maintain.

Yeah, that sounds good actually.  Thanks.

> Looking at the current implementation we also have the issue that
> pid_max is not per pid namespace.  Where it seems to belong.

Eh.  It does seem to, but otoh why give userspace knobs it has no use
for...  Or, can you think of a case where it'd be useful?

> >> If the primary use for a userspace interface is restart I feel we are
> >> doing it wrong.
> >
> > I think that's a good guideline, bad rule.  Certainly possible
> > that you're right that this is just pointing to in-kernel
> > recreation of process tree as the way to go.  I was getting
> > that feeling myself, but then there are still very good reasons
> > not to do that, as there are things which each task should do
> > before completing sys_restart() which are best done in userspace.
> > These include for instance creating virtual nics, and calling
> > Oren's suggested 'cr_advise()' system calls.
> 
> You might be right.   I am behind on that part of the conversation.
> 
> My general concern is that dividing up the responsibilities between user space
> and kernel space seems harder to maintain, and refactor if we don't get something
> right the first time.

So far we're actually still at the point where the code (Oren's set)
could go either way.  A small patch from Alexey can make it swing toward
kernel, while Oren's mktree.c userspace restart program swings the other
way.

And since we're punting on any nested namespaces it actually may stay that way
for awhile.

thanks,
-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                                   ` <20090321023834.GA21064-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
@ 2009-03-21  3:39                                                                     ` Eric W. Biederman
       [not found]                                                                       ` <m1prgbzgqq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-21  3:39 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

"Serge E. Hallyn" <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> writes:

> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> > What is wrong with Alexey's patch, which simply passes in the values
>> > themselves?  Do you have another use in mind for the min/max pid
>> > values?
>> 
>> At an implementation level (and I need to look at Alexey's specific patch)
>> every patch I have seen to date creates their own version of alloc_pidmap.
>
> You're right, Alexey's patch creates a new one.
>
>> alloc_pidmap already implicitly takes min/max and first value to try
>> as parameters.  RESERVED_PIDS, pid_max, and pid_ns->last_pid.  So
>> instead of rewriting alloc_pidmap we should just be able to refactor
>> alloc_pidmap to take the requisite values.  That should be less code
>> and easier to maintain.
>
> Yeah, that sounds good actually.  Thanks.
>
>> Looking at the current implementation we also have the issue that
>> pid_max is not per pid namespace.  Where it seems to belong.
>
> Eh.  It does seem to, but otoh why give userspace knobs it has no use
> for...  Or, can you think of a case where it'd be useful?

In general the number of usable pid numbers should be larger in the outer
pid namespace than in the child pid namespace.  Otherwise it is possible
for the child to eat all of the possible pid numbers.

So I think it would be advantageous for to make containers designed to migrate
to have a small pid_max by default so we know we won't overwhelm others.

Furthermore since pid_max is a limit on the identifiers allocated no on the
number of processes it is very much a pid namespace property.

>> > I think that's a good guideline, bad rule.  Certainly possible
>> > that you're right that this is just pointing to in-kernel
>> > recreation of process tree as the way to go.  I was getting
>> > that feeling myself, but then there are still very good reasons
>> > not to do that, as there are things which each task should do
>> > before completing sys_restart() which are best done in userspace.
>> > These include for instance creating virtual nics, and calling
>> > Oren's suggested 'cr_advise()' system calls.
>> 
>> You might be right.   I am behind on that part of the conversation.
>> 
>> My general concern is that dividing up the responsibilities between user space
>> and kernel space seems harder to maintain, and refactor if we don't get something
>> right the first time.
>
> So far we're actually still at the point where the code (Oren's set)
> could go either way.  A small patch from Alexey can make it swing toward
> kernel, while Oren's mktree.c userspace restart program swings the other
> way.
>
> And since we're punting on any nested namespaces it actually may stay that way
> for awhile.

Interesting.  That sounds fairly fundamental.  If I have some free time I will
have to take a look.  I'm in favor of a kernel/user space cooperation but I don't
currently see the benefit of fork processes in user space.

Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                                                       ` <m1prgbzgqq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-21 14:51                                                                         ` Serge E. Hallyn
  0 siblings, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-21 14:51 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> writes:
> 
> > Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> >> > What is wrong with Alexey's patch, which simply passes in the values
> >> > themselves?  Do you have another use in mind for the min/max pid
> >> > values?
> >> 
> >> At an implementation level (and I need to look at Alexey's specific patch)
> >> every patch I have seen to date creates their own version of alloc_pidmap.
> >
> > You're right, Alexey's patch creates a new one.
> >
> >> alloc_pidmap already implicitly takes min/max and first value to try
> >> as parameters.  RESERVED_PIDS, pid_max, and pid_ns->last_pid.  So
> >> instead of rewriting alloc_pidmap we should just be able to refactor
> >> alloc_pidmap to take the requisite values.  That should be less code
> >> and easier to maintain.
> >
> > Yeah, that sounds good actually.  Thanks.
> >
> >> Looking at the current implementation we also have the issue that
> >> pid_max is not per pid namespace.  Where it seems to belong.
> >
> > Eh.  It does seem to, but otoh why give userspace knobs it has no use
> > for...  Or, can you think of a case where it'd be useful?
> 
> In general the number of usable pid numbers should be larger in the outer
> pid namespace than in the child pid namespace.  Otherwise it is possible
> for the child to eat all of the possible pid numbers.
> 
> So I think it would be advantageous for to make containers designed to migrate
> to have a small pid_max by default so we know we won't overwhelm others.
> 
> Furthermore since pid_max is a limit on the identifiers allocated no on the
> number of processes it is very much a pid namespace property.

Right, I don't argue that it doesn't seem to belong there.  Well if
you think people would use it, it does seem simple enough to do.
Untested (well compile-tested) patch below just for grins.

> >> > I think that's a good guideline, bad rule.  Certainly possible
> >> > that you're right that this is just pointing to in-kernel
> >> > recreation of process tree as the way to go.  I was getting
> >> > that feeling myself, but then there are still very good reasons
> >> > not to do that, as there are things which each task should do
> >> > before completing sys_restart() which are best done in userspace.
> >> > These include for instance creating virtual nics, and calling
> >> > Oren's suggested 'cr_advise()' system calls.
> >> 
> >> You might be right.   I am behind on that part of the conversation.
> >> 
> >> My general concern is that dividing up the responsibilities between user space
> >> and kernel space seems harder to maintain, and refactor if we don't get something
> >> right the first time.
> >
> > So far we're actually still at the point where the code (Oren's set)
> > could go either way.  A small patch from Alexey can make it swing toward
> > kernel, while Oren's mktree.c userspace restart program swings the other
> > way.
> >
> > And since we're punting on any nested namespaces it actually may stay that way
> > for awhile.
> 
> Interesting.  That sounds fairly fundamental.  If I have some free time I will
> have to take a look.  I'm in favor of a kernel/user space cooperation but I don't
> currently see the benefit of fork processes in user space.

All right I'll wait for you to take a look, rather than repeat
myself :)  The biggest concern IMO is how to create complicated
resources (like a veth tunnel pair) in the kernel case.

thanks,
-serge

From 47303d729ec494add03fbddb47fac9a020d65f00 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Date: Sat, 21 Mar 2009 09:22:26 -0500
Subject: [PATCH 1/1] pid_ns: make pid_max a pid_ns property

Remove the pid_max global, and make it a property of the
pid_namespace.  When a pid_ns is created, it inherits
the parent's pid_ns.

Fixing up sysctl (trivial akin to ipc version, but
potentially tedious to get right for all CONFIG*
combinations) is left for later.

Signed-off-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/pid_namespace.h |    1 +
 kernel/pid.c                  |   14 +++++++-------
 kernel/pid_namespace.c        |    6 ++++--
 kernel/sysctl.c               |    4 ++--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..fd7f497 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -30,6 +30,7 @@ struct pid_namespace {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
 #endif
+	int pid_max;
 };
 
 extern struct pid_namespace init_pid_ns;
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586f..898fa8b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,8 +43,6 @@ static struct hlist_head *pid_hash;
 static int pidhash_shift;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
-int pid_max = PID_MAX_DEFAULT;
-
 #define RESERVED_PIDS		300
 
 int pid_max_min = RESERVED_PIDS + 1;
@@ -78,6 +76,7 @@ struct pid_namespace init_pid_ns = {
 	.last_pid = 0,
 	.level = 0,
 	.child_reaper = &init_task,
+	.pid_max = PID_MAX_DEFAULT,
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
@@ -128,11 +127,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	struct pidmap *map;
 
 	pid = last + 1;
-	if (pid >= pid_max)
+	if (pid >= pid_ns->pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+	max_scan = (pid_ns->pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE
+			- !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
 			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -164,11 +164,11 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 			 * bitmap block and the final block was the same
 			 * as the starting point, pid is before last_pid.
 			 */
-			} while (offset < BITS_PER_PAGE && pid < pid_max &&
-					(i != max_scan || pid < last ||
+			} while (offset < BITS_PER_PAGE && pid < pid_ns->pid_max
+					&& (i != max_scan || pid < last ||
 					    !((last+1) & BITS_PER_PAGE_MASK)));
 		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pid_ns->pidmap[(pid_ns->pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea8..1ba3970 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,15 +67,17 @@ err_alloc:
 	return NULL;
 }
 
-static struct pid_namespace *create_pid_namespace(unsigned int level)
+static struct pid_namespace *create_pid_namespace(struct pid_namespace *old)
 {
 	struct pid_namespace *ns;
+	unsigned int level = old->level + 1;
 	int i;
 
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
 		goto out;
 
+	ns->pid_max = old->pid_max;
 	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!ns->pidmap[0].page)
 		goto out_free;
@@ -125,7 +127,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
 	if (flags & CLONE_THREAD)
 		goto out_put;
 
-	new_ns = create_pid_namespace(old_ns->level + 1);
+	new_ns = create_pid_namespace(old_ns);
 	if (!IS_ERR(new_ns))
 		new_ns->parent = get_pid_ns(old_ns);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44f..8af16bd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/pid_namespace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -74,7 +75,6 @@ extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
 extern char core_pattern[];
-extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
@@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PIDMAX,
 		.procname	= "pid_max",
-		.data		= &pid_max,
+		.data		= &init_pid_ns.pid_max,
 		.maxlen		= sizeof (int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH] [RFC] c/r: Add UTS support
       [not found]                                           ` <49BA9D9C.2030208-GANU6spQydw@public.gmane.org>
@ 2009-03-25 12:01                                             ` Eric W. Biederman
  0 siblings, 0 replies; 41+ messages in thread
From: Eric W. Biederman @ 2009-03-25 12:01 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, Nathan Lynch

Daniel Lezcano <daniel.lezcano-GANU6spQydw@public.gmane.org> writes:

> Yes, this is an alternative. Maybe, I will say something stupid but IMO 
> the "maycheckpoint" will depends on what you assume you have for the CR:
>     1) the container is instantiated in one step, that is 
> clone(mycloneflags) and that's all, any other clone/unshare is 
> forbidden. In this case, you can concentrate the code in the nsproxy 
> structure.
>     2) the container can be instantiated in several steps, that is 
> several clone/unshare but with different namespaces. In this case, you 
> have to take care of all the namespaces and do a "maycheckpoint" for 
> each of them.

Please excuse me for jumping in late.

unshare/clone are irrelevant.  Think passing of file descriptors
in unix domain sockets.

Eric

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2009-03-25 12:01 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-03-12 17:56 [PATCH] [RFC] c/r: Add UTS support Dan Smith
     [not found] ` <1236880612-15316-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-12 21:29   ` Nathan Lynch
2009-03-12 21:56     ` Dan Smith
     [not found]       ` <87fxhipfrh.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-12 22:48         ` Serge E. Hallyn
     [not found]           ` <20090312224820.GA12723-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
2009-03-12 22:56             ` Dan Smith
     [not found]               ` <87bps6pcyf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-13  0:12                 ` Serge E. Hallyn
2009-03-18  8:27                 ` Oren Laadan
     [not found]                   ` <49C0B069.6060300-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18  9:01                     ` Cedric Le Goater
2009-03-18 13:49                     ` Serge E. Hallyn
     [not found]                       ` <20090318134932.GC22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18 14:04                         ` Dan Smith
     [not found]                           ` <878wn353mf.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-18 15:46                             ` Cedric Le Goater
     [not found]                               ` <49C1175F.9060600-GANU6spQydw@public.gmane.org>
2009-03-18 15:55                                 ` Dan Smith
     [not found]                                   ` <874oxq6d1x.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-18 16:02                                     ` Cedric Le Goater
2009-03-18 19:50                                 ` Mike Waychison
     [not found]                                   ` <49C1506C.1080500-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2009-03-19  0:10                                     ` Eric W. Biederman
     [not found]                                       ` <m1bprye5io.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-19  0:46                                         ` Mike Waychison
     [not found]                                           ` <49C195CF.1080506-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2009-03-19  1:06                                             ` Eric W. Biederman
     [not found]                                               ` <m1ab7icodl.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-19  1:51                                                 ` Mike Waychison
     [not found]                                                   ` <49C1A52D.4000503-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2009-03-19  3:28                                                     ` Eric W. Biederman
     [not found]                                                       ` <m1iqm6xkc7.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-20 17:26                                                         ` Serge E. Hallyn
     [not found]                                                           ` <20090320172616.GA7203-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-20 19:51                                                             ` Mike Waychison
     [not found]                                                               ` <49C3F3C0.30100-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2009-03-20 20:40                                                                 ` Serge E. Hallyn
2009-03-20 20:53                                                                 ` Oren Laadan
2009-03-20 23:26                                                             ` Eric W. Biederman
     [not found]                                                               ` <m1d4cb3he5.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-21  2:38                                                                 ` Serge E. Hallyn
     [not found]                                                                   ` <20090321023834.GA21064-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org>
2009-03-21  3:39                                                                     ` Eric W. Biederman
     [not found]                                                                       ` <m1prgbzgqq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-21 14:51                                                                         ` Serge E. Hallyn
2009-03-12 22:48         ` Daniel Lezcano
     [not found]           ` <49B99144.9000106-GANU6spQydw@public.gmane.org>
2009-03-12 22:58             ` Dan Smith
     [not found]               ` <877i2upcvo.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-12 23:11                 ` Daniel Lezcano
     [not found]                   ` <49B996BC.1090908-GANU6spQydw@public.gmane.org>
2009-03-12 23:13                     ` Dan Smith
     [not found]                       ` <873adipc5l.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-12 23:24                         ` Daniel Lezcano
     [not found]                           ` <49B999A6.2000005-GANU6spQydw@public.gmane.org>
2009-03-13 15:30                             ` Serge E. Hallyn
     [not found]                               ` <20090313153004.GA8317-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-13 15:51                                 ` Daniel Lezcano
     [not found]                                   ` <49BA811C.4070302-GANU6spQydw@public.gmane.org>
2009-03-13 17:15                                     ` Serge E. Hallyn
     [not found]                                       ` <20090313171556.GB10685-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-13 17:53                                         ` Daniel Lezcano
     [not found]                                           ` <49BA9D9C.2030208-GANU6spQydw@public.gmane.org>
2009-03-25 12:01                                             ` Eric W. Biederman
2009-03-13 15:59                         ` Cedric Le Goater
     [not found]                           ` <49BA82CE.4090206-GANU6spQydw@public.gmane.org>
2009-03-13 16:04                             ` Daniel Lezcano
2009-03-18  8:32             ` Oren Laadan
2009-03-18  8:35   ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.