[PATCH] c/r: Add UTS support (v4)

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] c/r: Add UTS support (v4)
@ 2009-03-18 18:51 Dan Smith
       [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Dan Smith @ 2009-03-18 18:51 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: adobriyan-Re5JQEeQqe8AvxtiuMwx3w

This patch adds a "phase" of checkpoint that saves out information about any
namespaces the task(s) may have.  Do this by tracking the namespace objects
of the tasks and making sure that tasks with the same namespace that follow
get properly referenced in the checkpoint stream.  Note that for now, we
refuse to checkpoint if all tasks in the set don't share the same set of
*all* namespaces.

Restart is handled in userspace by reading the UTS record(s), calling
unshare() and setting the hostname accordingly.  See my changes to
mktree.c for details.

I tested this with single and multiple task restore, on top of Oren's
v13 tree.

Changes:
  - Remove the kernel restore path
  - Punt on nested namespaces
  - Use __NEW_UTS_LEN in nodename and domainname buffers
  - Add a note to Documentation/checkpoint/internals.txt to indicate where
    in the save/restore process the UTS information is kept
  - Store (and track) the objref of the namespace itself instead of the
    nsproxy (based on comments from Dave on IRC)
  - Remove explicit check for non-root nsproxy
  - Store the nodename and domainname lengths and use cr_write_string()
    to store the actual name strings

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Cc: adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org
Cc: orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org
---
 Documentation/checkpoint/internals.txt |    2 +
 checkpoint/checkpoint.c                |   80 ++++++++++++++++++++++++++++++++
 checkpoint/objhash.c                   |    7 +++
 checkpoint/restart.c                   |    1 +
 include/linux/checkpoint.h             |    1 +
 include/linux/checkpoint_hdr.h         |   14 ++++++
 6 files changed, 105 insertions(+), 0 deletions(-)

diff --git a/Documentation/checkpoint/internals.txt b/Documentation/checkpoint/internals.txt
index b363e83..7a2488b 100644
--- a/Documentation/checkpoint/internals.txt
+++ b/Documentation/checkpoint/internals.txt
@@ -12,6 +12,8 @@ The order of operations, both save and restore, is as follows:
 
 * Process forest: [TBD] tasks and their relationships
 
+* Namespace section: per-container namespace information
+
 * Per task data (for each task):
   -> task state: elements of task_struct
   -> thread state: elements of thread_struct and thread_info
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 64155de..228bdae 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -193,6 +193,82 @@ static int cr_write_tail(struct cr_ctx *ctx)
 	return ret;
 }
 
+static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct new_utsname *n = &t->nsproxy->uts_ns->name;
+	int ret;
+
+	h.type = CR_HDR_UTSNS;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh->nodename_len = sizeof(n->nodename);
+	hh->domainname_len = sizeof(n->domainname);
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret < 0)
+		goto out;
+
+	ret = cr_write_string(ctx, n->nodename, hh->nodename_len);
+	if (ret < 0)
+		goto out;
+
+	ret = cr_write_string(ctx, n->domainname, hh->domainname_len);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
+static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct nsproxy *nsp = t->nsproxy;
+	int ret;
+
+	h.type = CR_HDR_NS;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh->types = 0;
+
+	if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0))
+		hh->types |= CR_NS_UTS;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret)
+		goto out;
+
+	if (hh->types & CR_NS_UTS) {
+		ret = cr_write_ns_uts(ctx, t);
+		if (ret < 0)
+			goto out;
+
+		/* FIXME: Write other namespaces here */
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+
+	return ret;
+}
+
+static int cr_write_all_namespaces(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		pr_debug("dumping ns for task #%d\n", n);
+		ret = cr_write_namespaces(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
 /* dump the task_struct of a given task */
 static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
 {
@@ -549,6 +625,10 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		goto out;
 
+	ret = cr_write_all_namespaces(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_all_tasks(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index ee31b38..afcf1d1 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -12,6 +12,7 @@
 #include <linux/file.h>
 #include <linux/hash.h>
 #include <linux/checkpoint.h>
+#include <linux/utsname.h>
 
 struct cr_objref {
 	int objref;
@@ -35,6 +36,9 @@ static void cr_obj_ref_drop(struct cr_objref *obj)
 	case CR_OBJ_FILE:
 		fput((struct file *) obj->ptr);
 		break;
+	case CR_OBJ_UTSNS:
+		put_uts_ns((struct uts_namespace *) obj->ptr);
+		break;
 	default:
 		BUG();
 	}
@@ -46,6 +50,9 @@ static void cr_obj_ref_grab(struct cr_objref *obj)
 	case CR_OBJ_FILE:
 		get_file((struct file *) obj->ptr);
 		break;
+	case CR_OBJ_UTSNS:
+		get_uts_ns((struct uts_namespace *) obj->ptr);
+		break;
 	default:
 		BUG();
 	}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 7ec4de4..0ed01aa 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -15,6 +15,7 @@
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/utsname.h>
 
 #include "checkpoint_arch.h"
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 217cf6e..02c2990 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -75,6 +75,7 @@ extern void cr_ctx_put(struct cr_ctx *ctx);
 
 enum {
 	CR_OBJ_FILE = 1,
+	CR_OBJ_UTSNS,
 	CR_OBJ_MAX
 };
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 6dc739f..886ab53 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -49,6 +49,8 @@ enum {
 	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
+	CR_HDR_NS,
+	CR_HDR_UTSNS,
 
 	CR_HDR_MM = 201,
 	CR_HDR_VMA,
@@ -156,4 +158,16 @@ struct cr_hdr_fd_data {
 	__u64 f_version;
 } __attribute__((aligned(8)));
 
+#define CR_NS_UTS 1
+
+struct cr_hdr_namespaces {
+	__u32 types;   /* NS records that follow this */
+	__u32 uts_ref; /* Objref of matching UTS namespace */
+};
+
+struct cr_hdr_utsns {
+	__u32 nodename_len;
+	__u32 domainname_len;
+};
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

[parent not found: <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-19 22:26   ` Oren Laadan
       [not found]     ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-19 22:28   ` Oren Laadan
  2009-03-24 15:07   ` Oren Laadan
  2 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-19 22:26 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w


So what happens in the following scenario:

* task A is the container init(1)
* A calls fork() to create task B
* B calls unshare(CLONE_NEWUTS)
* B calls clone(CLONE_PARENT) to create task C

Now C and B have the same UTS namespace, which differs from A's, but
they will each have a different UTS namespace when restarted with this
patch.

This is part of a larger complexity related to how CLONE_PARENT works.
Also related to how session IDs are inherited.

Two approaches to solve this are:

a) Identify, in mktree, that this was the case, and impose an order on
the forks/clones to recreate the same dependency (an algorithm for this
is described in [1])

b) Do it in the kernel: for each nsproxy (identified by an objref) the
first task that has it will create it during restart, in or out of the
kernel, and the next task will simply attach to the existing one that
will be deposited in the objhash.

Oren.

[1] "Transparent Checkpoint/Restart of Multiple Processes on Commodity
Operating Systems",
http://www1.cs.columbia.edu/~orenl/papers/usenix07-checkpoint.pdf

Dan Smith wrote:
> This patch adds a "phase" of checkpoint that saves out information about any
> namespaces the task(s) may have.  Do this by tracking the namespace objects
> of the tasks and making sure that tasks with the same namespace that follow
> get properly referenced in the checkpoint stream.  Note that for now, we
> refuse to checkpoint if all tasks in the set don't share the same set of
> *all* namespaces.
> 
> Restart is handled in userspace by reading the UTS record(s), calling
> unshare() and setting the hostname accordingly.  See my changes to
> mktree.c for details.
> 
> I tested this with single and multiple task restore, on top of Oren's
> v13 tree.

[...]

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]     ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-19 22:39       ` Dan Smith
       [not found]         ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Dan Smith @ 2009-03-19 22:39 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

OL> So what happens in the following scenario:

OL> * task A is the container init(1)
OL> * A calls fork() to create task B
OL> * B calls unshare(CLONE_NEWUTS)
OL> * B calls clone(CLONE_PARENT) to create task C

In the previous version of the patch, I failed the checkpoint if this
was the case by making sure that all tasks in the set had the same
nsproxy.  You said in IRC that this was already done elsewhere in the
infrastructure, but now that I look I don't see that anywhere.

The check I had was in response to Daniel's comments about avoiding
the situation for the time being by making sure that all the tasks had
the same set of namespaces (i.e. the same nsproxy at the time of
checkpoint).

OL> Two approaches to solve this are:

OL> a) Identify, in mktree, that this was the case, and impose an
OL> order on the forks/clones to recreate the same dependency (an
OL> algorithm for this is described in [1])

OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
OL> the first task that has it will create it during restart, in or
OL> out of the kernel, and the next task will simply attach to the
OL> existing one that will be deposited in the objhash.

I think that prior discussion led to the conclusion that simplicity
wins for the moment, but if you want to solve it now I can cook up
some changes.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]         ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-19 22:58           ` Oren Laadan
       [not found]             ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-19 22:58 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dan Smith wrote:
> OL> So what happens in the following scenario:
> 
> OL> * task A is the container init(1)
> OL> * A calls fork() to create task B
> OL> * B calls unshare(CLONE_NEWUTS)
> OL> * B calls clone(CLONE_PARENT) to create task C
> 
> In the previous version of the patch, I failed the checkpoint if this
> was the case by making sure that all tasks in the set had the same
> nsproxy.  You said in IRC that this was already done elsewhere in the
> infrastructure, but now that I look I don't see that anywhere.
> 

in cr_may_checkpoint_task():

 285         /* FIXME: change this for nested containers */
 286         if (task_nsproxy(t) != ctx->root_nsproxy)
 287                 return -EPERM;

> The check I had was in response to Daniel's comments about avoiding
> the situation for the time being by making sure that all the tasks had
> the same set of namespaces (i.e. the same nsproxy at the time of
> checkpoint).
> 
> OL> Two approaches to solve this are:
> 
> OL> a) Identify, in mktree, that this was the case, and impose an
> OL> order on the forks/clones to recreate the same dependency (an
> OL> algorithm for this is described in [1])
> 
> OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
> OL> the first task that has it will create it during restart, in or
> OL> out of the kernel, and the next task will simply attach to the
> OL> existing one that will be deposited in the objhash.
> 
> I think that prior discussion led to the conclusion that simplicity
> wins for the moment, but if you want to solve it now I can cook up
> some changes.
> 

If we keep the assumption, for simplicity, that all tasks share the
same namespace, then the checkpoint code should check, once, how that
nsproxy differs from the container's parent (except for the obvious
pidns).

If it does differ, e.g. in uts, then the checkpoint should save the
uts state _once_ - as in global data. Restart will restore the state
also _once_, for the init of the container (the first task restored),
_before_ it forks the rest of the tree.

Otherwise, we don't get the same outcome.

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]             ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-19 23:13               ` Oren Laadan
  2009-03-20 13:56                 ` Dan Smith
       [not found]                 ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-20 18:05               ` Serge E. Hallyn
  1 sibling, 2 replies; 23+ messages in thread
From: Oren Laadan @ 2009-03-19 23:13 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Oren Laadan wrote:
> 
> Dan Smith wrote:
>> OL> So what happens in the following scenario:
>>
>> OL> * task A is the container init(1)
>> OL> * A calls fork() to create task B
>> OL> * B calls unshare(CLONE_NEWUTS)
>> OL> * B calls clone(CLONE_PARENT) to create task C
>>
>> In the previous version of the patch, I failed the checkpoint if this
>> was the case by making sure that all tasks in the set had the same
>> nsproxy.  You said in IRC that this was already done elsewhere in the
>> infrastructure, but now that I look I don't see that anywhere.
>>
> 
> in cr_may_checkpoint_task():
> 
>  285         /* FIXME: change this for nested containers */
>  286         if (task_nsproxy(t) != ctx->root_nsproxy)
>  287                 return -EPERM;
> 
>> The check I had was in response to Daniel's comments about avoiding
>> the situation for the time being by making sure that all the tasks had
>> the same set of namespaces (i.e. the same nsproxy at the time of
>> checkpoint).
>>
>> OL> Two approaches to solve this are:
>>
>> OL> a) Identify, in mktree, that this was the case, and impose an
>> OL> order on the forks/clones to recreate the same dependency (an
>> OL> algorithm for this is described in [1])
>>
>> OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
>> OL> the first task that has it will create it during restart, in or
>> OL> out of the kernel, and the next task will simply attach to the
>> OL> existing one that will be deposited in the objhash.
>>
>> I think that prior discussion led to the conclusion that simplicity
>> wins for the moment, but if you want to solve it now I can cook up
>> some changes.
>>
> 
> If we keep the assumption, for simplicity, that all tasks share the
> same namespace, then the checkpoint code should check, once, how that
> nsproxy differs from the container's parent (except for the obvious
> pidns).
> 
> If it does differ, e.g. in uts, then the checkpoint should save the
> uts state _once_ - as in global data. Restart will restore the state
> also _once_, for the init of the container (the first task restored),
> _before_ it forks the rest of the tree.
> 
> Otherwise, we don't get the same outcome.

... I re-read the code to make sure,so -

You indeed do it before all tasks are forked, so that's correct.

What got me confused was that you loop over all tasks, which is not
needed because was assume they all share the name nsproxy; And in
restart, you unshare() many times by the same task, so all but the
last unshare() are useless.  In other words, I wonder what is the
need for that loop over all processes.

Here is a suggestion for a simple change that is likely to be a step
towards more generic solution in the future:

The nsprox is a property of a task, and it is (possibly) shared. We
can put the data either on the pids_arr or on the cr_hdr_task itself.
For simplicity (and to work with your scheme) let's assume the former.

We can extend the pids_arr to have a ns_objref field, that will hold
the objref of the nxproxy. Of course, now, all pids_arr will have the
same objref, or else ...  This data will follow the pids_arr data in
the image.

During checkpoint, we read the pids_arr from the image, and then for
each objref of an nsproxy that is seen for the first time, we read
the state of that nsproxy and restore a new one. (In our simple case,
there will always be exactly one).

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-19 23:13               ` Oren Laadan
@ 2009-03-20 13:56                 ` Dan Smith
       [not found]                 ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 0 replies; 23+ messages in thread
From: Dan Smith @ 2009-03-20 13:56 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w, Dave Hansen

OL> What got me confused was that you loop over all tasks, which is
OL> not needed because was assume they all share the name nsproxy; And
OL> in restart, you unshare() many times by the same task, so all but
OL> the last unshare() are useless.  In other words, I wonder what is
OL> the need for that loop over all processes.

You're exactly right, but this wasn't my intent.  It was left over
from the first iteration of the patch.

OL> Here is a suggestion for a simple change that is likely to be a step
OL> towards more generic solution in the future:

OL> The nsprox is a property of a task, and it is (possibly) shared. We
OL> can put the data either on the pids_arr or on the cr_hdr_task itself.
OL> For simplicity (and to work with your scheme) let's assume the former.

OL> We can extend the pids_arr to have a ns_objref field, that will hold
OL> the objref of the nxproxy. Of course, now, all pids_arr will have the
OL> same objref, or else ...  This data will follow the pids_arr data in
OL> the image.

OL> During checkpoint, we read the pids_arr from the image, and then for
OL> each objref of an nsproxy that is seen for the first time, we read
OL> the state of that nsproxy and restore a new one. (In our simple case,
OL> there will always be exactly one).

Storing an objref of the nsproxy for each task to track changes at
that level is what I did, and is the reason for the gratuitous
unshare() that is left there.  The idea was only to unshare() when I
encountered a new nsproxy objref.

However, Dave had some comments on this, which he made only on IRC.
Dave do you want to document them here for the benefit of the list?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                 ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-20 18:10                   ` Serge E. Hallyn
       [not found]                     ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Serge E. Hallyn @ 2009-03-20 18:10 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> What got me confused was that you loop over all tasks, which is not
> needed because was assume they all share the name nsproxy; And in
> restart, you unshare() many times by the same task, so all but the
> last unshare() are useless.  In other words, I wonder what is the
> need for that loop over all processes.
> 
> Here is a suggestion for a simple change that is likely to be a step
> towards more generic solution in the future:
> 
> The nsprox is a property of a task, and it is (possibly) shared. We
> can put the data either on the pids_arr or on the cr_hdr_task itself.
> For simplicity (and to work with your scheme) let's assume the former.
> 
> We can extend the pids_arr to have a ns_objref field, that will hold
> the objref of the nxproxy. Of course, now, all pids_arr will have the
> same objref, or else ...  This data will follow the pids_arr data in
> the image.
> 
> During checkpoint, we read the pids_arr from the image, and then for
> each objref of an nsproxy that is seen for the first time, we read
> the state of that nsproxy and restore a new one. (In our simple case,
> there will always be exactly one).

The nsproxy is not the right thing to record.  Rather, it
should record a bitmap  of namespaces which are to be private
from the parent task.  Then for each private ns, an optional
section with configuration info.

(maybe that is waht you meant by 'recording the nsproxy')

-serge

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                     ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-20 19:34                       ` Oren Laadan
       [not found]                         ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-20 19:34 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>> What got me confused was that you loop over all tasks, which is not
>> needed because was assume they all share the name nsproxy; And in
>> restart, you unshare() many times by the same task, so all but the
>> last unshare() are useless.  In other words, I wonder what is the
>> need for that loop over all processes.
>>
>> Here is a suggestion for a simple change that is likely to be a step
>> towards more generic solution in the future:
>>
>> The nsprox is a property of a task, and it is (possibly) shared. We
>> can put the data either on the pids_arr or on the cr_hdr_task itself.
>> For simplicity (and to work with your scheme) let's assume the former.
>>
>> We can extend the pids_arr to have a ns_objref field, that will hold
>> the objref of the nxproxy. Of course, now, all pids_arr will have the
>> same objref, or else ...  This data will follow the pids_arr data in
>> the image.
>>
>> During checkpoint, we read the pids_arr from the image, and then for
>> each objref of an nsproxy that is seen for the first time, we read
>> the state of that nsproxy and restore a new one. (In our simple case,
>> there will always be exactly one).
> 
> The nsproxy is not the right thing to record.  Rather, it
> should record a bitmap  of namespaces which are to be private
> from the parent task.  Then for each private ns, an optional
> section with configuration info.

Rethinking, I agree that the nsproxy is not the right thing to record.
On the other hand, a bitmap is insufficient to expose the relationships
of sharing.

Putting aside the pidns for a second, I'd think that all other ns's may
be relative easy to handle, even nested. Let me try to explain:

1) An nsproxy is a shared resource, so it gets an nsproxy_objref

2) Each ns in an nsproxy is itself shared (analoguous to file and then inode,
in pipes), so each ns also gets an ns_objref.

3) In checkpoint, for each task we'll do:
	if (nsproxy seen first time) {
		alloc nsproxy_objref
		for each ns in nsproxy {
			if (ns seen first time) {
				alloc ns_objref
				save state of ns
			} else {
				save existing ns_objref
			}
		}
	} else {
		save existing nsproxy_objref
	}

4) In restart, we'll do the same, but also creating the ns's and the nsproxy's

Just as we can restore a file in one process and use the point for another
process, we could pull the same trick to assign nsproxy's to processes, and
to assign ns's to nsproxy's.

The only caveat that is left, and which is also a constraint on the process,
is the nspid - which, in a sense, is a property of the nsproxy.

If we are to restore those in user space, then the nspid will eventually
dictate, I believe, the order of forks when creating the tasks, just as the
session id may be a constraint. I do need to think it further.

Oren.

> (maybe that is waht you meant by 'recording the nsproxy')
> 
> -serge
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                         ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-20 19:59                           ` Dave Hansen
  2009-03-20 20:48                             ` Oren Laadan
  2009-03-23 14:52                             ` Dan Smith
  0 siblings, 2 replies; 23+ messages in thread
From: Dave Hansen @ 2009-03-20 19:59 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

On Fri, 2009-03-20 at 15:34 -0400, Oren Laadan wrote:
> 
> 3) In checkpoint, for each task we'll do:
>         if (nsproxy seen first time) {
>                 alloc nsproxy_objref
>                 for each ns in nsproxy {
>                         if (ns seen first time) {
>                                 alloc ns_objref
>                                 save state of ns
>                         } else {
>                                 save existing ns_objref
>                         }
>                 }
>         } else {
>                 save existing nsproxy_objref
>         }

The problem with this is that the nsproxy is in no way, shape, or form
exposed to userspace.  It is wholly an internal implementation detail.

Take out all the nsproxy bits you have above, and do this for each task:

>                for each ns in nsproxy {
>                         if (ns seen first time) {
>                                 alloc ns_objref
>                                 save state of ns
>                         } else {
>                                 save existing ns_objref
>                         }
>                 }

And it will still _function_ *exactly* the same.  It won't be quite as
cache or space compact, but that's fine.

If you're worried about this extra space or cache impact, it can be
fixed up at restart with:

	if (nsproxy_equal(tsk->nsproxy, parent->nsproxy)) {
		get_nsproxy(parent->nsproxy);
		put_nsproxy(tsk->nsproxy);
		tsk->nsproxy = parent->nsproxy;
	}

Voila.  It won't be perfect.  If the parent doesn't work we could also
try the siblings.  

-- Dave

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-20 19:59                           ` Dave Hansen
@ 2009-03-20 20:48                             ` Oren Laadan
       [not found]                               ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-23 14:52                             ` Dan Smith
  1 sibling, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-20 20:48 UTC (permalink / raw)
  To: Dave Hansen
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dave Hansen wrote:
> On Fri, 2009-03-20 at 15:34 -0400, Oren Laadan wrote:
>> 3) In checkpoint, for each task we'll do:
>>         if (nsproxy seen first time) {
>>                 alloc nsproxy_objref
>>                 for each ns in nsproxy {
>>                         if (ns seen first time) {
>>                                 alloc ns_objref
>>                                 save state of ns
>>                         } else {
>>                                 save existing ns_objref
>>                         }
>>                 }
>>         } else {
>>                 save existing nsproxy_objref
>>         }
> 
> The problem with this is that the nsproxy is in no way, shape, or form
> exposed to userspace.  It is wholly an internal implementation detail.
> 
> Take out all the nsproxy bits you have above, and do this for each task:
> 
>>                for each ns in nsproxy {
>>                         if (ns seen first time) {
>>                                 alloc ns_objref
>>                                 save state of ns
>>                         } else {
>>                                 save existing ns_objref
>>                         }
>>                 }
> 
> And it will still _function_ *exactly* the same.  It won't be quite as
> cache or space compact, but that's fine.
> 
> If you're worried about this extra space or cache impact, it can be
> fixed up at restart with:
> 
> 	if (nsproxy_equal(tsk->nsproxy, parent->nsproxy)) {
> 		get_nsproxy(parent->nsproxy);
> 		put_nsproxy(tsk->nsproxy);
> 		tsk->nsproxy = parent->nsproxy;
> 	}
> 
> Voila.  It won't be perfect.  If the parent doesn't work we could also
> try the siblings.  

Yup.

Does that scale well with many (1000's) of tasks ?

(The motivation to expose 'nsproxy_objref' to user space was to allow
user-space to decide on a sequence of clones/unshared that will create
an equivalent process tree with space-efficient nsproxy's).

Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref'
to know a-priori that it is common, and save the "compare-and-swap"
phase altogether.

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                               ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-20 21:00                                 ` Dave Hansen
  2009-03-20 21:26                                   ` Oren Laadan
  0 siblings, 1 reply; 23+ messages in thread
From: Dave Hansen @ 2009-03-20 21:00 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote:
> Does that scale well with many (1000's) of tasks ?
> 
> (The motivation to expose 'nsproxy_objref' to user space was to allow
> user-space to decide on a sequence of clones/unshared that will create
> an equivalent process tree with space-efficient nsproxy's).

OK, so you're saying that there's no way for userspace to tell that a
set of tasks share an nsproxy other than exporting that nsproxy?

> Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref'
> to know a-priori that it is common, and save the "compare-and-swap"
> phase altogether.

Sure, that's workable.  The important thing is that it is an
optimization that can be left for later.

-- Dave

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-20 21:00                                 ` Dave Hansen
@ 2009-03-20 21:26                                   ` Oren Laadan
       [not found]                                     ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-20 21:26 UTC (permalink / raw)
  To: Dave Hansen
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dave Hansen wrote:
> On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote:
>> Does that scale well with many (1000's) of tasks ?
>>
>> (The motivation to expose 'nsproxy_objref' to user space was to allow
>> user-space to decide on a sequence of clones/unshared that will create
>> an equivalent process tree with space-efficient nsproxy's).
> 
> OK, so you're saying that there's no way for userspace to tell that a
> set of tasks share an nsproxy other than exporting that nsproxy?

I don't think so. Maybe the namespaces people know better.

Oren.

> 
>> Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref'
>> to know a-priori that it is common, and save the "compare-and-swap"
>> phase altogether.
> 
> Sure, that's workable.  The important thing is that it is an
> optimization that can be left for later.
> 
> -- Dave
> 
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                                     ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-20 21:39                                       ` Dave Hansen
  2009-03-20 21:58                                         ` Oren Laadan
  0 siblings, 1 reply; 23+ messages in thread
From: Dave Hansen @ 2009-03-20 21:39 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

On Fri, 2009-03-20 at 17:26 -0400, Oren Laadan wrote:
> Dave Hansen wrote:
> > On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote:
> >> Does that scale well with many (1000's) of tasks ?
> >>
> >> (The motivation to expose 'nsproxy_objref' to user space was to allow
> >> user-space to decide on a sequence of clones/unshared that will create
> >> an equivalent process tree with space-efficient nsproxy's).
> > 
> > OK, so you're saying that there's no way for userspace to tell that a
> > set of tasks share an nsproxy other than exporting that nsproxy?
> 
> I don't think so. Maybe the namespaces people know better.

Please go look at the code.

Two processes share an nsproxy (or *can* share an nsproxy) when all of
the namespace pointers are the same.  After allocation and
initialization, we basically don't ever write to the nsproxy.  We just
hook it into a task and run with it.

If anything ever unshares one of the nsproxy namespaces, before writing
to it we *first* create a new nsproxy which is a copy of the old one.

We basically open-code copy-on-write semantics for the nsproxy.

This means that it is safe for any two tasks that share all of the
pointers in the nsproxy to share an nsproxy itself.

-- Dave

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-20 21:39                                       ` Dave Hansen
@ 2009-03-20 21:58                                         ` Oren Laadan
  0 siblings, 0 replies; 23+ messages in thread
From: Oren Laadan @ 2009-03-20 21:58 UTC (permalink / raw)
  To: Dave Hansen
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dave Hansen wrote:
> On Fri, 2009-03-20 at 17:26 -0400, Oren Laadan wrote:
>> Dave Hansen wrote:
>>> On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote:
>>>> Does that scale well with many (1000's) of tasks ?
>>>>
>>>> (The motivation to expose 'nsproxy_objref' to user space was to allow
>>>> user-space to decide on a sequence of clones/unshared that will create
>>>> an equivalent process tree with space-efficient nsproxy's).
>>> OK, so you're saying that there's no way for userspace to tell that a
>>> set of tasks share an nsproxy other than exporting that nsproxy?
>> I don't think so. Maybe the namespaces people know better.
> 
> Please go look at the code.


Heheh .. read: "I don't think so" as in "I don't think tasks in user
space can tell whether or not they share an nsproxy in the kernel."

(and the corollary follows ...)

> 
> Two processes share an nsproxy (or *can* share an nsproxy) when all of
> the namespace pointers are the same.  After allocation and
> initialization, we basically don't ever write to the nsproxy.  We just
> hook it into a task and run with it.
> 
> If anything ever unshares one of the nsproxy namespaces, before writing
> to it we *first* create a new nsproxy which is a copy of the old one.
> 
> We basically open-code copy-on-write semantics for the nsproxy.
> 
> This means that it is safe for any two tasks that share all of the
> pointers in the nsproxy to share an nsproxy itself.
> 
> -- Dave

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-20 19:59                           ` Dave Hansen
  2009-03-20 20:48                             ` Oren Laadan
@ 2009-03-23 14:52                             ` Dan Smith
       [not found]                               ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  1 sibling, 1 reply; 23+ messages in thread
From: Dan Smith @ 2009-03-23 14:52 UTC (permalink / raw)
  To: Dave Hansen
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

DH> for each ns in nsproxy {
DH>   if (ns seen first time) {
DH>     alloc ns_objref
DH>     save state of ns
DH>   } else {
DH>     save existing ns_objref
DH>   }
DH> }

Isn't this precisely what I have in the latest patch?  Since there is
only one supported namespace type, perhaps the for loop is hard to
recognize...

The bitfield serves only to describe which *new* namespace records
follow the general one, but the objref of each (supported) namespace
is included in the initial record.  Now that I think of it, I can do
away with the bitfield to avoid confusion.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                               ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-23 15:08                                 ` Serge E. Hallyn
  0 siblings, 0 replies; 23+ messages in thread
From: Serge E. Hallyn @ 2009-03-23 15:08 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w, Dave Hansen

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> DH> for each ns in nsproxy {
> DH>   if (ns seen first time) {
> DH>     alloc ns_objref
> DH>     save state of ns
> DH>   } else {
> DH>     save existing ns_objref
> DH>   }
> DH> }
> 
> Isn't this precisely what I have in the latest patch?  Since there is
> only one supported namespace type, perhaps the for loop is hard to
> recognize...

Yes, I still think your patch is good...

> The bitfield serves only to describe which *new* namespace records
> follow the general one, but the objref of each (supported) namespace
> is included in the initial record.  Now that I think of it, I can do
> away with the bitfield to avoid confusion.

How about you tweak both Nathan and Oren (who I think both have
IPC c/r patchsets) by adding dummy ipc c/r support.  Just define
CR_OBJ_IPCNS, make cr_wrte_ns_ipc() empty and call it from
cr_write_namespaces().

-serge

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]             ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-19 23:13               ` Oren Laadan
@ 2009-03-20 18:05               ` Serge E. Hallyn
       [not found]                 ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 23+ messages in thread
From: Serge E. Hallyn @ 2009-03-20 18:05 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Dan Smith wrote:
> > OL> So what happens in the following scenario:
> > 
> > OL> * task A is the container init(1)
> > OL> * A calls fork() to create task B
> > OL> * B calls unshare(CLONE_NEWUTS)
> > OL> * B calls clone(CLONE_PARENT) to create task C
> > 
> > In the previous version of the patch, I failed the checkpoint if this
> > was the case by making sure that all tasks in the set had the same
> > nsproxy.  You said in IRC that this was already done elsewhere in the
> > infrastructure, but now that I look I don't see that anywhere.
> > 
> 
> in cr_may_checkpoint_task():
> 
>  285         /* FIXME: change this for nested containers */
>  286         if (task_nsproxy(t) != ctx->root_nsproxy)
>  287                 return -EPERM;
> 
> > The check I had was in response to Daniel's comments about avoiding
> > the situation for the time being by making sure that all the tasks had
> > the same set of namespaces (i.e. the same nsproxy at the time of
> > checkpoint).
> > 
> > OL> Two approaches to solve this are:
> > 
> > OL> a) Identify, in mktree, that this was the case, and impose an
> > OL> order on the forks/clones to recreate the same dependency (an
> > OL> algorithm for this is described in [1])
> > 
> > OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
> > OL> the first task that has it will create it during restart, in or
> > OL> out of the kernel, and the next task will simply attach to the
> > OL> existing one that will be deposited in the objhash.
> > 
> > I think that prior discussion led to the conclusion that simplicity
> > wins for the moment, but if you want to solve it now I can cook up
> > some changes.
> > 
> 
> If we keep the assumption, for simplicity, that all tasks share the
> same namespace, then the checkpoint code should check, once, how that
> nsproxy differs from the container's parent (except for the obvious
> pidns).

I disagree.  Whether the container had its own utsns doesn't
affect whether it should have a private utsns on restart.

> If it does differ, e.g. in uts, then the checkpoint should save the
> uts state _once_ - as in global data. Restart will restore the state
> also _once_, for the init of the container (the first task restored),
> _before_ it forks the rest of the tree.
> 
> Otherwise, we don't get the same outcome.

Again I disagree.  If we were planning on never supporting nested
uts namespaces it woudl be fine, but what you are talking about
is making sure we have to break the checkpoint format later to support
nested namespaces.

Rather, we should do:

1. record the hostname for the container in global data.
2. The restart program can decide whether to honor the global
   checkpoint image hostname or not.  It can either use a
   command line option, or check whether the recorded hostname
   is different from the restart host.  I prefer the former.
3. for each task, leave an optional spot for hostname.  If
   there is a hostname, then it will unshare(CLONE_NEWUTS)
   and set its hostname before calling sys_restart() or
   cloning any child tasks.

-serge

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                 ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-20 19:38                   ` Oren Laadan
       [not found]                     ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-20 19:38 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>
>> Dan Smith wrote:
>>> OL> So what happens in the following scenario:
>>>
>>> OL> * task A is the container init(1)
>>> OL> * A calls fork() to create task B
>>> OL> * B calls unshare(CLONE_NEWUTS)
>>> OL> * B calls clone(CLONE_PARENT) to create task C
>>>
>>> In the previous version of the patch, I failed the checkpoint if this
>>> was the case by making sure that all tasks in the set had the same
>>> nsproxy.  You said in IRC that this was already done elsewhere in the
>>> infrastructure, but now that I look I don't see that anywhere.
>>>
>> in cr_may_checkpoint_task():
>>
>>  285         /* FIXME: change this for nested containers */
>>  286         if (task_nsproxy(t) != ctx->root_nsproxy)
>>  287                 return -EPERM;
>>
>>> The check I had was in response to Daniel's comments about avoiding
>>> the situation for the time being by making sure that all the tasks had
>>> the same set of namespaces (i.e. the same nsproxy at the time of
>>> checkpoint).
>>>
>>> OL> Two approaches to solve this are:
>>>
>>> OL> a) Identify, in mktree, that this was the case, and impose an
>>> OL> order on the forks/clones to recreate the same dependency (an
>>> OL> algorithm for this is described in [1])
>>>
>>> OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
>>> OL> the first task that has it will create it during restart, in or
>>> OL> out of the kernel, and the next task will simply attach to the
>>> OL> existing one that will be deposited in the objhash.
>>>
>>> I think that prior discussion led to the conclusion that simplicity
>>> wins for the moment, but if you want to solve it now I can cook up
>>> some changes.
>>>
>> If we keep the assumption, for simplicity, that all tasks share the
>> same namespace, then the checkpoint code should check, once, how that
>> nsproxy differs from the container's parent (except for the obvious
>> pidns).
> 
> I disagree.  Whether the container had its own utsns doesn't
> affect whether it should have a private utsns on restart.

Right, I missed that...

>> If it does differ, e.g. in uts, then the checkpoint should save the
>> uts state _once_ - as in global data. Restart will restore the state
>> also _once_, for the init of the container (the first task restored),
>> _before_ it forks the rest of the tree.
>>
>> Otherwise, we don't get the same outcome.
> 
> Again I disagree.  If we were planning on never supporting nested
> uts namespaces it woudl be fine, but what you are talking about
> is making sure we have to break the checkpoint format later to support
> nested namespaces.

We don't know how we are to support nested namespaces. So either we solve
it now, or we do something that is bound to break later. The image format
is going to change anyways as we move along.

> 
> Rather, we should do:
> 
> 1. record the hostname for the container in global data.
> 2. The restart program can decide whether to honor the global
>    checkpoint image hostname or not.  It can either use a
>    command line option, or check whether the recorded hostname
>    is different from the restart host.  I prefer the former.

Sounds good.

> 3. for each task, leave an optional spot for hostname.  If
>    there is a hostname, then it will unshare(CLONE_NEWUTS)
>    and set its hostname before calling sys_restart() or
>    cloning any child tasks.

Doesn't this imply a a specific format that is bound to break later ?

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]                     ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-20 20:42                       ` Serge E. Hallyn
  0 siblings, 0 replies; 23+ messages in thread
From: Serge E. Hallyn @ 2009-03-20 20:42 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >>
> >> Dan Smith wrote:
> >>> OL> So what happens in the following scenario:
> >>>
> >>> OL> * task A is the container init(1)
> >>> OL> * A calls fork() to create task B
> >>> OL> * B calls unshare(CLONE_NEWUTS)
> >>> OL> * B calls clone(CLONE_PARENT) to create task C
> >>>
> >>> In the previous version of the patch, I failed the checkpoint if this
> >>> was the case by making sure that all tasks in the set had the same
> >>> nsproxy.  You said in IRC that this was already done elsewhere in the
> >>> infrastructure, but now that I look I don't see that anywhere.
> >>>
> >> in cr_may_checkpoint_task():
> >>
> >>  285         /* FIXME: change this for nested containers */
> >>  286         if (task_nsproxy(t) != ctx->root_nsproxy)
> >>  287                 return -EPERM;
> >>
> >>> The check I had was in response to Daniel's comments about avoiding
> >>> the situation for the time being by making sure that all the tasks had
> >>> the same set of namespaces (i.e. the same nsproxy at the time of
> >>> checkpoint).
> >>>
> >>> OL> Two approaches to solve this are:
> >>>
> >>> OL> a) Identify, in mktree, that this was the case, and impose an
> >>> OL> order on the forks/clones to recreate the same dependency (an
> >>> OL> algorithm for this is described in [1])
> >>>
> >>> OL> b) Do it in the kernel: for each nsproxy (identified by an objref)
> >>> OL> the first task that has it will create it during restart, in or
> >>> OL> out of the kernel, and the next task will simply attach to the
> >>> OL> existing one that will be deposited in the objhash.
> >>>
> >>> I think that prior discussion led to the conclusion that simplicity
> >>> wins for the moment, but if you want to solve it now I can cook up
> >>> some changes.
> >>>
> >> If we keep the assumption, for simplicity, that all tasks share the
> >> same namespace, then the checkpoint code should check, once, how that
> >> nsproxy differs from the container's parent (except for the obvious
> >> pidns).
> > 
> > I disagree.  Whether the container had its own utsns doesn't
> > affect whether it should have a private utsns on restart.
> 
> Right, I missed that...
> 
> >> If it does differ, e.g. in uts, then the checkpoint should save the
> >> uts state _once_ - as in global data. Restart will restore the state
> >> also _once_, for the init of the container (the first task restored),
> >> _before_ it forks the rest of the tree.
> >>
> >> Otherwise, we don't get the same outcome.
> > 
> > Again I disagree.  If we were planning on never supporting nested
> > uts namespaces it woudl be fine, but what you are talking about
> > is making sure we have to break the checkpoint format later to support
> > nested namespaces.
> 
> We don't know how we are to support nested namespaces. So either we solve
> it now, or we do something that is bound to break later. The image format
> is going to change anyways as we move along.
> 
> > 
> > Rather, we should do:
> > 
> > 1. record the hostname for the container in global data.
> > 2. The restart program can decide whether to honor the global
> >    checkpoint image hostname or not.  It can either use a
> >    command line option, or check whether the recorded hostname
> >    is different from the restart host.  I prefer the former.
> 
> Sounds good.
> 
> > 3. for each task, leave an optional spot for hostname.  If
> >    there is a hostname, then it will unshare(CLONE_NEWUTS)
> >    and set its hostname before calling sys_restart() or
> >    cloning any child tasks.
> 
> Doesn't this imply a a specific format that is bound to break later ?

Not if we don't specify a format for the optional record now.
We do of course need to pick a spot for it now, and as Dan
noticed, that should be above the actual task layout so that
the info can be easily accessed by mktree.c before calling
sys_restart()...

But what the heck, like you're saying let's leave step 3 for later :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-19 22:26   ` Oren Laadan
@ 2009-03-19 22:28   ` Oren Laadan
  2009-03-24 15:07   ` Oren Laadan
  2 siblings, 0 replies; 23+ messages in thread
From: Oren Laadan @ 2009-03-19 22:28 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dan Smith wrote:
> This patch adds a "phase" of checkpoint that saves out information about any
> namespaces the task(s) may have.  Do this by tracking the namespace objects
> of the tasks and making sure that tasks with the same namespace that follow
> get properly referenced in the checkpoint stream.  Note that for now, we
> refuse to checkpoint if all tasks in the set don't share the same set of
> *all* namespaces.
> 
> Restart is handled in userspace by reading the UTS record(s), calling
> unshare() and setting the hostname accordingly.  See my changes to
> mktree.c for details.
> 
> I tested this with single and multiple task restore, on top of Oren's
> v13 tree.
> 
> Changes:
>   - Remove the kernel restore path
>   - Punt on nested namespaces
>   - Use __NEW_UTS_LEN in nodename and domainname buffers
>   - Add a note to Documentation/checkpoint/internals.txt to indicate where
>     in the save/restore process the UTS information is kept
>   - Store (and track) the objref of the namespace itself instead of the
>     nsproxy (based on comments from Dave on IRC)
>   - Remove explicit check for non-root nsproxy
>   - Store the nodename and domainname lengths and use cr_write_string()
>     to store the actual name strings
> 

[...]

> +static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	struct nsproxy *nsp = t->nsproxy;
> +	int ret;
> +
> +	h.type = CR_HDR_NS;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;
> +
> +	hh->types = 0;
> +
> +	if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0))
> +		hh->types |= CR_NS_UTS;

cr_obj_add_ptr() may fail with an error (e.g. -ENOMEM).

> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (ret)
> +		goto out;
> +
> +	if (hh->types & CR_NS_UTS) {
> +		ret = cr_write_ns_uts(ctx, t);
> +		if (ret < 0)
> +			goto out;
> +
> +		/* FIXME: Write other namespaces here */
> +	}
> + out:
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +
> +	return ret;
> +}
> +

[...]

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-19 22:26   ` Oren Laadan
  2009-03-19 22:28   ` Oren Laadan
@ 2009-03-24 15:07   ` Oren Laadan
  2009-03-24 15:21     ` Dan Smith
  2 siblings, 1 reply; 23+ messages in thread
From: Oren Laadan @ 2009-03-24 15:07 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

Dan,

You might want to look at ckpt-v14-rc1 which has some cleanups, see below.

Dan Smith wrote:
> This patch adds a "phase" of checkpoint that saves out information about any
> namespaces the task(s) may have.  Do this by tracking the namespace objects
> of the tasks and making sure that tasks with the same namespace that follow
> get properly referenced in the checkpoint stream.  Note that for now, we
> refuse to checkpoint if all tasks in the set don't share the same set of
> *all* namespaces.
> 
> Restart is handled in userspace by reading the UTS record(s), calling
> unshare() and setting the hostname accordingly.  See my changes to
> mktree.c for details.

[...]

> +static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	struct new_utsname *n = &t->nsproxy->uts_ns->name;
> +	int ret;

Need to test whether cr_hbuf_get() succeeds (while now it's trivial, in
the future it may fail if we change the allocation method).

> +
> +	h.type = CR_HDR_UTSNS;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;

The 'h.parent' fields is gone.

> +
> +	hh->nodename_len = sizeof(n->nodename);
> +	hh->domainname_len = sizeof(n->domainname);
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = cr_write_string(ctx, n->nodename, hh->nodename_len);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = cr_write_string(ctx, n->domainname, hh->domainname_len);
> + out:
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +
> +	return ret;
> +}
> +
> +static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	struct nsproxy *nsp = t->nsproxy;
> +	int ret;
> +
> +	h.type = CR_HDR_NS;
> +	h.len = sizeof(*hh);
> +	h.parent = 0;

Here too.

> +
> +	hh->types = 0;
> +
> +	if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0))
> +		hh->types |= CR_NS_UTS;

cr_obj_add_ptr() mail fail, with -ENOMEM for instance.

If we plan to support multiple uts_ns, then it makes sense to save the
'objref' value. If you omit the 'objref' because you assume a single
namespace for all for now, then you may also drop the loop on all tasks
(below), for now.

> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (ret)
> +		goto out;
> +
> +	if (hh->types & CR_NS_UTS) {
> +		ret = cr_write_ns_uts(ctx, t);
> +		if (ret < 0)
> +			goto out;
> +
> +		/* FIXME: Write other namespaces here */
> +	}
> + out:
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +
> +	return ret;
> +}
> +
> +static int cr_write_all_namespaces(struct cr_ctx *ctx)
> +{
> +	int n, ret = 0;
> +
> +	for (n = 0; n < ctx->tasks_nr; n++) {
> +		pr_debug("dumping ns for task #%d\n", n);

I changed this back to cr_debug() in response to a comment about pr_fmt.

> +		ret = cr_write_namespaces(ctx, ctx->tasks_arr[n]);
> +		if (ret < 0)
> +			break;
> +	}

I'm still unhappy with this loop. It isn't necessary now, since we assume
and enforce a single, common namespace among all tasks. It is unlikely to
be useful as is in the future because the format is likely to change anyway.
Even in the (userspace) restart part the code to read it is in the global
section as opposed to per task section. Is there any reason to keep it ?

If you are to dump the namespace information of each task, why not add it
to an already existing table of tasks - the pids_arr (cr_hdr_pids) ?

> +
> +	return ret;
> +}
> +

[...]

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] c/r: Add UTS support (v4)
  2009-03-24 15:07   ` Oren Laadan
@ 2009-03-24 15:21     ` Dan Smith
       [not found]       ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Dan Smith @ 2009-03-24 15:21 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w

I'm a little confused.  Haven't you already reviewed v4?  Perhaps you
meant to reply to v5?

OL> Need to test whether cr_hbuf_get() succeeds (while now it's
OL> trivial, in the future it may fail if we change the allocation
OL> method).

Hmm, that's rather unfortunate as it seems to make it messier.  I've
long wondered, why not have cr_write_obj() do the allocation (and
check) so that we can avoid the get and put in the caller?  I suppose
that introduces an additional copy, but it seems like it would make it
significantly more attractive.

OL> The 'h.parent' fields is gone.

Okay.  I need to re-base on your latest.  Sorry about that.

OL> If we plan to support multiple uts_ns, then it makes sense to save
OL> the 'objref' value. If you omit the 'objref' because you assume a
OL> single namespace for all for now, then you may also drop the loop
OL> on all tasks (below), for now.

<snip>

OL> I'm still unhappy with this loop. It isn't necessary now, since we
OL> assume and enforce a single, common namespace among all tasks. It
OL> is unlikely to be useful as is in the future because the format is
OL> likely to change anyway.  Even in the (userspace) restart part the
OL> code to read it is in the global section as opposed to per task
OL> section. Is there any reason to keep it ?

I guess I'm not sure why the format would change.  Rather, I would
expect it to look something quite like this when we do support nested
namespaces.  By having it there, it keeps mktree organized in a
similar way for when we do support it.

However, if you'd rather be very explicit about the unsupported-ness
of it, then I can just completely re-write it to reflect the naive
case.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

[parent not found: <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]

* Re: [PATCH] c/r: Add UTS support (v4)
       [not found]       ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-03-24 15:34         ` Oren Laadan
  0 siblings, 0 replies; 23+ messages in thread
From: Oren Laadan @ 2009-03-24 15:34 UTC (permalink / raw)
  To: Dan Smith
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	adobriyan-Re5JQEeQqe8AvxtiuMwx3w



Dan Smith wrote:
> I'm a little confused.  Haven't you already reviewed v4?  Perhaps you
> meant to reply to v5?

Uhhhhh... you are so right and I'm so confused ..
Sorry, ignore those comments please.

> 
> OL> Need to test whether cr_hbuf_get() succeeds (while now it's
> OL> trivial, in the future it may fail if we change the allocation
> OL> method).
> 
> Hmm, that's rather unfortunate as it seems to make it messier.  I've
> long wondered, why not have cr_write_obj() do the allocation (and
> check) so that we can avoid the get and put in the caller?  I suppose
> that introduces an additional copy, but it seems like it would make it
> significantly more attractive.
> 
> OL> The 'h.parent' fields is gone.
> 
> Okay.  I need to re-base on your latest.  Sorry about that.
> 
> OL> If we plan to support multiple uts_ns, then it makes sense to save
> OL> the 'objref' value. If you omit the 'objref' because you assume a
> OL> single namespace for all for now, then you may also drop the loop
> OL> on all tasks (below), for now.
> 
> <snip>
> 
> OL> I'm still unhappy with this loop. It isn't necessary now, since we
> OL> assume and enforce a single, common namespace among all tasks. It
> OL> is unlikely to be useful as is in the future because the format is
> OL> likely to change anyway.  Even in the (userspace) restart part the
> OL> code to read it is in the global section as opposed to per task
> OL> section. Is there any reason to keep it ?
> 
> I guess I'm not sure why the format would change.  Rather, I would
> expect it to look something quite like this when we do support nested
> namespaces.  By having it there, it keeps mktree organized in a
> similar way for when we do support it.

I'd expect it to be a per task information, as opposed to a "global"
part, similar to a session ID. I don't see how the restart of ns can
be done by the first task (in the "global" part) only. While we have
not implemented anything yet, it sounds to me that the right place
to do that will be per task.

In other words, it makes sense to save that information for each task,
but, I think, not as a separate and dedicated array only handled in
the "global" part.

Hence my suggestion that you do dump that information, but instead of
that loop, you add it to the existing array of tasks that is written,
and extend the 'struct cr_hdr_pids'. It's flexible to be used "global"
or per-task.

> However, if you'd rather be very explicit about the unsupported-ness
> of it, then I can just completely re-write it to reflect the naive
> case.

We're already explicit in the test for 'nxproxy != ctx->task->nxproxy'...

Oren.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-03-24 15:34 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-03-18 18:51 [PATCH] c/r: Add UTS support (v4) Dan Smith
     [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-19 22:26   ` Oren Laadan
     [not found]     ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-19 22:39       ` Dan Smith
     [not found]         ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-19 22:58           ` Oren Laadan
     [not found]             ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-19 23:13               ` Oren Laadan
2009-03-20 13:56                 ` Dan Smith
     [not found]                 ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 18:10                   ` Serge E. Hallyn
     [not found]                     ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-20 19:34                       ` Oren Laadan
     [not found]                         ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 19:59                           ` Dave Hansen
2009-03-20 20:48                             ` Oren Laadan
     [not found]                               ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 21:00                                 ` Dave Hansen
2009-03-20 21:26                                   ` Oren Laadan
     [not found]                                     ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 21:39                                       ` Dave Hansen
2009-03-20 21:58                                         ` Oren Laadan
2009-03-23 14:52                             ` Dan Smith
     [not found]                               ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-23 15:08                                 ` Serge E. Hallyn
2009-03-20 18:05               ` Serge E. Hallyn
     [not found]                 ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-20 19:38                   ` Oren Laadan
     [not found]                     ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 20:42                       ` Serge E. Hallyn
2009-03-19 22:28   ` Oren Laadan
2009-03-24 15:07   ` Oren Laadan
2009-03-24 15:21     ` Dan Smith
     [not found]       ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-24 15:34         ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.