* [PATCH] c/r: Add UTS support (v4)
@ 2009-03-18 18:51 Dan Smith
[not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 23+ messages in thread
From: Dan Smith @ 2009-03-18 18:51 UTC (permalink / raw)
To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: adobriyan-Re5JQEeQqe8AvxtiuMwx3w
This patch adds a "phase" of checkpoint that saves out information about any
namespaces the task(s) may have. Do this by tracking the namespace objects
of the tasks and making sure that tasks with the same namespace that follow
get properly referenced in the checkpoint stream. Note that for now, we
refuse to checkpoint if all tasks in the set don't share the same set of
*all* namespaces.
Restart is handled in userspace by reading the UTS record(s), calling
unshare() and setting the hostname accordingly. See my changes to
mktree.c for details.
I tested this with single and multiple task restore, on top of Oren's
v13 tree.
Changes:
- Remove the kernel restore path
- Punt on nested namespaces
- Use __NEW_UTS_LEN in nodename and domainname buffers
- Add a note to Documentation/checkpoint/internals.txt to indicate where
in the save/restore process the UTS information is kept
- Store (and track) the objref of the namespace itself instead of the
nsproxy (based on comments from Dave on IRC)
- Remove explicit check for non-root nsproxy
- Store the nodename and domainname lengths and use cr_write_string()
to store the actual name strings
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Cc: adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org
Cc: orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org
---
Documentation/checkpoint/internals.txt | 2 +
checkpoint/checkpoint.c | 80 ++++++++++++++++++++++++++++++++
checkpoint/objhash.c | 7 +++
checkpoint/restart.c | 1 +
include/linux/checkpoint.h | 1 +
include/linux/checkpoint_hdr.h | 14 ++++++
6 files changed, 105 insertions(+), 0 deletions(-)
diff --git a/Documentation/checkpoint/internals.txt b/Documentation/checkpoint/internals.txt
index b363e83..7a2488b 100644
--- a/Documentation/checkpoint/internals.txt
+++ b/Documentation/checkpoint/internals.txt
@@ -12,6 +12,8 @@ The order of operations, both save and restore, is as follows:
* Process forest: [TBD] tasks and their relationships
+* Namespace section: per-container namespace information
+
* Per task data (for each task):
-> task state: elements of task_struct
-> thread state: elements of thread_struct and thread_info
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 64155de..228bdae 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -193,6 +193,82 @@ static int cr_write_tail(struct cr_ctx *ctx)
return ret;
}
+static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t)
+{
+ struct cr_hdr h;
+ struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh));
+ struct new_utsname *n = &t->nsproxy->uts_ns->name;
+ int ret;
+
+ h.type = CR_HDR_UTSNS;
+ h.len = sizeof(*hh);
+ h.parent = 0;
+
+ hh->nodename_len = sizeof(n->nodename);
+ hh->domainname_len = sizeof(n->domainname);
+
+ ret = cr_write_obj(ctx, &h, hh);
+ if (ret < 0)
+ goto out;
+
+ ret = cr_write_string(ctx, n->nodename, hh->nodename_len);
+ if (ret < 0)
+ goto out;
+
+ ret = cr_write_string(ctx, n->domainname, hh->domainname_len);
+ out:
+ cr_hbuf_put(ctx, sizeof(*hh));
+
+ return ret;
+}
+
+static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t)
+{
+ struct cr_hdr h;
+ struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh));
+ struct nsproxy *nsp = t->nsproxy;
+ int ret;
+
+ h.type = CR_HDR_NS;
+ h.len = sizeof(*hh);
+ h.parent = 0;
+
+ hh->types = 0;
+
+ if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0))
+ hh->types |= CR_NS_UTS;
+
+ ret = cr_write_obj(ctx, &h, hh);
+ if (ret)
+ goto out;
+
+ if (hh->types & CR_NS_UTS) {
+ ret = cr_write_ns_uts(ctx, t);
+ if (ret < 0)
+ goto out;
+
+ /* FIXME: Write other namespaces here */
+ }
+ out:
+ cr_hbuf_put(ctx, sizeof(*hh));
+
+ return ret;
+}
+
+static int cr_write_all_namespaces(struct cr_ctx *ctx)
+{
+ int n, ret = 0;
+
+ for (n = 0; n < ctx->tasks_nr; n++) {
+ pr_debug("dumping ns for task #%d\n", n);
+ ret = cr_write_namespaces(ctx, ctx->tasks_arr[n]);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
/* dump the task_struct of a given task */
static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
{
@@ -549,6 +625,10 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
if (ret < 0)
goto out;
+ ret = cr_write_all_namespaces(ctx);
+ if (ret < 0)
+ goto out;
+
ret = cr_write_all_tasks(ctx);
if (ret < 0)
goto out;
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index ee31b38..afcf1d1 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -12,6 +12,7 @@
#include <linux/file.h>
#include <linux/hash.h>
#include <linux/checkpoint.h>
+#include <linux/utsname.h>
struct cr_objref {
int objref;
@@ -35,6 +36,9 @@ static void cr_obj_ref_drop(struct cr_objref *obj)
case CR_OBJ_FILE:
fput((struct file *) obj->ptr);
break;
+ case CR_OBJ_UTSNS:
+ put_uts_ns((struct uts_namespace *) obj->ptr);
+ break;
default:
BUG();
}
@@ -46,6 +50,9 @@ static void cr_obj_ref_grab(struct cr_objref *obj)
case CR_OBJ_FILE:
get_file((struct file *) obj->ptr);
break;
+ case CR_OBJ_UTSNS:
+ get_uts_ns((struct uts_namespace *) obj->ptr);
+ break;
default:
BUG();
}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 7ec4de4..0ed01aa 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -15,6 +15,7 @@
#include <linux/magic.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/utsname.h>
#include "checkpoint_arch.h"
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 217cf6e..02c2990 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -75,6 +75,7 @@ extern void cr_ctx_put(struct cr_ctx *ctx);
enum {
CR_OBJ_FILE = 1,
+ CR_OBJ_UTSNS,
CR_OBJ_MAX
};
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 6dc739f..886ab53 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -49,6 +49,8 @@ enum {
CR_HDR_TASK,
CR_HDR_THREAD,
CR_HDR_CPU,
+ CR_HDR_NS,
+ CR_HDR_UTSNS,
CR_HDR_MM = 201,
CR_HDR_VMA,
@@ -156,4 +158,16 @@ struct cr_hdr_fd_data {
__u64 f_version;
} __attribute__((aligned(8)));
+#define CR_NS_UTS 1
+
+struct cr_hdr_namespaces {
+ __u32 types; /* NS records that follow this */
+ __u32 uts_ref; /* Objref of matching UTS namespace */
+};
+
+struct cr_hdr_utsns {
+ __u32 nodename_len;
+ __u32 domainname_len;
+};
+
#endif /* _CHECKPOINT_CKPT_HDR_H_ */
--
1.5.6.3
^ permalink raw reply related [flat|nested] 23+ messages in thread[parent not found: <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-03-19 22:26 ` Oren Laadan [not found] ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 2009-03-19 22:28 ` Oren Laadan 2009-03-24 15:07 ` Oren Laadan 2 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-19 22:26 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w So what happens in the following scenario: * task A is the container init(1) * A calls fork() to create task B * B calls unshare(CLONE_NEWUTS) * B calls clone(CLONE_PARENT) to create task C Now C and B have the same UTS namespace, which differs from A's, but they will each have a different UTS namespace when restarted with this patch. This is part of a larger complexity related to how CLONE_PARENT works. Also related to how session IDs are inherited. Two approaches to solve this are: a) Identify, in mktree, that this was the case, and impose an order on the forks/clones to recreate the same dependency (an algorithm for this is described in [1]) b) Do it in the kernel: for each nsproxy (identified by an objref) the first task that has it will create it during restart, in or out of the kernel, and the next task will simply attach to the existing one that will be deposited in the objhash. Oren. [1] "Transparent Checkpoint/Restart of Multiple Processes on Commodity Operating Systems", http://www1.cs.columbia.edu/~orenl/papers/usenix07-checkpoint.pdf Dan Smith wrote: > This patch adds a "phase" of checkpoint that saves out information about any > namespaces the task(s) may have. Do this by tracking the namespace objects > of the tasks and making sure that tasks with the same namespace that follow > get properly referenced in the checkpoint stream. Note that for now, we > refuse to checkpoint if all tasks in the set don't share the same set of > *all* namespaces. > > Restart is handled in userspace by reading the UTS record(s), calling > unshare() and setting the hostname accordingly. See my changes to > mktree.c for details. > > I tested this with single and multiple task restore, on top of Oren's > v13 tree. [...] ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-19 22:39 ` Dan Smith [not found] ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Dan Smith @ 2009-03-19 22:39 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w OL> So what happens in the following scenario: OL> * task A is the container init(1) OL> * A calls fork() to create task B OL> * B calls unshare(CLONE_NEWUTS) OL> * B calls clone(CLONE_PARENT) to create task C In the previous version of the patch, I failed the checkpoint if this was the case by making sure that all tasks in the set had the same nsproxy. You said in IRC that this was already done elsewhere in the infrastructure, but now that I look I don't see that anywhere. The check I had was in response to Daniel's comments about avoiding the situation for the time being by making sure that all the tasks had the same set of namespaces (i.e. the same nsproxy at the time of checkpoint). OL> Two approaches to solve this are: OL> a) Identify, in mktree, that this was the case, and impose an OL> order on the forks/clones to recreate the same dependency (an OL> algorithm for this is described in [1]) OL> b) Do it in the kernel: for each nsproxy (identified by an objref) OL> the first task that has it will create it during restart, in or OL> out of the kernel, and the next task will simply attach to the OL> existing one that will be deposited in the objhash. I think that prior discussion led to the conclusion that simplicity wins for the moment, but if you want to solve it now I can cook up some changes. -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-03-19 22:58 ` Oren Laadan [not found] ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-19 22:58 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dan Smith wrote: > OL> So what happens in the following scenario: > > OL> * task A is the container init(1) > OL> * A calls fork() to create task B > OL> * B calls unshare(CLONE_NEWUTS) > OL> * B calls clone(CLONE_PARENT) to create task C > > In the previous version of the patch, I failed the checkpoint if this > was the case by making sure that all tasks in the set had the same > nsproxy. You said in IRC that this was already done elsewhere in the > infrastructure, but now that I look I don't see that anywhere. > in cr_may_checkpoint_task(): 285 /* FIXME: change this for nested containers */ 286 if (task_nsproxy(t) != ctx->root_nsproxy) 287 return -EPERM; > The check I had was in response to Daniel's comments about avoiding > the situation for the time being by making sure that all the tasks had > the same set of namespaces (i.e. the same nsproxy at the time of > checkpoint). > > OL> Two approaches to solve this are: > > OL> a) Identify, in mktree, that this was the case, and impose an > OL> order on the forks/clones to recreate the same dependency (an > OL> algorithm for this is described in [1]) > > OL> b) Do it in the kernel: for each nsproxy (identified by an objref) > OL> the first task that has it will create it during restart, in or > OL> out of the kernel, and the next task will simply attach to the > OL> existing one that will be deposited in the objhash. > > I think that prior discussion led to the conclusion that simplicity > wins for the moment, but if you want to solve it now I can cook up > some changes. > If we keep the assumption, for simplicity, that all tasks share the same namespace, then the checkpoint code should check, once, how that nsproxy differs from the container's parent (except for the obvious pidns). If it does differ, e.g. in uts, then the checkpoint should save the uts state _once_ - as in global data. Restart will restore the state also _once_, for the init of the container (the first task restored), _before_ it forks the rest of the tree. Otherwise, we don't get the same outcome. Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-19 23:13 ` Oren Laadan 2009-03-20 13:56 ` Dan Smith [not found] ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 2009-03-20 18:05 ` Serge E. Hallyn 1 sibling, 2 replies; 23+ messages in thread From: Oren Laadan @ 2009-03-19 23:13 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Oren Laadan wrote: > > Dan Smith wrote: >> OL> So what happens in the following scenario: >> >> OL> * task A is the container init(1) >> OL> * A calls fork() to create task B >> OL> * B calls unshare(CLONE_NEWUTS) >> OL> * B calls clone(CLONE_PARENT) to create task C >> >> In the previous version of the patch, I failed the checkpoint if this >> was the case by making sure that all tasks in the set had the same >> nsproxy. You said in IRC that this was already done elsewhere in the >> infrastructure, but now that I look I don't see that anywhere. >> > > in cr_may_checkpoint_task(): > > 285 /* FIXME: change this for nested containers */ > 286 if (task_nsproxy(t) != ctx->root_nsproxy) > 287 return -EPERM; > >> The check I had was in response to Daniel's comments about avoiding >> the situation for the time being by making sure that all the tasks had >> the same set of namespaces (i.e. the same nsproxy at the time of >> checkpoint). >> >> OL> Two approaches to solve this are: >> >> OL> a) Identify, in mktree, that this was the case, and impose an >> OL> order on the forks/clones to recreate the same dependency (an >> OL> algorithm for this is described in [1]) >> >> OL> b) Do it in the kernel: for each nsproxy (identified by an objref) >> OL> the first task that has it will create it during restart, in or >> OL> out of the kernel, and the next task will simply attach to the >> OL> existing one that will be deposited in the objhash. >> >> I think that prior discussion led to the conclusion that simplicity >> wins for the moment, but if you want to solve it now I can cook up >> some changes. >> > > If we keep the assumption, for simplicity, that all tasks share the > same namespace, then the checkpoint code should check, once, how that > nsproxy differs from the container's parent (except for the obvious > pidns). > > If it does differ, e.g. in uts, then the checkpoint should save the > uts state _once_ - as in global data. Restart will restore the state > also _once_, for the init of the container (the first task restored), > _before_ it forks the rest of the tree. > > Otherwise, we don't get the same outcome. ... I re-read the code to make sure,so - You indeed do it before all tasks are forked, so that's correct. What got me confused was that you loop over all tasks, which is not needed because was assume they all share the name nsproxy; And in restart, you unshare() many times by the same task, so all but the last unshare() are useless. In other words, I wonder what is the need for that loop over all processes. Here is a suggestion for a simple change that is likely to be a step towards more generic solution in the future: The nsprox is a property of a task, and it is (possibly) shared. We can put the data either on the pids_arr or on the cr_hdr_task itself. For simplicity (and to work with your scheme) let's assume the former. We can extend the pids_arr to have a ns_objref field, that will hold the objref of the nxproxy. Of course, now, all pids_arr will have the same objref, or else ... This data will follow the pids_arr data in the image. During checkpoint, we read the pids_arr from the image, and then for each objref of an nsproxy that is seen for the first time, we read the state of that nsproxy and restore a new one. (In our simple case, there will always be exactly one). Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-19 23:13 ` Oren Laadan @ 2009-03-20 13:56 ` Dan Smith [not found] ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 1 sibling, 0 replies; 23+ messages in thread From: Dan Smith @ 2009-03-20 13:56 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w, Dave Hansen OL> What got me confused was that you loop over all tasks, which is OL> not needed because was assume they all share the name nsproxy; And OL> in restart, you unshare() many times by the same task, so all but OL> the last unshare() are useless. In other words, I wonder what is OL> the need for that loop over all processes. You're exactly right, but this wasn't my intent. It was left over from the first iteration of the patch. OL> Here is a suggestion for a simple change that is likely to be a step OL> towards more generic solution in the future: OL> The nsprox is a property of a task, and it is (possibly) shared. We OL> can put the data either on the pids_arr or on the cr_hdr_task itself. OL> For simplicity (and to work with your scheme) let's assume the former. OL> We can extend the pids_arr to have a ns_objref field, that will hold OL> the objref of the nxproxy. Of course, now, all pids_arr will have the OL> same objref, or else ... This data will follow the pids_arr data in OL> the image. OL> During checkpoint, we read the pids_arr from the image, and then for OL> each objref of an nsproxy that is seen for the first time, we read OL> the state of that nsproxy and restore a new one. (In our simple case, OL> there will always be exactly one). Storing an objref of the nsproxy for each task to track changes at that level is what I did, and is the reason for the gratuitous unshare() that is left there. The idea was only to unshare() when I encountered a new nsproxy objref. However, Dave had some comments on this, which he made only on IRC. Dave do you want to document them here for the benefit of the list? -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-20 18:10 ` Serge E. Hallyn [not found] ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Serge E. Hallyn @ 2009-03-20 18:10 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): > What got me confused was that you loop over all tasks, which is not > needed because was assume they all share the name nsproxy; And in > restart, you unshare() many times by the same task, so all but the > last unshare() are useless. In other words, I wonder what is the > need for that loop over all processes. > > Here is a suggestion for a simple change that is likely to be a step > towards more generic solution in the future: > > The nsprox is a property of a task, and it is (possibly) shared. We > can put the data either on the pids_arr or on the cr_hdr_task itself. > For simplicity (and to work with your scheme) let's assume the former. > > We can extend the pids_arr to have a ns_objref field, that will hold > the objref of the nxproxy. Of course, now, all pids_arr will have the > same objref, or else ... This data will follow the pids_arr data in > the image. > > During checkpoint, we read the pids_arr from the image, and then for > each objref of an nsproxy that is seen for the first time, we read > the state of that nsproxy and restore a new one. (In our simple case, > there will always be exactly one). The nsproxy is not the right thing to record. Rather, it should record a bitmap of namespaces which are to be private from the parent task. Then for each private ns, an optional section with configuration info. (maybe that is waht you meant by 'recording the nsproxy') -serge ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-03-20 19:34 ` Oren Laadan [not found] ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-20 19:34 UTC (permalink / raw) To: Serge E. Hallyn Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Serge E. Hallyn wrote: > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): >> What got me confused was that you loop over all tasks, which is not >> needed because was assume they all share the name nsproxy; And in >> restart, you unshare() many times by the same task, so all but the >> last unshare() are useless. In other words, I wonder what is the >> need for that loop over all processes. >> >> Here is a suggestion for a simple change that is likely to be a step >> towards more generic solution in the future: >> >> The nsprox is a property of a task, and it is (possibly) shared. We >> can put the data either on the pids_arr or on the cr_hdr_task itself. >> For simplicity (and to work with your scheme) let's assume the former. >> >> We can extend the pids_arr to have a ns_objref field, that will hold >> the objref of the nxproxy. Of course, now, all pids_arr will have the >> same objref, or else ... This data will follow the pids_arr data in >> the image. >> >> During checkpoint, we read the pids_arr from the image, and then for >> each objref of an nsproxy that is seen for the first time, we read >> the state of that nsproxy and restore a new one. (In our simple case, >> there will always be exactly one). > > The nsproxy is not the right thing to record. Rather, it > should record a bitmap of namespaces which are to be private > from the parent task. Then for each private ns, an optional > section with configuration info. Rethinking, I agree that the nsproxy is not the right thing to record. On the other hand, a bitmap is insufficient to expose the relationships of sharing. Putting aside the pidns for a second, I'd think that all other ns's may be relative easy to handle, even nested. Let me try to explain: 1) An nsproxy is a shared resource, so it gets an nsproxy_objref 2) Each ns in an nsproxy is itself shared (analoguous to file and then inode, in pipes), so each ns also gets an ns_objref. 3) In checkpoint, for each task we'll do: if (nsproxy seen first time) { alloc nsproxy_objref for each ns in nsproxy { if (ns seen first time) { alloc ns_objref save state of ns } else { save existing ns_objref } } } else { save existing nsproxy_objref } 4) In restart, we'll do the same, but also creating the ns's and the nsproxy's Just as we can restore a file in one process and use the point for another process, we could pull the same trick to assign nsproxy's to processes, and to assign ns's to nsproxy's. The only caveat that is left, and which is also a constraint on the process, is the nspid - which, in a sense, is a property of the nsproxy. If we are to restore those in user space, then the nspid will eventually dictate, I believe, the order of forks when creating the tasks, just as the session id may be a constraint. I do need to think it further. Oren. > (maybe that is waht you meant by 'recording the nsproxy') > > -serge > ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-20 19:59 ` Dave Hansen 2009-03-20 20:48 ` Oren Laadan 2009-03-23 14:52 ` Dan Smith 0 siblings, 2 replies; 23+ messages in thread From: Dave Hansen @ 2009-03-20 19:59 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w On Fri, 2009-03-20 at 15:34 -0400, Oren Laadan wrote: > > 3) In checkpoint, for each task we'll do: > if (nsproxy seen first time) { > alloc nsproxy_objref > for each ns in nsproxy { > if (ns seen first time) { > alloc ns_objref > save state of ns > } else { > save existing ns_objref > } > } > } else { > save existing nsproxy_objref > } The problem with this is that the nsproxy is in no way, shape, or form exposed to userspace. It is wholly an internal implementation detail. Take out all the nsproxy bits you have above, and do this for each task: > for each ns in nsproxy { > if (ns seen first time) { > alloc ns_objref > save state of ns > } else { > save existing ns_objref > } > } And it will still _function_ *exactly* the same. It won't be quite as cache or space compact, but that's fine. If you're worried about this extra space or cache impact, it can be fixed up at restart with: if (nsproxy_equal(tsk->nsproxy, parent->nsproxy)) { get_nsproxy(parent->nsproxy); put_nsproxy(tsk->nsproxy); tsk->nsproxy = parent->nsproxy; } Voila. It won't be perfect. If the parent doesn't work we could also try the siblings. -- Dave ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-20 19:59 ` Dave Hansen @ 2009-03-20 20:48 ` Oren Laadan [not found] ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 2009-03-23 14:52 ` Dan Smith 1 sibling, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-20 20:48 UTC (permalink / raw) To: Dave Hansen Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dave Hansen wrote: > On Fri, 2009-03-20 at 15:34 -0400, Oren Laadan wrote: >> 3) In checkpoint, for each task we'll do: >> if (nsproxy seen first time) { >> alloc nsproxy_objref >> for each ns in nsproxy { >> if (ns seen first time) { >> alloc ns_objref >> save state of ns >> } else { >> save existing ns_objref >> } >> } >> } else { >> save existing nsproxy_objref >> } > > The problem with this is that the nsproxy is in no way, shape, or form > exposed to userspace. It is wholly an internal implementation detail. > > Take out all the nsproxy bits you have above, and do this for each task: > >> for each ns in nsproxy { >> if (ns seen first time) { >> alloc ns_objref >> save state of ns >> } else { >> save existing ns_objref >> } >> } > > And it will still _function_ *exactly* the same. It won't be quite as > cache or space compact, but that's fine. > > If you're worried about this extra space or cache impact, it can be > fixed up at restart with: > > if (nsproxy_equal(tsk->nsproxy, parent->nsproxy)) { > get_nsproxy(parent->nsproxy); > put_nsproxy(tsk->nsproxy); > tsk->nsproxy = parent->nsproxy; > } > > Voila. It won't be perfect. If the parent doesn't work we could also > try the siblings. Yup. Does that scale well with many (1000's) of tasks ? (The motivation to expose 'nsproxy_objref' to user space was to allow user-space to decide on a sequence of clones/unshared that will create an equivalent process tree with space-efficient nsproxy's). Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref' to know a-priori that it is common, and save the "compare-and-swap" phase altogether. Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-20 21:00 ` Dave Hansen 2009-03-20 21:26 ` Oren Laadan 0 siblings, 1 reply; 23+ messages in thread From: Dave Hansen @ 2009-03-20 21:00 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote: > Does that scale well with many (1000's) of tasks ? > > (The motivation to expose 'nsproxy_objref' to user space was to allow > user-space to decide on a sequence of clones/unshared that will create > an equivalent process tree with space-efficient nsproxy's). OK, so you're saying that there's no way for userspace to tell that a set of tasks share an nsproxy other than exporting that nsproxy? > Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref' > to know a-priori that it is common, and save the "compare-and-swap" > phase altogether. Sure, that's workable. The important thing is that it is an optimization that can be left for later. -- Dave ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-20 21:00 ` Dave Hansen @ 2009-03-20 21:26 ` Oren Laadan [not found] ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-20 21:26 UTC (permalink / raw) To: Dave Hansen Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dave Hansen wrote: > On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote: >> Does that scale well with many (1000's) of tasks ? >> >> (The motivation to expose 'nsproxy_objref' to user space was to allow >> user-space to decide on a sequence of clones/unshared that will create >> an equivalent process tree with space-efficient nsproxy's). > > OK, so you're saying that there's no way for userspace to tell that a > set of tasks share an nsproxy other than exporting that nsproxy? I don't think so. Maybe the namespaces people know better. Oren. > >> Perhaps instead of nsproxy_equal() we could use an 'nsproxy_objref' >> to know a-priori that it is common, and save the "compare-and-swap" >> phase altogether. > > Sure, that's workable. The important thing is that it is an > optimization that can be left for later. > > -- Dave > > ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-20 21:39 ` Dave Hansen 2009-03-20 21:58 ` Oren Laadan 0 siblings, 1 reply; 23+ messages in thread From: Dave Hansen @ 2009-03-20 21:39 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w On Fri, 2009-03-20 at 17:26 -0400, Oren Laadan wrote: > Dave Hansen wrote: > > On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote: > >> Does that scale well with many (1000's) of tasks ? > >> > >> (The motivation to expose 'nsproxy_objref' to user space was to allow > >> user-space to decide on a sequence of clones/unshared that will create > >> an equivalent process tree with space-efficient nsproxy's). > > > > OK, so you're saying that there's no way for userspace to tell that a > > set of tasks share an nsproxy other than exporting that nsproxy? > > I don't think so. Maybe the namespaces people know better. Please go look at the code. Two processes share an nsproxy (or *can* share an nsproxy) when all of the namespace pointers are the same. After allocation and initialization, we basically don't ever write to the nsproxy. We just hook it into a task and run with it. If anything ever unshares one of the nsproxy namespaces, before writing to it we *first* create a new nsproxy which is a copy of the old one. We basically open-code copy-on-write semantics for the nsproxy. This means that it is safe for any two tasks that share all of the pointers in the nsproxy to share an nsproxy itself. -- Dave ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-20 21:39 ` Dave Hansen @ 2009-03-20 21:58 ` Oren Laadan 0 siblings, 0 replies; 23+ messages in thread From: Oren Laadan @ 2009-03-20 21:58 UTC (permalink / raw) To: Dave Hansen Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dave Hansen wrote: > On Fri, 2009-03-20 at 17:26 -0400, Oren Laadan wrote: >> Dave Hansen wrote: >>> On Fri, 2009-03-20 at 16:48 -0400, Oren Laadan wrote: >>>> Does that scale well with many (1000's) of tasks ? >>>> >>>> (The motivation to expose 'nsproxy_objref' to user space was to allow >>>> user-space to decide on a sequence of clones/unshared that will create >>>> an equivalent process tree with space-efficient nsproxy's). >>> OK, so you're saying that there's no way for userspace to tell that a >>> set of tasks share an nsproxy other than exporting that nsproxy? >> I don't think so. Maybe the namespaces people know better. > > Please go look at the code. Heheh .. read: "I don't think so" as in "I don't think tasks in user space can tell whether or not they share an nsproxy in the kernel." (and the corollary follows ...) > > Two processes share an nsproxy (or *can* share an nsproxy) when all of > the namespace pointers are the same. After allocation and > initialization, we basically don't ever write to the nsproxy. We just > hook it into a task and run with it. > > If anything ever unshares one of the nsproxy namespaces, before writing > to it we *first* create a new nsproxy which is a copy of the old one. > > We basically open-code copy-on-write semantics for the nsproxy. > > This means that it is safe for any two tasks that share all of the > pointers in the nsproxy to share an nsproxy itself. > > -- Dave ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-20 19:59 ` Dave Hansen 2009-03-20 20:48 ` Oren Laadan @ 2009-03-23 14:52 ` Dan Smith [not found] ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 1 sibling, 1 reply; 23+ messages in thread From: Dan Smith @ 2009-03-23 14:52 UTC (permalink / raw) To: Dave Hansen Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w DH> for each ns in nsproxy { DH> if (ns seen first time) { DH> alloc ns_objref DH> save state of ns DH> } else { DH> save existing ns_objref DH> } DH> } Isn't this precisely what I have in the latest patch? Since there is only one supported namespace type, perhaps the for loop is hard to recognize... The bitfield serves only to describe which *new* namespace records follow the general one, but the objref of each (supported) namespace is included in the initial record. Now that I think of it, I can do away with the bitfield to avoid confusion. -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-03-23 15:08 ` Serge E. Hallyn 0 siblings, 0 replies; 23+ messages in thread From: Serge E. Hallyn @ 2009-03-23 15:08 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w, Dave Hansen Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org): > DH> for each ns in nsproxy { > DH> if (ns seen first time) { > DH> alloc ns_objref > DH> save state of ns > DH> } else { > DH> save existing ns_objref > DH> } > DH> } > > Isn't this precisely what I have in the latest patch? Since there is > only one supported namespace type, perhaps the for loop is hard to > recognize... Yes, I still think your patch is good... > The bitfield serves only to describe which *new* namespace records > follow the general one, but the objref of each (supported) namespace > is included in the initial record. Now that I think of it, I can do > away with the bitfield to avoid confusion. How about you tweak both Nathan and Oren (who I think both have IPC c/r patchsets) by adding dummy ipc c/r support. Just define CR_OBJ_IPCNS, make cr_wrte_ns_ipc() empty and call it from cr_write_namespaces(). -serge ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 2009-03-19 23:13 ` Oren Laadan @ 2009-03-20 18:05 ` Serge E. Hallyn [not found] ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 23+ messages in thread From: Serge E. Hallyn @ 2009-03-20 18:05 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): > > > Dan Smith wrote: > > OL> So what happens in the following scenario: > > > > OL> * task A is the container init(1) > > OL> * A calls fork() to create task B > > OL> * B calls unshare(CLONE_NEWUTS) > > OL> * B calls clone(CLONE_PARENT) to create task C > > > > In the previous version of the patch, I failed the checkpoint if this > > was the case by making sure that all tasks in the set had the same > > nsproxy. You said in IRC that this was already done elsewhere in the > > infrastructure, but now that I look I don't see that anywhere. > > > > in cr_may_checkpoint_task(): > > 285 /* FIXME: change this for nested containers */ > 286 if (task_nsproxy(t) != ctx->root_nsproxy) > 287 return -EPERM; > > > The check I had was in response to Daniel's comments about avoiding > > the situation for the time being by making sure that all the tasks had > > the same set of namespaces (i.e. the same nsproxy at the time of > > checkpoint). > > > > OL> Two approaches to solve this are: > > > > OL> a) Identify, in mktree, that this was the case, and impose an > > OL> order on the forks/clones to recreate the same dependency (an > > OL> algorithm for this is described in [1]) > > > > OL> b) Do it in the kernel: for each nsproxy (identified by an objref) > > OL> the first task that has it will create it during restart, in or > > OL> out of the kernel, and the next task will simply attach to the > > OL> existing one that will be deposited in the objhash. > > > > I think that prior discussion led to the conclusion that simplicity > > wins for the moment, but if you want to solve it now I can cook up > > some changes. > > > > If we keep the assumption, for simplicity, that all tasks share the > same namespace, then the checkpoint code should check, once, how that > nsproxy differs from the container's parent (except for the obvious > pidns). I disagree. Whether the container had its own utsns doesn't affect whether it should have a private utsns on restart. > If it does differ, e.g. in uts, then the checkpoint should save the > uts state _once_ - as in global data. Restart will restore the state > also _once_, for the init of the container (the first task restored), > _before_ it forks the rest of the tree. > > Otherwise, we don't get the same outcome. Again I disagree. If we were planning on never supporting nested uts namespaces it woudl be fine, but what you are talking about is making sure we have to break the checkpoint format later to support nested namespaces. Rather, we should do: 1. record the hostname for the container in global data. 2. The restart program can decide whether to honor the global checkpoint image hostname or not. It can either use a command line option, or check whether the recorded hostname is different from the restart host. I prefer the former. 3. for each task, leave an optional spot for hostname. If there is a hostname, then it will unshare(CLONE_NEWUTS) and set its hostname before calling sys_restart() or cloning any child tasks. -serge ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-03-20 19:38 ` Oren Laadan [not found] ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-20 19:38 UTC (permalink / raw) To: Serge E. Hallyn Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Serge E. Hallyn wrote: > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): >> >> Dan Smith wrote: >>> OL> So what happens in the following scenario: >>> >>> OL> * task A is the container init(1) >>> OL> * A calls fork() to create task B >>> OL> * B calls unshare(CLONE_NEWUTS) >>> OL> * B calls clone(CLONE_PARENT) to create task C >>> >>> In the previous version of the patch, I failed the checkpoint if this >>> was the case by making sure that all tasks in the set had the same >>> nsproxy. You said in IRC that this was already done elsewhere in the >>> infrastructure, but now that I look I don't see that anywhere. >>> >> in cr_may_checkpoint_task(): >> >> 285 /* FIXME: change this for nested containers */ >> 286 if (task_nsproxy(t) != ctx->root_nsproxy) >> 287 return -EPERM; >> >>> The check I had was in response to Daniel's comments about avoiding >>> the situation for the time being by making sure that all the tasks had >>> the same set of namespaces (i.e. the same nsproxy at the time of >>> checkpoint). >>> >>> OL> Two approaches to solve this are: >>> >>> OL> a) Identify, in mktree, that this was the case, and impose an >>> OL> order on the forks/clones to recreate the same dependency (an >>> OL> algorithm for this is described in [1]) >>> >>> OL> b) Do it in the kernel: for each nsproxy (identified by an objref) >>> OL> the first task that has it will create it during restart, in or >>> OL> out of the kernel, and the next task will simply attach to the >>> OL> existing one that will be deposited in the objhash. >>> >>> I think that prior discussion led to the conclusion that simplicity >>> wins for the moment, but if you want to solve it now I can cook up >>> some changes. >>> >> If we keep the assumption, for simplicity, that all tasks share the >> same namespace, then the checkpoint code should check, once, how that >> nsproxy differs from the container's parent (except for the obvious >> pidns). > > I disagree. Whether the container had its own utsns doesn't > affect whether it should have a private utsns on restart. Right, I missed that... >> If it does differ, e.g. in uts, then the checkpoint should save the >> uts state _once_ - as in global data. Restart will restore the state >> also _once_, for the init of the container (the first task restored), >> _before_ it forks the rest of the tree. >> >> Otherwise, we don't get the same outcome. > > Again I disagree. If we were planning on never supporting nested > uts namespaces it woudl be fine, but what you are talking about > is making sure we have to break the checkpoint format later to support > nested namespaces. We don't know how we are to support nested namespaces. So either we solve it now, or we do something that is bound to break later. The image format is going to change anyways as we move along. > > Rather, we should do: > > 1. record the hostname for the container in global data. > 2. The restart program can decide whether to honor the global > checkpoint image hostname or not. It can either use a > command line option, or check whether the recorded hostname > is different from the restart host. I prefer the former. Sounds good. > 3. for each task, leave an optional spot for hostname. If > there is a hostname, then it will unshare(CLONE_NEWUTS) > and set its hostname before calling sys_restart() or > cloning any child tasks. Doesn't this imply a a specific format that is bound to break later ? Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-03-20 20:42 ` Serge E. Hallyn 0 siblings, 0 replies; 23+ messages in thread From: Serge E. Hallyn @ 2009-03-20 20:42 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): > > > Serge E. Hallyn wrote: > > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): > >> > >> Dan Smith wrote: > >>> OL> So what happens in the following scenario: > >>> > >>> OL> * task A is the container init(1) > >>> OL> * A calls fork() to create task B > >>> OL> * B calls unshare(CLONE_NEWUTS) > >>> OL> * B calls clone(CLONE_PARENT) to create task C > >>> > >>> In the previous version of the patch, I failed the checkpoint if this > >>> was the case by making sure that all tasks in the set had the same > >>> nsproxy. You said in IRC that this was already done elsewhere in the > >>> infrastructure, but now that I look I don't see that anywhere. > >>> > >> in cr_may_checkpoint_task(): > >> > >> 285 /* FIXME: change this for nested containers */ > >> 286 if (task_nsproxy(t) != ctx->root_nsproxy) > >> 287 return -EPERM; > >> > >>> The check I had was in response to Daniel's comments about avoiding > >>> the situation for the time being by making sure that all the tasks had > >>> the same set of namespaces (i.e. the same nsproxy at the time of > >>> checkpoint). > >>> > >>> OL> Two approaches to solve this are: > >>> > >>> OL> a) Identify, in mktree, that this was the case, and impose an > >>> OL> order on the forks/clones to recreate the same dependency (an > >>> OL> algorithm for this is described in [1]) > >>> > >>> OL> b) Do it in the kernel: for each nsproxy (identified by an objref) > >>> OL> the first task that has it will create it during restart, in or > >>> OL> out of the kernel, and the next task will simply attach to the > >>> OL> existing one that will be deposited in the objhash. > >>> > >>> I think that prior discussion led to the conclusion that simplicity > >>> wins for the moment, but if you want to solve it now I can cook up > >>> some changes. > >>> > >> If we keep the assumption, for simplicity, that all tasks share the > >> same namespace, then the checkpoint code should check, once, how that > >> nsproxy differs from the container's parent (except for the obvious > >> pidns). > > > > I disagree. Whether the container had its own utsns doesn't > > affect whether it should have a private utsns on restart. > > Right, I missed that... > > >> If it does differ, e.g. in uts, then the checkpoint should save the > >> uts state _once_ - as in global data. Restart will restore the state > >> also _once_, for the init of the container (the first task restored), > >> _before_ it forks the rest of the tree. > >> > >> Otherwise, we don't get the same outcome. > > > > Again I disagree. If we were planning on never supporting nested > > uts namespaces it woudl be fine, but what you are talking about > > is making sure we have to break the checkpoint format later to support > > nested namespaces. > > We don't know how we are to support nested namespaces. So either we solve > it now, or we do something that is bound to break later. The image format > is going to change anyways as we move along. > > > > > Rather, we should do: > > > > 1. record the hostname for the container in global data. > > 2. The restart program can decide whether to honor the global > > checkpoint image hostname or not. It can either use a > > command line option, or check whether the recorded hostname > > is different from the restart host. I prefer the former. > > Sounds good. > > > 3. for each task, leave an optional spot for hostname. If > > there is a hostname, then it will unshare(CLONE_NEWUTS) > > and set its hostname before calling sys_restart() or > > cloning any child tasks. > > Doesn't this imply a a specific format that is bound to break later ? Not if we don't specify a format for the optional record now. We do of course need to pick a spot for it now, and as Dan noticed, that should be above the actual task layout so that the info can be easily accessed by mktree.c before calling sys_restart()... But what the heck, like you're saying let's leave step 3 for later :) thanks, -serge ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-03-19 22:26 ` Oren Laadan @ 2009-03-19 22:28 ` Oren Laadan 2009-03-24 15:07 ` Oren Laadan 2 siblings, 0 replies; 23+ messages in thread From: Oren Laadan @ 2009-03-19 22:28 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dan Smith wrote: > This patch adds a "phase" of checkpoint that saves out information about any > namespaces the task(s) may have. Do this by tracking the namespace objects > of the tasks and making sure that tasks with the same namespace that follow > get properly referenced in the checkpoint stream. Note that for now, we > refuse to checkpoint if all tasks in the set don't share the same set of > *all* namespaces. > > Restart is handled in userspace by reading the UTS record(s), calling > unshare() and setting the hostname accordingly. See my changes to > mktree.c for details. > > I tested this with single and multiple task restore, on top of Oren's > v13 tree. > > Changes: > - Remove the kernel restore path > - Punt on nested namespaces > - Use __NEW_UTS_LEN in nodename and domainname buffers > - Add a note to Documentation/checkpoint/internals.txt to indicate where > in the save/restore process the UTS information is kept > - Store (and track) the objref of the namespace itself instead of the > nsproxy (based on comments from Dave on IRC) > - Remove explicit check for non-root nsproxy > - Store the nodename and domainname lengths and use cr_write_string() > to store the actual name strings > [...] > +static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t) > +{ > + struct cr_hdr h; > + struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct nsproxy *nsp = t->nsproxy; > + int ret; > + > + h.type = CR_HDR_NS; > + h.len = sizeof(*hh); > + h.parent = 0; > + > + hh->types = 0; > + > + if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0)) > + hh->types |= CR_NS_UTS; cr_obj_add_ptr() may fail with an error (e.g. -ENOMEM). > + > + ret = cr_write_obj(ctx, &h, hh); > + if (ret) > + goto out; > + > + if (hh->types & CR_NS_UTS) { > + ret = cr_write_ns_uts(ctx, t); > + if (ret < 0) > + goto out; > + > + /* FIXME: Write other namespaces here */ > + } > + out: > + cr_hbuf_put(ctx, sizeof(*hh)); > + > + return ret; > +} > + [...] Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-03-19 22:26 ` Oren Laadan 2009-03-19 22:28 ` Oren Laadan @ 2009-03-24 15:07 ` Oren Laadan 2009-03-24 15:21 ` Dan Smith 2 siblings, 1 reply; 23+ messages in thread From: Oren Laadan @ 2009-03-24 15:07 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dan, You might want to look at ckpt-v14-rc1 which has some cleanups, see below. Dan Smith wrote: > This patch adds a "phase" of checkpoint that saves out information about any > namespaces the task(s) may have. Do this by tracking the namespace objects > of the tasks and making sure that tasks with the same namespace that follow > get properly referenced in the checkpoint stream. Note that for now, we > refuse to checkpoint if all tasks in the set don't share the same set of > *all* namespaces. > > Restart is handled in userspace by reading the UTS record(s), calling > unshare() and setting the hostname accordingly. See my changes to > mktree.c for details. [...] > +static int cr_write_ns_uts(struct cr_ctx *ctx, struct task_struct *t) > +{ > + struct cr_hdr h; > + struct cr_hdr_utsns *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct new_utsname *n = &t->nsproxy->uts_ns->name; > + int ret; Need to test whether cr_hbuf_get() succeeds (while now it's trivial, in the future it may fail if we change the allocation method). > + > + h.type = CR_HDR_UTSNS; > + h.len = sizeof(*hh); > + h.parent = 0; The 'h.parent' fields is gone. > + > + hh->nodename_len = sizeof(n->nodename); > + hh->domainname_len = sizeof(n->domainname); > + > + ret = cr_write_obj(ctx, &h, hh); > + if (ret < 0) > + goto out; > + > + ret = cr_write_string(ctx, n->nodename, hh->nodename_len); > + if (ret < 0) > + goto out; > + > + ret = cr_write_string(ctx, n->domainname, hh->domainname_len); > + out: > + cr_hbuf_put(ctx, sizeof(*hh)); > + > + return ret; > +} > + > +static int cr_write_namespaces(struct cr_ctx *ctx, struct task_struct *t) > +{ > + struct cr_hdr h; > + struct cr_hdr_namespaces *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct nsproxy *nsp = t->nsproxy; > + int ret; > + > + h.type = CR_HDR_NS; > + h.len = sizeof(*hh); > + h.parent = 0; Here too. > + > + hh->types = 0; > + > + if (cr_obj_add_ptr(ctx, nsp->uts_ns, &hh->uts_ref, CR_OBJ_UTSNS, 0)) > + hh->types |= CR_NS_UTS; cr_obj_add_ptr() mail fail, with -ENOMEM for instance. If we plan to support multiple uts_ns, then it makes sense to save the 'objref' value. If you omit the 'objref' because you assume a single namespace for all for now, then you may also drop the loop on all tasks (below), for now. > + > + ret = cr_write_obj(ctx, &h, hh); > + if (ret) > + goto out; > + > + if (hh->types & CR_NS_UTS) { > + ret = cr_write_ns_uts(ctx, t); > + if (ret < 0) > + goto out; > + > + /* FIXME: Write other namespaces here */ > + } > + out: > + cr_hbuf_put(ctx, sizeof(*hh)); > + > + return ret; > +} > + > +static int cr_write_all_namespaces(struct cr_ctx *ctx) > +{ > + int n, ret = 0; > + > + for (n = 0; n < ctx->tasks_nr; n++) { > + pr_debug("dumping ns for task #%d\n", n); I changed this back to cr_debug() in response to a comment about pr_fmt. > + ret = cr_write_namespaces(ctx, ctx->tasks_arr[n]); > + if (ret < 0) > + break; > + } I'm still unhappy with this loop. It isn't necessary now, since we assume and enforce a single, common namespace among all tasks. It is unlikely to be useful as is in the future because the format is likely to change anyway. Even in the (userspace) restart part the code to read it is in the global section as opposed to per task section. Is there any reason to keep it ? If you are to dump the namespace information of each task, why not add it to an already existing table of tasks - the pids_arr (cr_hdr_pids) ? > + > + return ret; > +} > + [...] Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] c/r: Add UTS support (v4) 2009-03-24 15:07 ` Oren Laadan @ 2009-03-24 15:21 ` Dan Smith [not found] ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 23+ messages in thread From: Dan Smith @ 2009-03-24 15:21 UTC (permalink / raw) To: Oren Laadan Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w I'm a little confused. Haven't you already reviewed v4? Perhaps you meant to reply to v5? OL> Need to test whether cr_hbuf_get() succeeds (while now it's OL> trivial, in the future it may fail if we change the allocation OL> method). Hmm, that's rather unfortunate as it seems to make it messier. I've long wondered, why not have cr_write_obj() do the allocation (and check) so that we can avoid the get and put in the caller? I suppose that introduces an additional copy, but it seems like it would make it significantly more attractive. OL> The 'h.parent' fields is gone. Okay. I need to re-base on your latest. Sorry about that. OL> If we plan to support multiple uts_ns, then it makes sense to save OL> the 'objref' value. If you omit the 'objref' because you assume a OL> single namespace for all for now, then you may also drop the loop OL> on all tasks (below), for now. <snip> OL> I'm still unhappy with this loop. It isn't necessary now, since we OL> assume and enforce a single, common namespace among all tasks. It OL> is unlikely to be useful as is in the future because the format is OL> likely to change anyway. Even in the (userspace) restart part the OL> code to read it is in the global section as opposed to per task OL> section. Is there any reason to keep it ? I guess I'm not sure why the format would change. Rather, I would expect it to look something quite like this when we do support nested namespaces. By having it there, it keeps mktree organized in a similar way for when we do support it. However, if you'd rather be very explicit about the unsupported-ness of it, then I can just completely re-write it to reflect the naive case. -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 23+ messages in thread
[parent not found: <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add UTS support (v4) [not found] ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-03-24 15:34 ` Oren Laadan 0 siblings, 0 replies; 23+ messages in thread From: Oren Laadan @ 2009-03-24 15:34 UTC (permalink / raw) To: Dan Smith Cc: containers-qjLDD68F18O7TbgM5vRIOg, adobriyan-Re5JQEeQqe8AvxtiuMwx3w Dan Smith wrote: > I'm a little confused. Haven't you already reviewed v4? Perhaps you > meant to reply to v5? Uhhhhh... you are so right and I'm so confused .. Sorry, ignore those comments please. > > OL> Need to test whether cr_hbuf_get() succeeds (while now it's > OL> trivial, in the future it may fail if we change the allocation > OL> method). > > Hmm, that's rather unfortunate as it seems to make it messier. I've > long wondered, why not have cr_write_obj() do the allocation (and > check) so that we can avoid the get and put in the caller? I suppose > that introduces an additional copy, but it seems like it would make it > significantly more attractive. > > OL> The 'h.parent' fields is gone. > > Okay. I need to re-base on your latest. Sorry about that. > > OL> If we plan to support multiple uts_ns, then it makes sense to save > OL> the 'objref' value. If you omit the 'objref' because you assume a > OL> single namespace for all for now, then you may also drop the loop > OL> on all tasks (below), for now. > > <snip> > > OL> I'm still unhappy with this loop. It isn't necessary now, since we > OL> assume and enforce a single, common namespace among all tasks. It > OL> is unlikely to be useful as is in the future because the format is > OL> likely to change anyway. Even in the (userspace) restart part the > OL> code to read it is in the global section as opposed to per task > OL> section. Is there any reason to keep it ? > > I guess I'm not sure why the format would change. Rather, I would > expect it to look something quite like this when we do support nested > namespaces. By having it there, it keeps mktree organized in a > similar way for when we do support it. I'd expect it to be a per task information, as opposed to a "global" part, similar to a session ID. I don't see how the restart of ns can be done by the first task (in the "global" part) only. While we have not implemented anything yet, it sounds to me that the right place to do that will be per task. In other words, it makes sense to save that information for each task, but, I think, not as a separate and dedicated array only handled in the "global" part. Hence my suggestion that you do dump that information, but instead of that loop, you add it to the existing array of tasks that is written, and extend the 'struct cr_hdr_pids'. It's flexible to be used "global" or per-task. > However, if you'd rather be very explicit about the unsupported-ness > of it, then I can just completely re-write it to reflect the naive > case. We're already explicit in the test for 'nxproxy != ctx->task->nxproxy'... Oren. ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2009-03-24 15:34 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-03-18 18:51 [PATCH] c/r: Add UTS support (v4) Dan Smith
[not found] ` <1237402291-28812-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-19 22:26 ` Oren Laadan
[not found] ` <49C2C686.2060806-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-19 22:39 ` Dan Smith
[not found] ` <871vstdtn1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-19 22:58 ` Oren Laadan
[not found] ` <49C2CDFA.4010907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-19 23:13 ` Oren Laadan
2009-03-20 13:56 ` Dan Smith
[not found] ` <49C2D183.8040905-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 18:10 ` Serge E. Hallyn
[not found] ` <20090320181043.GB8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-20 19:34 ` Oren Laadan
[not found] ` <49C3EFAF.9030706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 19:59 ` Dave Hansen
2009-03-20 20:48 ` Oren Laadan
[not found] ` <49C4011C.1050707-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 21:00 ` Dave Hansen
2009-03-20 21:26 ` Oren Laadan
[not found] ` <49C40A23.6080708-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 21:39 ` Dave Hansen
2009-03-20 21:58 ` Oren Laadan
2009-03-23 14:52 ` Dan Smith
[not found] ` <87eiwoi956.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-23 15:08 ` Serge E. Hallyn
2009-03-20 18:05 ` Serge E. Hallyn
[not found] ` <20090320180506.GA8380-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-20 19:38 ` Oren Laadan
[not found] ` <49C3F0A9.9080703-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-20 20:42 ` Serge E. Hallyn
2009-03-19 22:28 ` Oren Laadan
2009-03-24 15:07 ` Oren Laadan
2009-03-24 15:21 ` Dan Smith
[not found] ` <87d4c7gd54.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-03-24 15:34 ` Oren Laadan
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.