* [PATCH RFC] c/r mounts ns. Except not really.
@ 2009-12-15 2:39 Serge E. Hallyn
[not found] ` <20091215023945.GA12100-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 4+ messages in thread
From: Serge E. Hallyn @ 2009-12-15 2:39 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
We only c/r a mounts ns with objref 0, meaning inherit the existing
mounts ns. We do intend to implement c/r of mounts and mounts
namespaces in the kernel. It shouldn't be ugly or complicate locking
to do so. Just haven't gotten around to it.
Why did I bother with this? Because we can't re-create private
mounts yet, and while I don't expect trouble doing so, I think
it's more than we want to take on now for v19. But I'd like
as much as possible for everything which we don't support, to not
be checkpointable, since not doing so has in the past invited
slanderous accusations of being a toy implementation :)
Comments?
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
checkpoint/objhash.c | 26 ++++++++++++++++++++++++++
fs/namespace.c | 35 +++++++++++++++++++++++++++++++++++
include/linux/checkpoint.h | 2 +-
include/linux/checkpoint_hdr.h | 10 ++++++++++
include/linux/mnt_namespace.h | 8 ++++++++
kernel/nsproxy.c | 33 +++++++++++++++++++++++++++++++--
6 files changed, 111 insertions(+), 3 deletions(-)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 20fd3e9..b379809 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
#include <linux/kref.h>
#include <linux/ipc_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
#include <net/sock.h>
@@ -236,6 +237,22 @@ static void obj_groupinfo_drop(void *ptr, int lastref)
put_group_info((struct group_info *) ptr);
}
+static int obj_mntns_grab(void *ptr)
+{
+ get_mnt_ns((struct mnt_namespace *) ptr);
+ return 0;
+}
+
+static void obj_mntns_drop(void *ptr, int lastref)
+{
+ put_mnt_ns((struct mnt_namespace *) ptr);
+}
+
+static int obj_mntns_users(void *ptr)
+{
+ return atomic_read(&((struct mnt_namespace *) ptr)->count);
+}
+
static int obj_sock_grab(void *ptr)
{
sock_hold((struct sock *) ptr);
@@ -500,6 +517,15 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
.checkpoint = checkpoint_lsm_string,
.restore = restore_lsm_string_wrap,
},
+ {
+ .obj_name = "MOUNTS NS",
+ .obj_type = CKPT_OBJ_MNT_NS,
+ .ref_grab = obj_mntns_grab,
+ .ref_drop = obj_mntns_drop,
+ .ref_users = obj_mntns_users,
+ .checkpoint = checkpoint_mnt_ns,
+ .restore = restore_mnt_ns,
+ },
};
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4..3a61197 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/idr.h>
#include <linux/fs_struct.h>
+#include <linux/checkpoint.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -2323,3 +2324,37 @@ void put_mnt_ns(struct mnt_namespace *ns)
kfree(ns);
}
EXPORT_SYMBOL(put_mnt_ns);
+
+#ifdef CONFIG_CHECKPOINT
+int checkpoint_mounts_ns(struct ckpt_ctx *ctx, struct mnt_namespace *mnt_ns)
+{
+ if (mnt_ns == ctx->root_nsproxy->mnt_ns)
+ return 0;
+ return checkpoint_obj(ctx, mnt_ns, CKPT_OBJ_MNT_NS);
+}
+/*
+ * c/r with private mounts namespaces is unsupported. We'll need
+ * to also support c/r of mounts to meaningfully do so.
+ */
+static int do_checkpoint_mnt_ns(struct ckpt_ctx *ctx,
+ struct mnt_namespace *mnt_ns)
+{
+ return -EINVAL;
+}
+
+int checkpoint_mnt_ns(struct ckpt_ctx *ctx, void *ptr)
+{
+ return do_checkpoint_mnt_ns(ctx, (struct mnt_namespace *) ptr);
+}
+
+static struct mnt_namespace *do_restore_mnt_ns(struct ckpt_ctx *ctx)
+{
+ ckpt_err(ctx, -EINVAL, "private mounts namespace unsupported\n");
+ return ERR_PTR(-EINVAL);
+}
+
+void *restore_mnt_ns(struct ckpt_ctx *ctx)
+{
+ return (void *) do_restore_mnt_ns(ctx);
+}
+#endif
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 92d47fa..93bdbdd 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f07e527..cdc6531 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -117,6 +117,8 @@ enum {
#define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
CKPT_HDR_TASK_CREDS,
#define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+ CKPT_HDR_MNT_NS,
+#define CKPT_HDR_MNT_NS CKPT_HDR_MNT_NS
/* 201-299: reserved for arch-dependent */
@@ -246,6 +248,8 @@ enum obj_type {
#define CKPT_OBJ_SECURITY_PTR CKPT_OBJ_SECURITY_PTR
CKPT_OBJ_SECURITY,
#define CKPT_OBJ_SECURITY CKPT_OBJ_SECURITY
+ CKPT_OBJ_MNT_NS,
+#define CKPT_OBJ_MNT_NS CKPT_OBJ_MNT_NS
CKPT_OBJ_MAX
#define CKPT_OBJ_MAX CKPT_OBJ_MAX
};
@@ -428,10 +432,16 @@ struct ckpt_hdr_task_ns {
__s32 ns_objref;
} __attribute__((aligned(8)));
+/*
+ * mnt_objref is 0 if shared with ctx->root_task->parent's mntns
+ * Come up with a better name?
+ */
+#define CKPT_MNT_NS_INHERIT 0
struct ckpt_hdr_ns {
struct ckpt_hdr h;
__s32 uts_objref;
__s32 ipc_objref;
+ __s32 mnt_objref;
} __attribute__((aligned(8)));
/* cannot include <linux/tty.h> from userspace, so define: */
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index d74785c..ad242bb 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -32,6 +32,14 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
atomic_inc(&ns->count);
}
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+extern void *restore_mnt_ns(struct ckpt_ctx *ctx);
+extern int checkpoint_mnt_ns(struct ckpt_ctx *ctx, void *ptr);
+extern int checkpoint_mounts_ns(struct ckpt_ctx *ctx,
+ struct mnt_namespace *mnt_ns);
+#endif
+
extern const struct seq_operations mounts_op;
extern const struct seq_operations mountinfo_op;
extern const struct seq_operations mountstats_op;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index e7aaa00..c91b725 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -282,6 +282,13 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
goto out;
h->ipc_objref = ret;
+ ret = checkpoint_mounts_ns(ctx, nsproxy->mnt_ns);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)process mounts ns\n");
+ goto out;
+ }
+ h->mnt_objref = ret;
+
/* TODO: Write other namespaces here */
ret = ckpt_write_obj(ctx, &h->h);
@@ -302,6 +309,7 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
struct nsproxy *nsproxy = NULL;
struct uts_namespace *uts_ns;
struct ipc_namespace *ipc_ns;
+ struct mnt_namespace *mnt_ns = NULL;
int ret;
h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NS);
@@ -323,6 +331,25 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
ret = PTR_ERR(ipc_ns);
goto out;
}
+ if (h->mnt_objref == CKPT_MNT_NS_INHERIT)
+ /* inherit from root task.
+ * XXX And right here we come upon a dilemma. Note that by
+ * doing that, we wouldn't allow userspace to recreate mounts
+ * namespace and mounts between being cloned() and running
+ * sys_restart().
+ *
+ * SO, we can claim that if userspace does nothing fancy, then
+ * current->nsproxy->mnt_ns() will be root task mnt_ns. If
+ * userspace does, then it clearly wants to set up mounts, so
+ * we let it.
+ */
+ mnt_ns = current->nsproxy->mnt_ns;
+ else
+ mnt_ns = ckpt_obj_fetch(ctx, h->mnt_objref, CKPT_OBJ_MNT_NS);
+ if (IS_ERR(mnt_ns)) {
+ ret = PTR_ERR(mnt_ns);
+ goto out;
+ }
#if defined(COFNIG_UTS_NS) || defined(CONFIG_IPC_NS)
ret = -ENOMEM;
@@ -335,10 +362,9 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
get_ipc_ns(ipc_ns);
nsproxy->ipc_ns = ipc_ns;
+ /* unsupported namespaces: */
get_pid_ns(current->nsproxy->pid_ns);
nsproxy->pid_ns = current->nsproxy->pid_ns;
- get_mnt_ns(current->nsproxy->mnt_ns);
- nsproxy->mnt_ns = current->nsproxy->mnt_ns;
get_net(current->nsproxy->net_ns);
nsproxy->net_ns = current->nsproxy->net_ns;
#else
@@ -349,6 +375,9 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
BUG_ON(nsproxy->ipc_ns != ipc_ns);
#endif
+ get_mnt_ns(mnt_ns);
+ nsproxy->mnt_ns = mnt_ns;
+
/* TODO: add more namespaces here */
ret = 0;
out:
--
1.6.0.4
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH RFC] c/r mounts ns. Except not really.
[not found] ` <20091215023945.GA12100-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-12-15 20:19 ` Serge E. Hallyn
[not found] ` <20091215201928.GA20037-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 4+ messages in thread
From: Serge E. Hallyn @ 2009-12-15 20:19 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
Quoting Serge E. Hallyn (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> We only c/r a mounts ns with objref 0, meaning inherit the existing
> mounts ns. We do intend to implement c/r of mounts and mounts
> namespaces in the kernel. It shouldn't be ugly or complicate locking
> to do so. Just haven't gotten around to it.
>
> Why did I bother with this? Because we can't re-create private
> mounts yet, and while I don't expect trouble doing so, I think
> it's more than we want to take on now for v19. But I'd like
> as much as possible for everything which we don't support, to not
> be checkpointable, since not doing so has in the past invited
> slanderous accusations of being a toy implementation :)
>
> Comments?
>
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
> checkpoint/objhash.c | 26 ++++++++++++++++++++++++++
> fs/namespace.c | 35 +++++++++++++++++++++++++++++++++++
> include/linux/checkpoint.h | 2 +-
> include/linux/checkpoint_hdr.h | 10 ++++++++++
> include/linux/mnt_namespace.h | 8 ++++++++
> kernel/nsproxy.c | 33 +++++++++++++++++++++++++++++++--
> 6 files changed, 111 insertions(+), 3 deletions(-)
>
> diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
> index 20fd3e9..b379809 100644
> --- a/checkpoint/objhash.c
> +++ b/checkpoint/objhash.c
> @@ -19,6 +19,7 @@
> #include <linux/kref.h>
> #include <linux/ipc_namespace.h>
> #include <linux/user_namespace.h>
> +#include <linux/mnt_namespace.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> #include <net/sock.h>
> @@ -236,6 +237,22 @@ static void obj_groupinfo_drop(void *ptr, int lastref)
> put_group_info((struct group_info *) ptr);
> }
>
> +static int obj_mntns_grab(void *ptr)
> +{
> + get_mnt_ns((struct mnt_namespace *) ptr);
> + return 0;
> +}
> +
> +static void obj_mntns_drop(void *ptr, int lastref)
> +{
> + put_mnt_ns((struct mnt_namespace *) ptr);
> +}
> +
> +static int obj_mntns_users(void *ptr)
> +{
> + return atomic_read(&((struct mnt_namespace *) ptr)->count);
> +}
> +
> static int obj_sock_grab(void *ptr)
> {
> sock_hold((struct sock *) ptr);
> @@ -500,6 +517,15 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
> .checkpoint = checkpoint_lsm_string,
> .restore = restore_lsm_string_wrap,
> },
> + {
> + .obj_name = "MOUNTS NS",
> + .obj_type = CKPT_OBJ_MNT_NS,
> + .ref_grab = obj_mntns_grab,
> + .ref_drop = obj_mntns_drop,
> + .ref_users = obj_mntns_users,
> + .checkpoint = checkpoint_mnt_ns,
> + .restore = restore_mnt_ns,
> + },
> };
>
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index bdc3cb4..3a61197 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -29,6 +29,7 @@
> #include <linux/log2.h>
> #include <linux/idr.h>
> #include <linux/fs_struct.h>
> +#include <linux/checkpoint.h>
> #include <asm/uaccess.h>
> #include <asm/unistd.h>
> #include "pnode.h"
> @@ -2323,3 +2324,37 @@ void put_mnt_ns(struct mnt_namespace *ns)
> kfree(ns);
> }
> EXPORT_SYMBOL(put_mnt_ns);
> +
> +#ifdef CONFIG_CHECKPOINT
> +int checkpoint_mounts_ns(struct ckpt_ctx *ctx, struct mnt_namespace *mnt_ns)
> +{
> + if (mnt_ns == ctx->root_nsproxy->mnt_ns)
> + return 0;
Technically note that this is not even correct. If the container
being checkpointed had a mnt_ns private from its parents, then
it could have its own mounts which are not checkpointed (and therefore
can't even be generically restored by userspace). So really this should
be returning 0 (which should be CKPT_MNT_NS_INHERIT) only if mnt_ns
== ctx->root_task->parent->nsproxy->mnt_ns (requiring more careful
dereferencing since we don't have it pinned).
Whatever we do here for v19, I intend to do the same thing for
network devices for v19. The question is, are we being unprofessional,
or are we being useful+flexible, by simply ignoring these things at
checkpoint when we know we won't restore them?
> + return checkpoint_obj(ctx, mnt_ns, CKPT_OBJ_MNT_NS);
> +}
> +/*
> + * c/r with private mounts namespaces is unsupported. We'll need
> + * to also support c/r of mounts to meaningfully do so.
> + */
> +static int do_checkpoint_mnt_ns(struct ckpt_ctx *ctx,
> + struct mnt_namespace *mnt_ns)
> +{
> + return -EINVAL;
> +}
> +
> +int checkpoint_mnt_ns(struct ckpt_ctx *ctx, void *ptr)
> +{
> + return do_checkpoint_mnt_ns(ctx, (struct mnt_namespace *) ptr);
> +}
> +
> +static struct mnt_namespace *do_restore_mnt_ns(struct ckpt_ctx *ctx)
> +{
> + ckpt_err(ctx, -EINVAL, "private mounts namespace unsupported\n");
> + return ERR_PTR(-EINVAL);
> +}
> +
> +void *restore_mnt_ns(struct ckpt_ctx *ctx)
> +{
> + return (void *) do_restore_mnt_ns(ctx);
> +}
> +#endif
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 92d47fa..93bdbdd 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,7 +10,7 @@
> * distribution for more details.
> */
>
> -#define CHECKPOINT_VERSION 5
> +#define CHECKPOINT_VERSION 6
>
> /* checkpoint user flags */
> #define CHECKPOINT_SUBTREE 0x1
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index f07e527..cdc6531 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -117,6 +117,8 @@ enum {
> #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
> CKPT_HDR_TASK_CREDS,
> #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
> + CKPT_HDR_MNT_NS,
> +#define CKPT_HDR_MNT_NS CKPT_HDR_MNT_NS
>
> /* 201-299: reserved for arch-dependent */
>
> @@ -246,6 +248,8 @@ enum obj_type {
> #define CKPT_OBJ_SECURITY_PTR CKPT_OBJ_SECURITY_PTR
> CKPT_OBJ_SECURITY,
> #define CKPT_OBJ_SECURITY CKPT_OBJ_SECURITY
> + CKPT_OBJ_MNT_NS,
> +#define CKPT_OBJ_MNT_NS CKPT_OBJ_MNT_NS
> CKPT_OBJ_MAX
> #define CKPT_OBJ_MAX CKPT_OBJ_MAX
> };
> @@ -428,10 +432,16 @@ struct ckpt_hdr_task_ns {
> __s32 ns_objref;
> } __attribute__((aligned(8)));
>
> +/*
> + * mnt_objref is 0 if shared with ctx->root_task->parent's mntns
> + * Come up with a better name?
> + */
> +#define CKPT_MNT_NS_INHERIT 0
> struct ckpt_hdr_ns {
> struct ckpt_hdr h;
> __s32 uts_objref;
> __s32 ipc_objref;
> + __s32 mnt_objref;
> } __attribute__((aligned(8)));
>
> /* cannot include <linux/tty.h> from userspace, so define: */
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index d74785c..ad242bb 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -32,6 +32,14 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
> atomic_inc(&ns->count);
> }
>
> +#ifdef CONFIG_CHECKPOINT
> +struct ckpt_ctx;
> +extern void *restore_mnt_ns(struct ckpt_ctx *ctx);
> +extern int checkpoint_mnt_ns(struct ckpt_ctx *ctx, void *ptr);
> +extern int checkpoint_mounts_ns(struct ckpt_ctx *ctx,
> + struct mnt_namespace *mnt_ns);
> +#endif
> +
> extern const struct seq_operations mounts_op;
> extern const struct seq_operations mountinfo_op;
> extern const struct seq_operations mountstats_op;
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index e7aaa00..c91b725 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -282,6 +282,13 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
> goto out;
> h->ipc_objref = ret;
>
> + ret = checkpoint_mounts_ns(ctx, nsproxy->mnt_ns);
> + if (ret < 0) {
> + ckpt_err(ctx, ret, "%(T)process mounts ns\n");
> + goto out;
> + }
> + h->mnt_objref = ret;
> +
> /* TODO: Write other namespaces here */
>
> ret = ckpt_write_obj(ctx, &h->h);
> @@ -302,6 +309,7 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> struct nsproxy *nsproxy = NULL;
> struct uts_namespace *uts_ns;
> struct ipc_namespace *ipc_ns;
> + struct mnt_namespace *mnt_ns = NULL;
> int ret;
>
> h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NS);
> @@ -323,6 +331,25 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> ret = PTR_ERR(ipc_ns);
> goto out;
> }
> + if (h->mnt_objref == CKPT_MNT_NS_INHERIT)
> + /* inherit from root task.
> + * XXX And right here we come upon a dilemma. Note that by
> + * doing that, we wouldn't allow userspace to recreate mounts
> + * namespace and mounts between being cloned() and running
> + * sys_restart().
> + *
> + * SO, we can claim that if userspace does nothing fancy, then
> + * current->nsproxy->mnt_ns() will be root task mnt_ns. If
> + * userspace does, then it clearly wants to set up mounts, so
> + * we let it.
> + */
> + mnt_ns = current->nsproxy->mnt_ns;
> + else
> + mnt_ns = ckpt_obj_fetch(ctx, h->mnt_objref, CKPT_OBJ_MNT_NS);
> + if (IS_ERR(mnt_ns)) {
> + ret = PTR_ERR(mnt_ns);
> + goto out;
> + }
>
> #if defined(COFNIG_UTS_NS) || defined(CONFIG_IPC_NS)
> ret = -ENOMEM;
> @@ -335,10 +362,9 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> get_ipc_ns(ipc_ns);
> nsproxy->ipc_ns = ipc_ns;
>
> + /* unsupported namespaces: */
> get_pid_ns(current->nsproxy->pid_ns);
> nsproxy->pid_ns = current->nsproxy->pid_ns;
> - get_mnt_ns(current->nsproxy->mnt_ns);
> - nsproxy->mnt_ns = current->nsproxy->mnt_ns;
> get_net(current->nsproxy->net_ns);
> nsproxy->net_ns = current->nsproxy->net_ns;
> #else
> @@ -349,6 +375,9 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> BUG_ON(nsproxy->ipc_ns != ipc_ns);
> #endif
>
> + get_mnt_ns(mnt_ns);
> + nsproxy->mnt_ns = mnt_ns;
> +
> /* TODO: add more namespaces here */
> ret = 0;
> out:
> --
> 1.6.0.4
>
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RFC] c/r mounts ns. Except not really.
[not found] ` <20091215201928.GA20037-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-12-23 1:18 ` Oren Laadan
[not found] ` <4B317003.2060706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 4+ messages in thread
From: Oren Laadan @ 2009-12-23 1:18 UTC (permalink / raw)
To: Serge E. Hallyn; +Cc: Linux Containers
Serge E. Hallyn wrote:
> Quoting Serge E. Hallyn (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
>> We only c/r a mounts ns with objref 0, meaning inherit the existing
>> mounts ns. We do intend to implement c/r of mounts and mounts
>> namespaces in the kernel. It shouldn't be ugly or complicate locking
>> to do so. Just haven't gotten around to it.
>>
>> Why did I bother with this? Because we can't re-create private
>> mounts yet, and while I don't expect trouble doing so, I think
>> it's more than we want to take on now for v19. But I'd like
>> as much as possible for everything which we don't support, to not
>> be checkpointable, since not doing so has in the past invited
>> slanderous accusations of being a toy implementation :)
>>
>> Comments?
This seems to me like the proper intermediary step.
>>
>> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
[...]
>> @@ -2323,3 +2324,37 @@ void put_mnt_ns(struct mnt_namespace *ns)
>> kfree(ns);
>> }
>> EXPORT_SYMBOL(put_mnt_ns);
>> +
>> +#ifdef CONFIG_CHECKPOINT
>> +int checkpoint_mounts_ns(struct ckpt_ctx *ctx, struct mnt_namespace *mnt_ns)
>> +{
>> + if (mnt_ns == ctx->root_nsproxy->mnt_ns)
>> + return 0;
As you mention below, this should be CKPT_MNT_NS_INHERIT.
>
> Technically note that this is not even correct. If the container
> being checkpointed had a mnt_ns private from its parents, then
> it could have its own mounts which are not checkpointed (and therefore
> can't even be generically restored by userspace). So really this should
> be returning 0 (which should be CKPT_MNT_NS_INHERIT) only if mnt_ns
> == ctx->root_task->parent->nsproxy->mnt_ns (requiring more careful
> dereferencing since we don't have it pinned).
>
> Whatever we do here for v19, I intend to do the same thing for
> network devices for v19. The question is, are we being unprofessional,
> or are we being useful+flexible, by simply ignoring these things at
> checkpoint when we know we won't restore them?
IIRC we agreed that so-called "external" mounts - whatever is mounted
at the root of the container is the responsibility of userspace to have
prepared properly before restart.
So I don't see an issue with ignoring these at checkpoint, with one
exception: leak detection in full-container checkpoint should complain
if the root container fs is shared with parent (above container) tasks.
I think about it this way: at checkpoint, we care about container (or
checkpoint) root and below. Then at restart, we should decide whether
the root task (of the restore) should inherit its initial FS, or
whether it should start with a private one.
Recall that we had/have this dilemma also with UTS namespace, and with
IPC namespace.
Here is one way to do it: (consider ipc-ns) at checkpoint, for the
kernel will compare every task's ipc-ns to the container's _parent_'s
ipc-ns, and if equal will save CKPT_OBJ_NS_INHERIT. If --inherit-ipc
is given, it will not the state of the parent's namespace, otherwise
it will.
At restart, if --inherit-ipc then whenever the kernel sees objref
that is CKPT_OBJ_NS_INHERIT, it will use the container's parent's
ipc-ns. Otherwise, it will create a new, private namespace for it.
(Also, with --inherit-ipc, if there is namespace state following
the CKPT_OBJ_NS_INHERIT, it should skip it without using it).
Oren.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RFC] c/r mounts ns. Except not really.
[not found] ` <4B317003.2060706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-12-28 15:27 ` Serge E. Hallyn
0 siblings, 0 replies; 4+ messages in thread
From: Serge E. Hallyn @ 2009-12-28 15:27 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>
>
> Serge E. Hallyn wrote:
> > Quoting Serge E. Hallyn (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> >> We only c/r a mounts ns with objref 0, meaning inherit the existing
> >> mounts ns. We do intend to implement c/r of mounts and mounts
> >> namespaces in the kernel. It shouldn't be ugly or complicate locking
> >> to do so. Just haven't gotten around to it.
> >>
> >> Why did I bother with this? Because we can't re-create private
> >> mounts yet, and while I don't expect trouble doing so, I think
> >> it's more than we want to take on now for v19. But I'd like
> >> as much as possible for everything which we don't support, to not
> >> be checkpointable, since not doing so has in the past invited
> >> slanderous accusations of being a toy implementation :)
> >>
> >> Comments?
>
> This seems to me like the proper intermediary step.
>
> >>
> >> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>
> [...]
>
> >> @@ -2323,3 +2324,37 @@ void put_mnt_ns(struct mnt_namespace *ns)
> >> kfree(ns);
> >> }
> >> EXPORT_SYMBOL(put_mnt_ns);
> >> +
> >> +#ifdef CONFIG_CHECKPOINT
> >> +int checkpoint_mounts_ns(struct ckpt_ctx *ctx, struct mnt_namespace *mnt_ns)
> >> +{
> >> + if (mnt_ns == ctx->root_nsproxy->mnt_ns)
> >> + return 0;
>
> As you mention below, this should be CKPT_MNT_NS_INHERIT.
>
> >
> > Technically note that this is not even correct. If the container
> > being checkpointed had a mnt_ns private from its parents, then
> > it could have its own mounts which are not checkpointed (and therefore
> > can't even be generically restored by userspace). So really this should
> > be returning 0 (which should be CKPT_MNT_NS_INHERIT) only if mnt_ns
> > == ctx->root_task->parent->nsproxy->mnt_ns (requiring more careful
> > dereferencing since we don't have it pinned).
> >
> > Whatever we do here for v19, I intend to do the same thing for
> > network devices for v19. The question is, are we being unprofessional,
> > or are we being useful+flexible, by simply ignoring these things at
> > checkpoint when we know we won't restore them?
>
> IIRC we agreed that so-called "external" mounts - whatever is mounted
> at the root of the container is the responsibility of userspace to have
> prepared properly before restart.
Right, but your description (and my code) misrepresent "external"
mounts. External mounts are those which are shared with the *parent*
of the container init. If the container init has a private namespace,
then things mounted at the root of the container can in fact be
internal mounts.
> So I don't see an issue with ignoring these at checkpoint, with one
> exception: leak detection in full-container checkpoint should complain
> if the root container fs is shared with parent (above container) tasks.
>
> I think about it this way: at checkpoint, we care about container (or
> checkpoint) root and below. Then at restart, we should decide whether
> the root task (of the restore) should inherit its initial FS, or
> whether it should start with a private one.
As a specific example: let's say container root did:
unshare(CLONE_NEWNS)
mkdir privtmp
mount --bind privtmp /tmp
With the code I have, and with what you describe above - or put another
way, if we define "whatever is mounted at the root of the container" as
an external mount - then we won't reproduce this mount at restart.
> Recall that we had/have this dilemma also with UTS namespace, and with
> IPC namespace.
I don't think we can draw many comparisons with those, because those
namespaces are fully isolated. The mounts namespaces have all sorts
of funky relationships: not only mounts which were inherited from
the parent ns at copy time vs. new private mounts, but also slave
and shared mounts. Heck, if a stupid application started on a system
where /var/spool/mail was bind-mounted to /var/mail does:
unshare(CLONE_NEWNS)
umount /var/mail
mount --bind /var/spool/mail
Then (a) will we be able to tell that the umount/mount happened vs
whether it didn't happen, and (b) do we care? Now yes, we actually
don't care in that case, but the point is that we don't have any real
way to compare two mountpoints between mounts namespaces. We may have
to 'color' mounts at mnt_ns clone, so that at checkpoint we can
distinguish private mounts by a unique color.
(I'm not certain, but I think we could produce a case which we couldn't
detect but which we do care about by doing something like:
unshare(CLONE_NEWNS)
umount /var/mail
mount --bind /var/spool /var/spool
mount --bind /var/spool/mail /var/mail
now a simple dentry-based duplicate mount detection algorithm might
think the /var/spool/mail->/var/mail mount was the same as the
original?)
> Here is one way to do it: (consider ipc-ns) at checkpoint, for the
> kernel will compare every task's ipc-ns to the container's _parent_'s
> ipc-ns, and if equal will save CKPT_OBJ_NS_INHERIT. If --inherit-ipc
> is given, it will not the state of the parent's namespace, otherwise
> it will.
>
> At restart, if --inherit-ipc then whenever the kernel sees objref
> that is CKPT_OBJ_NS_INHERIT, it will use the container's parent's
> ipc-ns. Otherwise, it will create a new, private namespace for it.
> (Also, with --inherit-ipc, if there is namespace state following
> the CKPT_OBJ_NS_INHERIT, it should skip it without using it).
>
> Oren.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2009-12-28 15:27 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-15 2:39 [PATCH RFC] c/r mounts ns. Except not really Serge E. Hallyn
[not found] ` <20091215023945.GA12100-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-12-15 20:19 ` Serge E. Hallyn
[not found] ` <20091215201928.GA20037-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-12-23 1:18 ` Oren Laadan
[not found] ` <4B317003.2060706-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-12-28 15:27 ` Serge E. Hallyn
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.