* [PATCH v21 002/100] eclone (2/11): Have alloc_pidmap() return actual error code
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 004/100] eclone (4/11): Add target_pids parameter to alloc_pid() Oren Laadan
` (7 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-s390, linux-api, containers, x86, linux-kernel,
linuxppc-dev, Matt Helsley, Serge Hallyn, Sukadev Bhattiprolu,
Pavel Emelyanov
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
alloc_pidmap() can fail either because all pid numbers are in use or
because memory allocation failed. With support for setting a specific
pid number, alloc_pidmap() would also fail if either the given pid
number is invalid or in use.
Rather than have callers assume -ENOMEM, have alloc_pidmap() return
the actual error.
Changelog[v1]:
- [Oren Laadan] Rebase to kernel 2.6.33
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Reviewed-by: Oren Laadan <orenl@cs.columbia.edu>
---
kernel/fork.c | 5 +++--
kernel/pid.c | 10 ++++++----
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791..afdfb08 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1147,10 +1147,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns);
- if (!pid)
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
+ }
if (clone_flags & CLONE_NEWPID) {
retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
diff --git a/kernel/pid.c b/kernel/pid.c
index 52a371a..8330488 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -160,7 +160,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page))
if (alloc_pidmap_page(map) < 0)
- break;
+ return -ENOMEM;
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
@@ -191,7 +191,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
}
pid = mk_pid(pid_ns, map, offset);
}
- return -1;
+ return -EBUSY;
}
int next_pidmap(struct pid_namespace *pid_ns, int last)
@@ -260,8 +260,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
- if (!pid)
+ if (!pid) {
+ pid = ERR_PTR(-ENOMEM);
goto out;
+ }
tmp = ns;
for (i = ns->level; i >= 0; i--) {
@@ -295,7 +297,7 @@ out_free:
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
- pid = NULL;
+ pid = ERR_PTR(nr);
goto out;
}
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 004/100] eclone (4/11): Add target_pids parameter to alloc_pid()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
2010-05-01 14:14 ` [PATCH v21 002/100] eclone (2/11): Have alloc_pidmap() return actual error code Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 005/100] eclone (5/11): Add target_pids parameter to copy_process() Oren Laadan
` (6 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu, linux-api, x86, linux-s390,
linuxppc-dev
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
This parameter is currently NULL, but will be used in a follow-on patch.
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Reviewed-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/pid.h | 2 +-
kernel/fork.c | 3 ++-
kernel/pid.c | 9 +++++++--
3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 49f1c2f..914185d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, int last);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *target_pids);
extern void free_pid(struct pid *pid);
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index afdfb08..62018c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -962,6 +962,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
int retval;
struct task_struct *p;
int cgroup_callbacks_done = 0;
+ pid_t *target_pids = NULL;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1147,7 +1148,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns, target_pids);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
diff --git a/kernel/pid.c b/kernel/pid.c
index 4eaf975..57f1344 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -276,13 +276,14 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *target_pids)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
+ pid_t tpid;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid) {
@@ -292,7 +293,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ tpid = 0;
+ if (target_pids)
+ tpid = target_pids[i];
+
+ nr = set_pidmap(tmp, tpid);
if (nr < 0)
goto out_free;
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 005/100] eclone (5/11): Add target_pids parameter to copy_process()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
2010-05-01 14:14 ` [PATCH v21 002/100] eclone (2/11): Have alloc_pidmap() return actual error code Oren Laadan
2010-05-01 14:14 ` [PATCH v21 004/100] eclone (4/11): Add target_pids parameter to alloc_pid() Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 007/100] eclone (7/11): Define do_fork_with_pids() Oren Laadan
` (5 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu, linux-api, x86, linux-s390,
linuxppc-dev, Oleg Nesterov
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Add a 'target_pids' parameter to copy_process(). The new parameter will be
used in a follow-on patch when eclone() is implemented.
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Reviewed-by: Oren Laadan <orenl@cs.columbia.edu>
---
kernel/fork.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 62018c8..9d2b57e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -957,12 +957,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
+ pid_t *target_pids,
int trace)
{
int retval;
struct task_struct *p;
int cgroup_callbacks_done = 0;
- pid_t *target_pids = NULL;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1339,7 +1339,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid, 0);
+ &init_struct_pid, NULL, 0);
if (!IS_ERR(task))
init_idle(task, cpu);
@@ -1362,6 +1362,7 @@ long do_fork(unsigned long clone_flags,
struct task_struct *p;
int trace = 0;
long nr;
+ pid_t *target_pids = NULL;
/*
* Do some preliminary argument and permissions checking before we
@@ -1402,7 +1403,7 @@ long do_fork(unsigned long clone_flags,
trace = tracehook_prepare_clone(clone_flags);
p = copy_process(clone_flags, stack_start, regs, stack_size,
- child_tidptr, NULL, trace);
+ child_tidptr, NULL, target_pids, trace);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 007/100] eclone (7/11): Define do_fork_with_pids()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (2 preceding siblings ...)
2010-05-01 14:14 ` [PATCH v21 005/100] eclone (5/11): Add target_pids parameter to copy_process() Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 008/100] eclone (8/11): Implement sys_eclone for x86 (32, 64) Oren Laadan
` (4 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu, linux-api, x86, linux-s390,
linuxppc-dev, Oleg Nesterov
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
do_fork_with_pids() is the same as do_fork(), except that it takes an
additional, 'pid_set', parameter. This parameter, currently unused,
specifies the set of target pids of the process in each of its pid
namespaces.
Changelog[v7]:
- Drop 'struct pid_set' object and pass in 'pid_t *target_pids'
instead of 'struct pid_set *'.
Changelog[v6]:
- (Nathan Lynch, Arnd Bergmann, H. Peter Anvin, Linus Torvalds)
Change 'pid_set.pids' to a 'pid_t pids[]' so size of 'struct pid_set'
is constant across architectures.
- (Nathan Lynch) Change 'pid_set.num_pids' to 'unsigned int'.
Changelog[v4]:
- Rename 'struct target_pid_set' to 'struct pid_set' since it may
be useful in other contexts.
Changelog[v3]:
- Fix "long-line" warning from checkpatch.pl
Changelog[v2]:
- To facilitate moving architecture-independent code to kernel/fork.c
pass in 'struct target_pid_set __user *' to do_fork_with_pids()
rather than 'pid_t *' (next patch moves the arch-independent
code to kernel/fork.c)
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Reviewed-by: Oren Laadan <orenl@cs.columbia.edu>
---
include/linux/sched.h | 3 +++
kernel/fork.c | 17 +++++++++++++++--
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5de3ce5..f4ae3e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2129,6 +2129,9 @@ extern int disallow_signal(int);
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
+ unsigned long, int __user *, int __user *,
+ unsigned int, pid_t __user *);
struct task_struct *fork_idle(int);
extern void set_task_comm(struct task_struct *tsk, char *from);
diff --git a/kernel/fork.c b/kernel/fork.c
index e41b3d1..2559d7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1355,12 +1355,14 @@ struct task_struct * __cpuinit fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long do_fork_with_pids(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ unsigned int num_pids,
+ pid_t __user *upids)
{
struct task_struct *p;
int trace = 0;
@@ -1463,6 +1465,17 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return do_fork_with_pids(clone_flags, stack_start, regs, stack_size,
+ parent_tidptr, child_tidptr, 0, NULL);
+}
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 008/100] eclone (8/11): Implement sys_eclone for x86 (32, 64)
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (3 preceding siblings ...)
2010-05-01 14:14 ` [PATCH v21 007/100] eclone (7/11): Define do_fork_with_pids() Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
` (3 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-s390, linux-api, containers, x86, linux-kernel,
linuxppc-dev, Matt Helsley, Serge Hallyn, Sukadev Bhattiprolu,
Pavel Emelyanov
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Container restart requires that a task have the same pid it had when it was
checkpointed. When containers are nested the tasks within the containers
exist in multiple pid namespaces and hence have multiple pids to specify
during restart.
eclone(), intended for use during restart, is the same as
clone(), except that it takes a 'pids' parameter. This parameter lets
caller choose specific pid numbers for the child process, in the
process's active and ancestor pid namespaces. (Descendant pid namespaces
in general don't matter since processes don't have pids in them anyway,
but see comments in copy_target_pids() regarding CLONE_NEWPID).
eclone() also attempts to address a second limitation of the
clone() system call. clone() is restricted to 32 clone flags and all but
one of these are in use. If more new clone flags are needed, we will be
forced to define a new variant of the clone() system call. To address
this, eclone() allows at least 64 clone flags with some room
for more if necessary.
To prevent unprivileged processes from misusing this interface,
eclone() currently needs CAP_SYS_ADMIN, when the 'pids' parameter
is non-NULL.
See Documentation/eclone in next patch for more details and an
example of its usage.
NOTE:
- System calls are restricted to 6 parameters and the number and sizes
of parameters needed for eclone() exceed 6 integers. The new
prototype works around this restriction while providing some
flexibility if eclone() needs to be further extended in the
future.
TODO:
- We should convert clone-flags to 64-bit value in all architectures.
It's probably best to do that as a separate patchset since clone_flags
touches several functions and that patchset seems independent of this
new system call.
Changelog[v14]:
- [Oren Laadan] Rebase to kernel 2.6.33
* introduce PTREGSCALL4 for sys_eclone
* consolidate syscall definitions for 32/64 bit
- [Oren Laadan] Merge x86_64 (trivial patch) with current
- [Serge Hallyn] Add eclone stub for ia32 eclone
Changelog[v13]:
- [Dave Hansen]: Reorg to enable sharing code between x86 and x86-64.
- [Arnd Bergmann]: With args_size parameter, ->reserved1 is redundant
and can be removed.
- [Nathan Lynch]: stop warnings about assigning u64 to a (32-bit) int*.
- [Nathan Lynch, Serge Hallyn] Rename ->child_stack_base to
->child_stack and ensure ->child_stack_size is 0 on architectures
that don't need it (see comments in types.h for details).
Changelog[v12]:
- [Serge Hallyn] Ignore ->child_stack_size if ->child_stack_base
is NULL.
- [Oren Laadan, Serge Hallyn] Rename clone_with_pids() to eclone()
Changelog[v11]:
- [Dave Hansen] Move clone_args validation checks to arch-independent
code.
- [Oren Laadan] Make args_size a parameter to system call and remove
it from 'struct clone_args'
Changelog[v10]:
- Rename clone3() to clone_with_pids()
- [Linus Torvalds] Use PTREGSCALL() rather than the generic syscall
implementation
Changelog[v9]:
- [Roland McGrath, H. Peter Anvin] To avoid confusion on 64-bit
architectures split the new clone-flags into 'low' and 'high'
words and pass in the 'lower' flags as the first argument.
This would maintain similarity of the clone3() with clone()/
clone2(). Also has the side-effect of the name matching the
number of parameters :-)
- [Roland McGrath] Rename structure to 'clone_args' and add a
'child_stack_size' field
Changelog[v8]
- [Oren Laadan] parent_tid and child_tid fields in 'struct clone_arg'
must be 64-bit.
- clone2() is in use in IA64. Rename system call to clone3().
Changelog[v7]:
- [Peter Zijlstra, Arnd Bergmann] Rename system call to clone2()
and group parameters into a new 'struct clone_struct' object.
Changelog[v6]:
- (Nathan Lynch, Arnd Bergmann, H. Peter Anvin, Linus Torvalds)
Change 'pid_set.pids' to a 'pid_t pids[]' so size of 'struct pid_set'
is constant across architectures.
- (Nathan Lynch) Change pid_set.num_pids to unsigned and remove
'unum_pids < 0' check.
Changelog[v4]:
- (Oren Laadan) rename 'struct target_pid_set' to 'struct pid_set'
Changelog[v3]:
- (Oren Laadan) Allow CLONE_NEWPID flag (by allocating an extra pid
in the target_pids[] list and setting it 0. See copy_target_pids()).
- (Oren Laadan) Specified target pids should apply only to youngest
pid-namespaces (see copy_target_pids())
- (Matt Helsley) Update patch description.
Changelog[v2]:
- Remove unnecessary printk and add a note to callers of
copy_target_pids() to free target_pids.
- (Serge Hallyn) Mention CAP_SYS_ADMIN restriction in patch description.
- (Oren Laadan) Add checks for 'num_pids < 0' (return -EINVAL) and
'num_pids == 0' (fall back to normal clone()).
- Move arch-independent code (sanity checks and copy-in of target-pids)
into kernel/fork.c and simplify sys_clone_with_pids()
Changelog[v1]:
- Fixed some compile errors (had fixed these errors earlier in my
git tree but had not refreshed patches before emailing them)
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
---
arch/x86/ia32/ia32entry.S | 2 +
arch/x86/include/asm/syscalls.h | 2 +
arch/x86/include/asm/unistd_32.h | 3 +-
arch/x86/include/asm/unistd_64.h | 2 +
arch/x86/kernel/entry_32.S | 14 ++++
arch/x86/kernel/entry_64.S | 1 +
arch/x86/kernel/process.c | 40 +++++++++++-
arch/x86/kernel/syscall_table_32.S | 1 +
include/linux/sched.h | 2 +
include/linux/types.h | 16 +++++
kernel/fork.c | 124 +++++++++++++++++++++++++++++++++++-
11 files changed, 204 insertions(+), 3 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59b4556..b7f3f34 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -477,6 +477,7 @@ quiet_ni_syscall:
PTREGSCALL stub32_clone, sys32_clone, %rdx
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
+ PTREGSCALL stub32_eclone, sys_eclone, %r8
ENTRY(ia32_ptregs_common)
popq %r11
@@ -842,4 +843,5 @@ ia32_sys_call_table:
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
.quad compat_sys_recvmmsg
+ .quad stub32_eclone
ia32_syscall_end:
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b4..d525677 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -27,6 +27,8 @@ long sys_execve(char __user *, char __user * __user *,
char __user * __user *, struct pt_regs *);
long sys_clone(unsigned long, unsigned long, void __user *,
void __user *, struct pt_regs *);
+long sys_eclone(unsigned flags_low, struct clone_args __user *uca,
+ int args_size, pid_t __user *pids, struct pt_regs *regs);
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f..e543b0e 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,11 @@
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
+#define __NR_eclone 338
#ifdef __KERNEL__
-#define NR_syscalls 338
+#define NR_syscalls 339
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b..1cd16af 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_recvmmsg 299
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_eclone 300
+__SYSCALL(__NR_eclone, stub_eclone)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0d..65e1735 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -758,6 +758,19 @@ ptregs_##name: \
addl $4,%esp; \
ret
+#define PTREGSCALL4(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ pushl %eax; \
+ pushl PT_ESI(%eax); \
+ movl PT_EDX(%eax),%ecx; \
+ movl PT_ECX(%eax),%edx; \
+ movl PT_EBX(%eax),%eax; \
+ call sys_##name; \
+ addl $8,%esp; \
+ ret
+
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
@@ -767,6 +780,7 @@ PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
+PTREGSCALL4(eclone)
/* Clone is an oddball. The 4th arg is in %edi */
ALIGN;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0697ff1..216681e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -698,6 +698,7 @@ END(\label)
PTREGSCALL stub_vfork, sys_vfork, %rdi
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
+ PTREGSCALL stub_eclone, sys_eclone, %r8
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4..5abad20 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -259,6 +259,45 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
+long
+sys_eclone(unsigned flags_low, struct clone_args __user *uca,
+ int args_size, pid_t __user *pids, struct pt_regs *regs)
+{
+ int rc;
+ struct clone_args kca;
+ unsigned long flags;
+ int __user *parent_tidp;
+ int __user *child_tidp;
+ unsigned long __user stack;
+ unsigned long stack_size;
+
+ rc = fetch_clone_args_from_user(uca, args_size, &kca);
+ if (rc)
+ return rc;
+
+ /*
+ * TODO: Convert 'clone-flags' to 64-bits on all architectures.
+ * TODO: When ->clone_flags_high is non-zero, copy it in to the
+ * higher word(s) of 'flags':
+ *
+ * flags = (kca.clone_flags_high << 32) | flags_low;
+ */
+ flags = flags_low;
+ parent_tidp = (int *)(unsigned long)kca.parent_tid_ptr;
+ child_tidp = (int *)(unsigned long)kca.child_tid_ptr;
+
+ stack_size = (unsigned long)kca.child_stack_size;
+ if (stack_size)
+ return -EINVAL;
+
+ stack = (unsigned long)kca.child_stack;
+ if (!stack)
+ stack = regs->sp;
+
+ return do_fork_with_pids(flags, stack, regs, stack_size, parent_tidp,
+ child_tidp, kca.nr_pids, pids);
+}
+
/*
* This gets run with %si containing the
* function to call, and %di containing
@@ -700,4 +739,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
-
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b37293..0c92570 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,4 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long ptregs_eclone
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4ae3e3..8593051 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2129,6 +2129,8 @@ extern int disallow_signal(int);
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern int fetch_clone_args_from_user(struct clone_args __user *, int,
+ struct clone_args *);
extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
unsigned long, int __user *, int __user *,
unsigned int, pid_t __user *);
diff --git a/include/linux/types.h b/include/linux/types.h
index c42724f..d8bfd6b 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -204,6 +204,22 @@ struct ustat {
char f_fpack[6];
};
+struct clone_args {
+ u64 clone_flags_high;
+ /*
+ * Architectures can use child_stack for either the stack pointer or
+ * the base of the stack. If child_stack is used as the stack pointer,
+ * child_stack_size must be 0. Otherwise child_stack_size must be
+ * set to size of allocated stack.
+ */
+ u64 child_stack;
+ u64 child_stack_size;
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+ u32 nr_pids;
+ u32 reserved0;
+};
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2559d7a..9d5be5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1350,6 +1350,114 @@ struct task_struct * __cpuinit fork_idle(int cpu)
}
/*
+ * If user specified any 'target-pids' in @upid_setp, copy them from
+ * user and return a pointer to a local copy of the list of pids. The
+ * caller must free the list, when they are done using it.
+ *
+ * If user did not specify any target pids, return NULL (caller should
+ * treat this like normal clone).
+ *
+ * On any errors, return the error code
+ */
+static pid_t *copy_target_pids(int unum_pids, pid_t __user *upids)
+{
+ int j;
+ int rc;
+ int size;
+ int knum_pids; /* # of pids needed in kernel */
+ pid_t *target_pids;
+
+ if (!unum_pids)
+ return NULL;
+
+ knum_pids = task_pid(current)->level + 1;
+ if (unum_pids > knum_pids)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * To keep alloc_pid() simple, allocate an extra pid_t in target_pids[]
+ * and set it to 0. This last entry in target_pids[] corresponds to the
+ * (yet-to-be-created) descendant pid-namespace if CLONE_NEWPID was
+ * specified. If CLONE_NEWPID was not specified, this last entry will
+ * simply be ignored.
+ */
+ target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
+ if (!target_pids)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * A process running in a level 2 pid namespace has three pid namespaces
+ * and hence three pid numbers. If this process is checkpointed,
+ * information about these three namespaces are saved. We refer to these
+ * namespaces as 'known namespaces'.
+ *
+ * If this checkpointed process is however restarted in a level 3 pid
+ * namespace, the restarted process has an extra ancestor pid namespace
+ * (i.e 'unknown namespace') and 'knum_pids' exceeds 'unum_pids'.
+ *
+ * During restart, the process requests specific pids for its 'known
+ * namespaces' and lets kernel assign pids to its 'unknown namespaces'.
+ *
+ * Since the requested-pids correspond to 'known namespaces' and since
+ * 'known-namespaces' are younger than (i.e descendants of) 'unknown-
+ * namespaces', copy requested pids to the back-end of target_pids[]
+ * (i.e before the last entry for CLONE_NEWPID mentioned above).
+ * Any entries in target_pids[] not corresponding to a requested pid
+ * will be set to zero and kernel assigns a pid in those namespaces.
+ *
+ * NOTE: The order of pids in target_pids[] is oldest pid namespace
+ * to youngest (target_pids[0] corresponds to init_pid_ns). i.e.
+ * the order is:
+ *
+ * - pids for 'unknown-namespaces' (if any)
+ * - pids for 'known-namespaces' (requested pids)
+ * - 0 in the last entry (for CLONE_NEWPID).
+ */
+ j = knum_pids - unum_pids;
+ size = unum_pids * sizeof(pid_t);
+
+ rc = copy_from_user(&target_pids[j], upids, size);
+ if (rc) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+ return target_pids;
+
+out_free:
+ kfree(target_pids);
+ return ERR_PTR(rc);
+}
+
+int
+fetch_clone_args_from_user(struct clone_args __user *uca, int args_size,
+ struct clone_args *kca)
+{
+ int rc;
+
+ /*
+ * TODO: If size of clone_args is not what the kernel expects, it
+ * could be that kernel is newer and has an extended structure.
+ * When that happens, this check needs to be smarter. For now,
+ * assume exact match.
+ */
+ if (args_size != sizeof(struct clone_args))
+ return -EINVAL;
+
+ rc = copy_from_user(kca, uca, args_size);
+ if (rc)
+ return -EFAULT;
+
+ /*
+ * To avoid future compatibility issues, ensure unused fields are 0.
+ */
+ if (kca->reserved0 || kca->clone_flags_high)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
@@ -1367,7 +1475,7 @@ long do_fork_with_pids(unsigned long clone_flags,
struct task_struct *p;
int trace = 0;
long nr;
- pid_t *target_pids = NULL;
+ pid_t *target_pids;
/*
* Do some preliminary argument and permissions checking before we
@@ -1401,6 +1509,16 @@ long do_fork_with_pids(unsigned long clone_flags,
}
}
+ target_pids = copy_target_pids(num_pids, upids);
+ if (target_pids) {
+ if (IS_ERR(target_pids))
+ return PTR_ERR(target_pids);
+
+ nr = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out_free;
+ }
+
/*
* When called from kernel_thread, don't do user tracing stuff.
*/
@@ -1462,6 +1580,10 @@ long do_fork_with_pids(unsigned long clone_flags,
} else {
nr = PTR_ERR(p);
}
+
+out_free:
+ kfree(target_pids);
+
return nr;
}
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
[parent not found: <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* [PATCH v21 001/100] eclone (1/11): Factor out code to allocate pidmap page
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-05-01 14:14 ` Oren Laadan
[not found] ` <1272723382-19470-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-05-01 14:14 ` [PATCH v21 003/100] eclone (3/11): Define set_pidmap() function Oren Laadan
` (3 subsequent siblings)
4 siblings, 1 reply; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
x86-DgEjT+Ai2ygdnm+yROfE0A, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linuxppc-dev-mnsaURCQ41sdnm+yROfE0A, Sukadev Bhattiprolu,
Pavel Emelyanov
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
To simplify alloc_pidmap(), move code to allocate a pid map page to a
separate function.
Changelog[v4]:
- [Oren Laadan] Adapt to kernel 2.6.33-rc5
Changelog[v3]:
- Earlier version of patchset called alloc_pidmap_page() from two
places. But now it's called from only one place. Even so, moving
this code out into a separate function simplifies alloc_pidmap().
Changelog[v2]:
- (Matt Helsley, Dave Hansen) Have alloc_pidmap_page() return
-ENOMEM on error instead of -1.
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linuxppc-dev-mnsaURCQ41sdnm+yROfE0A@public.gmane.org
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Tested-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
kernel/pid.c | 41 ++++++++++++++++++++++++++---------------
1 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d..52a371a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,30 @@ static void free_pidmap(struct upid *upid)
atomic_inc(&map->nr_free);
}
+/*
+ * alloc_pidmap_page - make sure @map has a backing bitmap page
+ * @map: pidmap entry whose ->page should be populated
+ *
+ * If map->page is already set this is a no-op.  Otherwise a zeroed page is
+ * allocated and installed under pidmap_lock; if map->page got set in the
+ * meantime, the freshly allocated page is freed instead.
+ *
+ * Returns 0 when map->page is set on exit, or -ENOMEM when the allocation
+ * failed and no page is installed.
+ */
+static int alloc_pidmap_page(struct pidmap *map)
+{
+	void *page;
+
+	if (likely(map->page))
+		return 0;
+
+	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	/*
+	 * Free the page if someone raced with us installing it:
+	 */
+	spin_lock_irq(&pidmap_lock);
+	if (!map->page) {
+		map->page = page;
+		page = NULL;
+	}
+	spin_unlock_irq(&pidmap_lock);
+	/* page is NULL here if it was installed above (or never allocated) */
+	kfree(page);
+	if (unlikely(!map->page))
+		return -ENOMEM;
+
+	return 0;
+}
+
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -134,22 +158,9 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
+ if (unlikely(!map->page))
+ if (alloc_pidmap_page(map) < 0)
break;
- }
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 003/100] eclone (3/11): Define set_pidmap() function
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-05-01 14:14 ` [PATCH v21 001/100] eclone (1/11): Factor out code to allocate pidmap page Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 006/100] eclone (6/11): Check invalid clone flags Oren Laadan
` (2 subsequent siblings)
4 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu,
linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
linux-s390-u79uwXL29TY76Z2rM5mHXA,
linuxppc-dev-mnsaURCQ41sdnm+yROfE0A, Sukadev Bhattiprolu
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Define a set_pidmap() interface which is like alloc_pidmap() only that
caller specifies the pid number to be assigned.
Changelog[v13]:
- Don't let do_alloc_pidmap return 0 if it failed to find a pid.
Changelog[v9]:
- Completely rewrote this patch based on Eric Biederman's code.
Changelog[v7]:
- [Eric Biederman] Generalize alloc_pidmap() to take a range of pids.
Changelog[v6]:
- Separate target_pid > 0 case to minimize the number of checks needed.
Changelog[v3]:
- (Eric Biederman): Avoid set_pidmap() function. Added couple of
checks for target_pid in alloc_pidmap() itself.
Changelog[v2]:
- (Serge Hallyn) Check for 'pid < 0' in set_pidmap().(Code
actually checks for 'pid <= 0' for completeness).
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linuxppc-dev-mnsaURCQ41sdnm+yROfE0A@public.gmane.org
Signed-off-by: Sukadev Bhattiprolu <sukadev-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
kernel/pid.c | 41 +++++++++++++++++++++++++++++++++--------
1 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index 8330488..4eaf975 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -146,17 +146,18 @@ static int alloc_pidmap_page(struct pidmap *map)
return 0;
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int do_alloc_pidmap(struct pid_namespace *pid_ns, int last, int min,
+ int max)
{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ int i, offset, max_scan, pid;
struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
- pid = RESERVED_PIDS;
+ pid = min;
offset = pid & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+ max_scan = (max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page))
if (alloc_pidmap_page(map) < 0)
@@ -165,7 +166,6 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- pid_ns->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
@@ -176,16 +176,16 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
* bitmap block and the final block was the same
* as the starting point, pid is before last_pid.
*/
- } while (offset < BITS_PER_PAGE && pid < pid_max &&
+ } while (offset < BITS_PER_PAGE && pid < max &&
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ if (map < &pid_ns->pidmap[(max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
+ offset = min;
if (unlikely(last == offset))
break;
}
@@ -194,6 +194,31 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -EBUSY;
}
+/*
+ * Allocate a pid in @pid_ns via do_alloc_pidmap(), starting after
+ * pid_ns->last_pid and passing RESERVED_PIDS / pid_max as the min / max
+ * bounds.  A non-negative result is remembered in pid_ns->last_pid; a
+ * negative result is handed back to the caller untouched.
+ */
+static int alloc_pidmap(struct pid_namespace *pid_ns)
+{
+	int pid = do_alloc_pidmap(pid_ns, pid_ns->last_pid, RESERVED_PIDS,
+				  pid_max);
+
+	if (pid < 0)
+		return pid;
+
+	pid_ns->last_pid = pid;
+	return pid;
+}
+
+/*
+ * Like alloc_pidmap(), except that the caller names the pid it wants.
+ *
+ * @target == 0 falls back to alloc_pidmap().  A @target that is negative,
+ * not below pid_max, or below RESERVED_PIDS while pid_ns->last_pid has
+ * already reached RESERVED_PIDS yields -EINVAL.  Otherwise
+ * do_alloc_pidmap() is asked for exactly @target (last = target - 1,
+ * min = target, max = target + 1) and its result is returned.
+ */
+static int set_pidmap(struct pid_namespace *pid_ns, int target)
+{
+	if (target == 0)
+		return alloc_pidmap(pid_ns);
+
+	if (target < 0 || target >= pid_max)
+		return -EINVAL;
+
+	if (target < RESERVED_PIDS && pid_ns->last_pid >= RESERVED_PIDS)
+		return -EINVAL;
+
+	return do_alloc_pidmap(pid_ns, target - 1, target, target + 1);
+}
+
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 006/100] eclone (6/11): Check invalid clone flags
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-05-01 14:14 ` [PATCH v21 001/100] eclone (1/11): Factor out code to allocate pidmap page Oren Laadan
2010-05-01 14:14 ` [PATCH v21 003/100] eclone (3/11): Define set_pidmap() function Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 009/100] eclone (9/11): Implement sys_eclone for s390 Oren Laadan
2010-05-01 14:14 ` [PATCH v21 010/100] eclone (10/11): Implement sys_eclone for powerpc Oren Laadan
4 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu,
linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
linux-s390-u79uwXL29TY76Z2rM5mHXA,
linuxppc-dev-mnsaURCQ41sdnm+yROfE0A, Oleg Nesterov
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
As pointed out by Oren Laadan, we want to ensure that unused bits in the
clone-flags remain unused and available for future. To ensure this, define
a mask of clone-flags and check the flags in the clone() system calls.
Changelog[v9]:
- Include the unused clone-flag (CLONE_UNUSED) to VALID_CLONE_FLAGS
to avoid breaking any applications that may have set it. IOW, this
patch/check only applies to clone-flags bits 33 and higher.
Changelog[v8]:
- New patch in set
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linuxppc-dev-mnsaURCQ41sdnm+yROfE0A@public.gmane.org
Cc: Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Tested-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Acked-by: Oren Laadan <orenl.cs.columbia.edu>
---
include/linux/sched.h | 12 ++++++++++++
kernel/fork.c | 3 +++
2 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f66..5de3ce5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,18 @@
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+/* Bit 0x1000 has no named clone flag; it is named here so it can be whitelisted. */
+#define CLONE_UNUSED 0x00001000 /* Can be reused ? */
+
+/*
+ * Mask of all clone flag bits that are accepted.  copy_process() returns
+ * -EINVAL when any bit outside this mask is set.  CLONE_UNUSED is part of
+ * the mask so that applications which may have set that bit do not break.
+ */
+#define VALID_CLONE_FLAGS (CSIGNAL | CLONE_VM | CLONE_FS | CLONE_FILES |\
+ CLONE_SIGHAND | CLONE_UNUSED | CLONE_PTRACE |\
+ CLONE_VFORK | CLONE_PARENT | CLONE_THREAD |\
+ CLONE_NEWNS | CLONE_SYSVSEM | CLONE_SETTLS |\
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |\
+ CLONE_DETACHED | CLONE_UNTRACED |\
+ CLONE_CHILD_SETTID | CLONE_STOPPED |\
+ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER |\
+ CLONE_NEWPID | CLONE_NEWNET | CLONE_IO)
+
/*
* Scheduling policies
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d2b57e..e41b3d1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -964,6 +964,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
struct task_struct *p;
int cgroup_callbacks_done = 0;
+ if (clone_flags & ~VALID_CLONE_FLAGS)
+ return ERR_PTR(-EINVAL);
+
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 009/100] eclone (9/11): Implement sys_eclone for s390
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
` (2 preceding siblings ...)
2010-05-01 14:14 ` [PATCH v21 006/100] eclone (6/11): Check invalid clone flags Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
2010-05-01 14:14 ` [PATCH v21 010/100] eclone (10/11): Implement sys_eclone for powerpc Oren Laadan
4 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
x86-DgEjT+Ai2ygdnm+yROfE0A, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linuxppc-dev-mnsaURCQ41sdnm+yROfE0A, Pavel Emelyanov
From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Implement the s390 hook for sys_eclone().
Changelog:
Nov 24: Removed user-space code from commit log. See user-cr git tree.
Nov 17: remove redundant flags_high check
Nov 13: As suggested by Heiko, convert eclone to take its
parameters via registers.
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linuxppc-dev-mnsaURCQ41sdnm+yROfE0A@public.gmane.org
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
arch/s390/include/asm/unistd.h | 3 ++-
arch/s390/kernel/compat_linux.c | 17 +++++++++++++++++
arch/s390/kernel/compat_wrapper.S | 8 ++++++++
arch/s390/kernel/process.c | 37 +++++++++++++++++++++++++++++++++++++
arch/s390/kernel/syscalls.S | 1 +
5 files changed, 65 insertions(+), 1 deletions(-)
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 5f00751..ff13be1 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -269,7 +269,8 @@
#define __NR_pwritev 329
#define __NR_rt_tgsigqueueinfo 330
#define __NR_perf_event_open 331
-#define NR_syscalls 332
+#define __NR_eclone 332
+#define NR_syscalls 333
/*
* There are some system calls that are not present on 64 bit, some
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 73b624e..1f70d6f 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -663,6 +663,23 @@ asmlinkage long sys32_write(unsigned int fd, char __user * buf, size_t count)
return sys_write(fd, buf, count);
}
+/*
+ * Compat clone(): takes no C arguments and instead reads them from the
+ * saved user registers of the current task -- new stack pointer from
+ * orig_gpr2, clone flags from gprs[3], parent/child tid pointers from
+ * gprs[4]/gprs[5] -- then hands them to do_fork().
+ */
+asmlinkage long sys32_clone(void)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ unsigned long clone_flags;
+ unsigned long newsp;
+ int __user *parent_tidptr, *child_tidptr;
+
+ /* keep only the low 32 bits of the flags register */
+ clone_flags = regs->gprs[3] & 0xffffffffUL;
+ /* keep only the low 31 bits of the requested stack address */
+ newsp = regs->orig_gpr2 & 0x7fffffffUL;
+ parent_tidptr = compat_ptr(regs->gprs[4]);
+ child_tidptr = compat_ptr(regs->gprs[5]);
+ /* no stack requested: use gprs[15] (presumably the caller's sp -- confirm) */
+ if (!newsp)
+ newsp = regs->gprs[15];
+ return do_fork(clone_flags, newsp, regs, 0,
+ parent_tidptr, child_tidptr);
+}
+
/*
* 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64.
* These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE}
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 672ce52..b7bedfa 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1847,6 +1847,14 @@ sys_clone_wrapper:
llgtr %r5,%r5 # int *
jg sys_clone # branch to system call
+/*
+ * Compat entry point for eclone: converts the four argument registers
+ * %r2-%r5 according to the C type noted next to each instruction, then
+ * jumps to sys_eclone.
+ */
+ .globl sys_eclone_wrapper
+sys_eclone_wrapper:
+ llgfr %r2,%r2 # unsigned int
+ llgtr %r3,%r3 # struct clone_args *
+ lgfr %r4,%r4 # int
+ llgtr %r5,%r5 # pid_t *
+ jg sys_eclone # branch to system call
+
.globl sys32_execve_wrapper
sys32_execve_wrapper:
llgtr %r2,%r2 # char *
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 1039fde..799cbb0 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -240,6 +240,43 @@ SYSCALL_DEFINE4(clone, unsigned long, newsp, unsigned long, clone_flags,
parent_tidptr, child_tidptr);
}
+/*
+ * eclone() for s390.
+ *
+ * @flags_low: clone flags, passed on as the flags of the fork
+ * @uca:       user pointer to the caller's struct clone_args
+ * @args_size: size of *@uca as given by the caller
+ * @pids:      user array of requested pids (kca.nr_pids entries)
+ *
+ * Returns the error from fetch_clone_args_from_user() if that fails,
+ * -EINVAL if a non-zero child_stack_size is given, and otherwise the
+ * result of do_fork_with_pids().
+ */
+SYSCALL_DEFINE4(eclone, unsigned int, flags_low, struct clone_args __user *,
+		uca, int, args_size, pid_t __user *, pids)
+{
+	int rc;
+	struct pt_regs *regs = task_pt_regs(current);
+	struct clone_args kca;
+	int __user *parent_tid_ptr;
+	int __user *child_tid_ptr;
+	unsigned long flags;
+	/* an address value held in an integer, so no __user annotation */
+	unsigned long child_stack;
+	unsigned long stack_size;
+
+	rc = fetch_clone_args_from_user(uca, args_size, &kca);
+	if (rc)
+		return rc;
+
+	flags = flags_low;
+	parent_tid_ptr = (int __user *) kca.parent_tid_ptr;
+	child_tid_ptr = (int __user *) kca.child_tid_ptr;
+
+	/* a stack size is not accepted here; only 0 is valid */
+	stack_size = (unsigned long) kca.child_stack_size;
+	if (stack_size)
+		return -EINVAL;
+
+	/* no child stack given: fall back to gprs[15] of the caller */
+	child_stack = (unsigned long) kca.child_stack;
+	if (!child_stack)
+		child_stack = regs->gprs[15];
+
+	/*
+	 * TODO: On 32-bit systems, clone_flags is passed in as 32-bit value
+	 * to several functions. Need to convert clone_flags to 64-bit.
+	 */
+	return do_fork_with_pids(flags, child_stack, regs, stack_size,
+				parent_tid_ptr, child_tid_ptr, kca.nr_pids,
+				pids);
+}
+
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 201ce6b..08eab1d 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -340,3 +340,4 @@ SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper)
SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper)
SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */
SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper)
+SYSCALL(sys_eclone,sys_eclone,sys_eclone_wrapper)
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 010/100] eclone (10/11): Implement sys_eclone for powerpc
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
` (3 preceding siblings ...)
2010-05-01 14:14 ` [PATCH v21 009/100] eclone (9/11): Implement sys_eclone for s390 Oren Laadan
@ 2010-05-01 14:14 ` Oren Laadan
4 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Nathan Lynch, linux-api-u79uwXL29TY76Z2rM5mHXA,
x86-DgEjT+Ai2ygdnm+yROfE0A, linux-s390-u79uwXL29TY76Z2rM5mHXA,
linuxppc-dev-mnsaURCQ41sdnm+yROfE0A
From: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Wired up for both ppc32 and ppc64, but tested only with the latter.
Changelog:
- Jan 20: (ntl) fix 32-bit build
- Nov 17: (serge) remove redundant flags_high check, and
don't fold it into flags.
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linuxppc-dev-mnsaURCQ41sdnm+yROfE0A@public.gmane.org
Signed-off-by: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
arch/powerpc/include/asm/syscalls.h | 6 ++++
arch/powerpc/include/asm/systbl.h | 1 +
arch/powerpc/include/asm/unistd.h | 3 +-
arch/powerpc/kernel/entry_32.S | 8 +++++
arch/powerpc/kernel/entry_64.S | 5 +++
arch/powerpc/kernel/process.c | 54 ++++++++++++++++++++++++++++++++++-
6 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 4084e56..920cefd 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -23,6 +23,12 @@ asmlinkage int sys_execve(unsigned long a0, unsigned long a1,
asmlinkage int sys_clone(unsigned long clone_flags, unsigned long usp,
int __user *parent_tidp, void __user *child_threadptr,
int __user *child_tidp, int p6, struct pt_regs *regs);
+asmlinkage int sys_eclone(unsigned long flags_low,
+ struct clone_args __user *args,
+ size_t args_size,
+ pid_t __user *pids,
+ unsigned long p5, unsigned long p6,
+ struct pt_regs *regs);
asmlinkage int sys_fork(unsigned long p1, unsigned long p2,
unsigned long p3, unsigned long p4, unsigned long p5,
unsigned long p6, struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index a5ee345..f94fc43 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -326,3 +326,4 @@ SYSCALL_SPU(perf_event_open)
COMPAT_SYS_SPU(preadv)
COMPAT_SYS_SPU(pwritev)
COMPAT_SYS(rt_tgsigqueueinfo)
+PPC_SYS(eclone)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f0a1026..4cdbd5c 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -345,10 +345,11 @@
#define __NR_preadv 320
#define __NR_pwritev 321
#define __NR_rt_tgsigqueueinfo 322
+#define __NR_eclone 323
#ifdef __KERNEL__
-#define __NR_syscalls 323
+#define __NR_syscalls 324
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 1175a85..579f1da 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -586,6 +586,14 @@ ppc_clone:
stw r0,_TRAP(r1) /* register set saved */
b sys_clone
+/*
+ * 32-bit entry stub for eclone: runs SAVE_NVGPRS, clears the low bit of
+ * the saved _TRAP word to mark the register set as fully saved, then
+ * branches to sys_eclone.
+ */
+ .globl ppc_eclone
+ppc_eclone:
+ SAVE_NVGPRS(r1)
+ lwz r0,_TRAP(r1)
+ rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */
+ stw r0,_TRAP(r1) /* register set saved */
+ b sys_eclone
+
.globl ppc_swapcontext
ppc_swapcontext:
SAVE_NVGPRS(r1)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 07109d8..b763340 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -344,6 +344,11 @@ _GLOBAL(ppc_clone)
bl .sys_clone
b syscall_exit
+/*
+ * 64-bit entry stub for eclone: calls .save_nvgprs, then .sys_eclone, and
+ * leaves through syscall_exit.
+ */
+_GLOBAL(ppc_eclone)
+ bl .save_nvgprs
+ bl .sys_eclone
+ b syscall_exit
+
_GLOBAL(ppc32_swapcontext)
bl .save_nvgprs
bl .compat_sys_swapcontext
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e4d71ce..b183287 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -961,7 +961,59 @@ int sys_clone(unsigned long clone_flags, unsigned long usp,
child_tidp = TRUNC_PTR(child_tidp);
}
#endif
- return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
+ return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
+}
+
+/*
+ * eclone() for powerpc.
+ *
+ * @clone_flags_low: clone flags, passed on as the flags of the fork
+ * @uclone_args:     user pointer to the caller's struct clone_args
+ * @size:            size of *@uclone_args as given by the caller
+ * @upids:           user array of requested pids
+ * @p5, @p6:         unused
+ * @regs:            saved user registers of the caller
+ *
+ * Returns the error from fetch_clone_args_from_user() if that fails,
+ * -EINVAL if a non-zero child_stack_size is given, and otherwise the
+ * result of do_fork_with_pids().
+ */
+int sys_eclone(unsigned long clone_flags_low,
+	       struct clone_args __user *uclone_args,
+	       size_t size,
+	       pid_t __user *upids,
+	       unsigned long p5, unsigned long p6,
+	       struct pt_regs *regs)
+{
+	struct clone_args kclone_args;
+	unsigned long stack_base;
+	int __user *parent_tidp;
+	int __user *child_tidp;
+	unsigned long stack_sz;
+	unsigned int nr_pids;
+	unsigned long flags;
+	unsigned long usp;
+	int rc;
+
+	CHECK_FULL_REGS(regs);
+
+	rc = fetch_clone_args_from_user(uclone_args, size, &kclone_args);
+	if (rc)
+		return rc;
+
+	stack_sz = kclone_args.child_stack_size;
+	stack_base = kclone_args.child_stack;
+
+	/* powerpc doesn't do anything useful with the stack size */
+	if (stack_sz)
+		return -EINVAL;
+
+	/* Interpret stack_base as the child sp if it is set. */
+	usp = regs->gpr[1];
+	if (stack_base)
+		usp = stack_base;
+
+	flags = clone_flags_low;
+
+	nr_pids = kclone_args.nr_pids;
+
+	parent_tidp = (int __user *)(unsigned long)kclone_args.parent_tid_ptr;
+	child_tidp = (int __user *)(unsigned long)kclone_args.child_tid_ptr;
+
+#ifdef CONFIG_PPC64
+	if (test_thread_flag(TIF_32BIT)) {
+		parent_tidp = TRUNC_PTR(parent_tidp);
+		child_tidp = TRUNC_PTR(child_tidp);
+	}
+#endif
+	/*
+	 * Pass usp, not stack_base: usp carries the fallback to the caller's
+	 * own r1 when no child stack was supplied.
+	 */
+	return do_fork_with_pids(flags, usp, regs, stack_sz,
+				 parent_tidp, child_tidp, nr_pids, upids);
 }
int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 011/100] eclone (11/11): Document sys_eclone
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (5 preceding siblings ...)
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-05-01 14:14 ` Oren Laadan
[not found] ` <1272723382-19470-12-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
2010-05-01 14:15 ` [PATCH v21 021/100] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
8 siblings, 1 reply; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:14 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Sukadev Bhattiprolu, linux-api, x86, linux-s390,
linuxppc-dev
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
This gives a brief overview of the eclone() system call. We should
eventually describe more details in existing clone(2) man page or in
a new man page.
Changelog[v13]:
- [Nathan Lynch, Serge Hallyn] Rename ->child_stack_base to
->child_stack and ensure ->child_stack_size is 0 on architectures
that don't need it.
- [Arnd Bergmann] Remove ->reserved1 field
- [Louis Rilling, Dave Hansen] Combine the two asm statements in the
example into one and use memory constraint to avoid unnecessary copies.
Changelog[v12]:
- [Serge Hallyn] Fix/simplify stack-setup in the example code
- [Serge Hallyn, Oren Laadan] Rename syscall to eclone()
Changelog[v11]:
- [Dave Hansen] Move clone_args validation checks to arch-independent
code.
- [Oren Laadan] Make args_size a parameter to system call and remove
it from 'struct clone_args'
- [Oren Laadan] Fix some typos and clarify the order of pids in the
@pids parameter.
Changelog[v10]:
- Rename clone3() to clone_with_pids() and fix some typos.
- Modify example to show usage with the ptregs implementation.
Changelog[v9]:
- [Pavel Machek]: Fix an inconsistency and rename new file to
Documentation/clone3.
- [Roland McGrath, H. Peter Anvin] Updates to description and
example to reflect new prototype of clone3() and the updated/
renamed 'struct clone_args'.
Changelog[v8]:
- clone2() is already in use in IA64. Rename syscall to clone3()
- Add notes to say that we return -EINVAL if invalid clone flags
are specified or if the reserved fields are not 0.
Changelog[v7]:
- Rename clone_with_pids() to clone2()
- Changes to reflect new prototype of clone2() (using clone_struct).
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
---
Documentation/eclone | 348 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 348 insertions(+), 0 deletions(-)
create mode 100644 Documentation/eclone
diff --git a/Documentation/eclone b/Documentation/eclone
new file mode 100644
index 0000000..c2f1b4b
--- /dev/null
+++ b/Documentation/eclone
@@ -0,0 +1,348 @@
+
+struct clone_args {
+ u64 clone_flags_high;
+ u64 child_stack;
+ u64 child_stack_size;
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+ u32 nr_pids;
+ u32 reserved0;
+};
+
+
+sys_eclone(u32 flags_low, struct clone_args * __user cargs, int cargs_size,
+ pid_t * __user pids)
+
+ In addition to doing everything that clone() system call does, the
+ eclone() system call:
+
+ - allows additional clone flags (31 of 32 bits in the flags
+ parameter to clone() are in use)
+
+ - allows user to specify a pid for the child process in its
+ active and ancestor pid namespaces.
+
+ This system call is meant to be used when restarting an application
+ from a checkpoint. Such restart requires that the processes in the
+ application have the same pids they had when the application was
+ checkpointed. When containers are nested, the processes within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ The @flags_low parameter is identical to the 'clone_flags' parameter
+ in existing clone() system call.
+
+ The fields in 'struct clone_args' are meant to be used as follows:
+
+ u64 clone_flags_high:
+
+ When eclone() supports more than 32 flags, the additional bits
+ in the clone_flags should be specified in this field. This
+ field is currently unused and must be set to 0.
+
+ u64 child_stack;
+ u64 child_stack_size;
+
+ These two fields correspond to the 'child_stack' fields in
+ clone() and clone2() (on IA64) system calls. The usage of
+ these two fields depends on the processor architecture.
+
+ Most architectures use ->child_stack to pass-in a stack-pointer
+ itself and don't need the ->child_stack_size field. On these
+ architectures the ->child_stack_size field must be 0.
+
+ Some architectures, eg IA64, use ->child_stack to pass-in the
+ base of the region allocated for stack. These architectures
+ must pass in the size of the stack-region in ->child_stack_size.
+
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+
+ These two fields correspond to the 'parent_tid_ptr' and
+ 'child_tid_ptr' fields in the clone() system call
+
+ u32 nr_pids;
+
+ nr_pids specifies the number of pids in the @pids array
+ parameter to eclone() (see below). nr_pids should not exceed
+ the current nesting level of the calling process (i.e if the
+ process is in init_pid_ns, nr_pids must be 1, if process is
+ in a pid namespace that is a child of init-pid-ns, nr_pids
+ cannot exceed 2, and so on).
+
+ u32 reserved0;
+
+ This field is intended to extend the functionality of
+ eclone() in the future, while preserving backward compatibility.
+ It must be set to 0 for now.
+
+ The @cargs_size parameter specifies the sizeof(struct clone_args) and
+ is intended to enable extending this structure in the future, while
+ preserving backward compatibility. For now, this field must be set
+ to the sizeof(struct clone_args) and this size must match the kernel's
+ view of the structure.
+
+ The @pids parameter defines the set of pids that should be assigned to
+ the child process in its active and ancestor pid namespaces. The
+ descendant pid namespaces do not matter since a process does not have a
+ pid in descendant namespaces, unless the process is in a new pid
+ namespace in which case the process is a container-init (and must have
+ the pid 1 in that namespace).
+
+ See CLONE_NEWPID section of clone(2) man page for details about pid
+ namespaces.
+
+ If a pid in the @pids list is 0, the kernel will assign the next
+ available pid in the pid namespace.
+
+ If a pid in the @pids list is non-zero, the kernel tries to assign
+ the specified pid in that namespace. If that pid is already in use
+ by another process, the system call fails (see EBUSY below).
+
+ The order of pids in @pids is oldest in pids[0] to youngest pid
+ namespace in pids[nr_pids-1]. If the number of pids specified in the
+ @pids list is fewer than the nesting level of the process, the pids
+ are applied from youngest namespace. i.e if the process is nested in
+ a level-6 pid namespace and @pids only specifies 3 pids, the 3 pids
+ are applied to levels 6, 5 and 4. Levels 0 through 3 are assumed to
+ have a pid of '0' (the kernel will assign a pid in those namespaces).
+
+ On success, the system call returns the pid of the child process in
+ the parent's active pid namespace.
+
+ On failure, eclone() returns -1 and sets 'errno' to one of the following
+ values (the child process is not created).
+
+ EPERM Caller does not have the CAP_SYS_ADMIN privilege needed to
+ specify the pids in this call (if pids are not specified
+ CAP_SYS_ADMIN is not required).
+
+ EINVAL The number of pids specified in 'clone_args.nr_pids' exceeds
+ the current nesting level of parent process
+
+ EINVAL Not all specified clone-flags are valid.
+
+ EINVAL The reserved fields in the clone_args argument are not 0.
+
+ EINVAL The child_stack_size field is not 0 (on architectures that
+ pass in a stack pointer in ->child_stack field)
+
+ EBUSY A requested pid is in use by another process in that namespace.
+
+---
+/*
+ * Example eclone() usage - Create a child process with pid CHILD_TID1 in
+ * the current pid namespace. The child gets the usual "random" pid in any
+ * ancestor pid namespaces.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include <unistd.h>
+#include <wait.h>
+#include <sys/syscall.h>
+
+#define __NR_eclone 337
+#define CLONE_NEWPID 0x20000000
+#define CLONE_CHILD_SETTID 0x01000000
+#define CLONE_PARENT_SETTID 0x00100000
+#define CLONE_UNUSED 0x00001000
+
+#define STACKSIZE 8192
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+/*
+ * Userspace declaration of the argument block passed to eclone().  The
+ * meaning of each field is described in the text earlier in this document;
+ * clone_flags_high and reserved0 must be 0.
+ */
+struct clone_args {
+ u64 clone_flags_high;
+ u64 child_stack;
+ u64 child_stack_size;
+
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+
+ u32 nr_pids;
+
+ u32 reserved0;
+};
+
+#define exit _exit
+
+/*
+ * Following eclone() is based on code posted by Oren Laadan at:
+ * https://lists.linux-foundation.org/pipermail/containers/2009-June/018463.html
+ */
+#if defined(__i386__) && defined(__NR_eclone)
+
+/*
+ * Raw i386 wrapper around the eclone system call.
+ *
+ * Loads the four arguments into ebx/ecx/edx/edi and issues int $0x80 with
+ * __NR_eclone in eax.  When the system call returns non-zero, execution
+ * jumps to label 1, restores ebp and converts a negative result into the
+ * usual "-1 with errno set" convention.  When it returns zero (the child),
+ * a function pointer is popped off the stack and called, after which the
+ * exit system call is issued; the child never returns from this function.
+ *
+ * NOTE(review): the asm statement declares no clobbers although it
+ * overwrites ebx, ecx, edx, edi and esi -- confirm this is acceptable for
+ * example code.
+ */
+int eclone(u32 flags_low, struct clone_args *clone_args, int args_size,
+ int *pids)
+{
+ long retval;
+
+ __asm__ __volatile__(
+ "movl %3, %%ebx\n\t" /* flags_low -> 1st (ebx) */
+ "movl %4, %%ecx\n\t" /* clone_args -> 2nd (ecx)*/
+ "movl %5, %%edx\n\t" /* args_size -> 3rd (edx) */
+ "movl %6, %%edi\n\t" /* pids -> 4th (edi)*/
+
+ "pushl %%ebp\n\t" /* save value of ebp */
+ "int $0x80\n\t" /* Linux/i386 system call */
+ "testl %0,%0\n\t" /* check return value */
+ "jne 1f\n\t" /* jump if parent */
+
+ "popl %%esi\n\t" /* get subthread function */
+ "call *%%esi\n\t" /* start subthread function */
+ "movl %2,%0\n\t" /* eax = __NR_exit */
+ "int $0x80\n" /* exit system call: exit subthread */
+ "1:\n\t"
+ "popl %%ebp\t" /* restore parent's ebp */
+
+ :"=a" (retval)
+
+ :"0" (__NR_eclone),
+ "i" (__NR_exit),
+ "m" (flags_low),
+ "m" (clone_args),
+ "m" (args_size),
+ "m" (pids)
+ );
+
+ /* negative return value is -errno: translate to the libc convention */
+ if (retval < 0) {
+ errno = -retval;
+ retval = -1;
+ }
+ return retval;
+}
+
+/*
+ * Allocate a stack for the clone-child and arrange to have the child
+ * execute @child_fn with @child_arg as the argument.
+ */
+void *setup_stack(int (*child_fn)(void *), void *child_arg, int size)
+{
+	char *region;
+	void **sp;
+
+	region = malloc(size + size);
+	if (region == NULL) {
+		perror("malloc()");
+		exit(1);
+	}
+
+	/*
+	 * Start (size - 4) bytes into the region and lay out two slots
+	 * below that point: the function at the returned address, its
+	 * argument in the slot right above it.
+	 */
+	sp = (void **)(region + (size - 4));
+	sp[-1] = child_arg;
+	sp[-2] = child_fn;
+
+	return sp - 2;
+}
+#endif
+
+/* gettid() is a bit more useful than getpid() when messing with clone() */
+int gettid()
+{
+	int tid = syscall(__NR_gettid, 0, 0, 0);
+
+	/* a failing gettid is fatal for this example: report and bail out */
+	if (tid < 0) {
+		printf("rc %d, errno %d\n", tid, errno);
+		exit(1);
+	}
+
+	return tid;
+}
+
+#define CHILD_TID1 377
+#define CHILD_TID2 1177
+#define CHILD_TID3 2799
+
+struct clone_args clone_args;
+void *child_arg = &clone_args;
+int child_tid;
+
+/*
+ * Body of the cloned child: checks that it received the expected argument
+ * pointer and that the tid stored through child_tid_ptr is CHILD_TID1,
+ * sleeps for two seconds and exits with status 0 (status 1 on a mismatch).
+ */
+int do_child(void *arg)
+{
+	struct clone_args *args = arg;
+	int *tid_slot;
+	int ctid;
+
+	/* Verify we pushed the arguments correctly on the stack... */
+	if (arg != child_arg) {
+		printf("Child: Incorrect child arg pointer, expected %p,"
+			"actual %p\n", child_arg, arg);
+		exit(1);
+	}
+
+	/* ... and that we got the thread-id we expected */
+	tid_slot = (int *)(unsigned long)args->child_tid_ptr;
+	ctid = *tid_slot;
+	if (ctid != CHILD_TID1) {
+		printf("Child: Incorrect child tid, expected %d, actual %d\n",
+			CHILD_TID1, ctid);
+		exit(1);
+	}
+	printf("Child got the expected tid, %d\n", gettid());
+
+	sleep(2);
+
+	printf("[%d, %d]: Child exiting\n", getpid(), ctid);
+	exit(0);
+}
+
+/*
+ * Prepare a child stack carrying @child_fn and @child_arg, fill in the
+ * global clone_args (stack, child tid pointer, @nr_pids; everything else
+ * zero) and call eclone() with @flags_low and @pids_list.  Prints and
+ * returns the eclone() result.
+ */
+static int do_clone(int (*child_fn)(void *), void *child_arg,
+		unsigned int flags_low, int nr_pids, pid_t *pids_list)
+{
+	struct clone_args *ca = &clone_args;
+	void *child_sp;
+	int ret;
+
+	child_sp = setup_stack(child_fn, child_arg, STACKSIZE);
+
+	memset(ca, 0, sizeof(*ca));
+	ca->child_stack = (u64)(unsigned long)child_sp;
+	ca->child_stack_size = (u64)0;
+	ca->child_tid_ptr = (u64)(unsigned long)&child_tid;
+	ca->nr_pids = nr_pids;
+
+	ret = eclone(flags_low, ca, sizeof(struct clone_args), pids_list);
+
+	printf("[%d, %d]: eclone() returned %d, error %d\n", getpid(), gettid(),
+		ret, errno);
+	return ret;
+}
+
+/*
+ * Multiple pid_t values in pids_list[] here are just for illustration.
+ * The test case creates a child in the current pid namespace and uses only
+ * the first value, CHILD_TID1.
+ */
+pid_t pids_list[] = { CHILD_TID1, CHILD_TID2, CHILD_TID3 };
+/*
+ * Create one child with SIGCHLD|CLONE_CHILD_SETTID and a single requested
+ * pid, wait for it and print how it terminated.  Always returns 0.
+ */
+int main()
+{
+	unsigned long flags = SIGCHLD|CLONE_CHILD_SETTID;
+	int nr_pids = 1;
+	int status;
+	int pid;
+	int rc;
+
+	pid = do_clone(do_child, &clone_args, flags, nr_pids, pids_list);
+
+	printf("[%d, %d]: Parent waiting for %d\n", getpid(), gettid(), pid);
+
+	rc = waitpid(pid, &status, __WALL);
+	if (rc < 0) {
+		printf("waitpid(): rc %d, error %d\n", rc, errno);
+		return 0;
+	}
+
+	printf("[%d, %d]: child %d:\n\t wait-status 0x%x\n", getpid(),
+		gettid(), rc, status);
+
+	if (WIFEXITED(status))
+		printf("\t EXITED, %d\n", WEXITSTATUS(status));
+	else if (WIFSIGNALED(status))
+		printf("\t SIGNALED, %d\n", WTERMSIG(status));
+
+	return 0;
+}
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH v21 020/100] c/r: documentation
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (6 preceding siblings ...)
2010-05-01 14:14 ` [PATCH v21 011/100] eclone (11/11): Document sys_eclone Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-06 20:27 ` Randy Dunlap
2010-05-01 14:15 ` [PATCH v21 021/100] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
8 siblings, 1 reply; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
Covers application checkpoint/restart, overall design, interfaces,
usage, shared objects, and checkpoint image format.
Changelog[v19-rc1]:
- Update documentation and examples for new syscalls API
- [Liu Alexander] Fix typos
- [Serge Hallyn] Update checkpoint image format
Changelog[v16]:
- Update documentation
- Unify into readme.txt and usage.txt
Changelog[v14]:
- Discard the 'h.parent' field
- New image format (shared objects appear before they are referenced
unless they are compound)
Changelog[v8]:
- Split into multiple files in Documentation/checkpoint/...
- Extend documentation, fix typos and comments from feedback
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
Documentation/checkpoint/checkpoint.c | 38 +++
Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
Documentation/checkpoint/self_checkpoint.c | 69 +++++
Documentation/checkpoint/self_restart.c | 40 +++
Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
5 files changed, 764 insertions(+), 0 deletions(-)
create mode 100644 Documentation/checkpoint/checkpoint.c
create mode 100644 Documentation/checkpoint/readme.txt
create mode 100644 Documentation/checkpoint/self_checkpoint.c
create mode 100644 Documentation/checkpoint/self_restart.c
create mode 100644 Documentation/checkpoint/usage.txt
diff --git a/Documentation/checkpoint/checkpoint.c b/Documentation/checkpoint/checkpoint.c
new file mode 100644
index 0000000..8560f30
--- /dev/null
+++ b/Documentation/checkpoint/checkpoint.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+/* Wrapper for the checkpoint syscall; no log file is used, so CHECKPOINT_FD_NONE is passed as the 4th (logfd) argument. */
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_checkpoint, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+/* Checkpoint the subtree rooted at the pid given in argv[1], writing the image to stdout; exits 0 on success. */
+int main(int argc, char *argv[])
+{
+ pid_t pid;
+ int ret;
+ /* stdout carries the checkpoint image, so every message goes to stderr */
+ if (argc != 2) {
+ fprintf(stderr, "usage: ckpt PID\n");
+ exit(1);
+ }
+
+ pid = atoi(argv[1]);
+ if (pid <= 0) {
+ fprintf(stderr, "invalid pid\n");
+ exit(1);
+ }
+
+ ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+
+ if (ret < 0)
+ perror("checkpoint");
+ else
+ fprintf(stderr, "checkpoint id %d\n", ret);
+
+ return (ret > 0 ? 0 : 1);
+}
diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
new file mode 100644
index 0000000..4fa5560
--- /dev/null
+++ b/Documentation/checkpoint/readme.txt
@@ -0,0 +1,370 @@
+
+ Checkpoint-Restart support in the Linux kernel
+ ==========================================================
+
+Copyright (C) 2008-2010 Oren Laadan
+
+Author: Oren Laadan <orenl@cs.columbia.edu>
+
+License: The GNU Free Documentation License, Version 1.2
+ (dual licensed under the GPL v2)
+
+Contributors: Oren Laadan <orenl@cs.columbia.edu>
+ Serge Hallyn <serue@us.ibm.com>
+ Dan Smith <danms@us.ibm.com>
+ Matt Helsley <matthltc@us.ibm.com>
+ Nathan Lynch <ntl@pobox.com>
+ Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
+ Dave Hansen <dave@linux.vnet.ibm.com>
+
+
+Introduction
+============
+
+Application checkpoint/restart [C/R] is the ability to save the state
+of a running application so that it can later resume its execution
+from the time at which it was checkpointed. An application can be
+migrated by checkpointing it on one machine and restarting it on
+another. C/R can provide many potential benefits:
+
+* Failure recovery: by rolling back to a previous checkpoint
+
+* Improved response time: by restarting applications from checkpoints
+ instead of from scratch.
+
+* Improved system utilization: by suspending long running CPU
+ intensive jobs and resuming them when load decreases.
+
+* Fault resilience: by migrating applications off faulty hosts.
+
+* Dynamic load balancing: by migrating applications to less loaded
+ hosts.
+
+* Improved service availability and administration: by migrating
+ applications before host maintenance so that they continue to run
+ with minimal downtime
+
+* Time-travel: by taking periodic checkpoints and restarting from
+ any previous checkpoint.
+
+Compared to hypervisor approaches, application C/R is more lightweight
+since it need only save the state associated with applications, while
+operating system data structures (e.g. buffer cache, drivers state
+and the like) are uninteresting.
+
+
+Overall design
+==============
+
+Checkpoint and restart are done in the kernel as much as possible.
+Two new system calls are introduced to provide C/R: sys_checkpoint()
+and sys_restart(). They both operate on a process tree (hierarchy),
+either a whole container or a subtree of a container.
+
+Checkpointing entire containers ensures that there are no dependencies
+on anything outside the container, which guarantees that a matching
+restart will succeed (assuming that the file system state remains
+consistent). However, it requires that users will always run the tasks
+that they wish to checkpoint inside containers. This is ideal for,
+e.g., private virtual servers and the like.
+
+In contrast, when checkpointing a subtree of a container it is up to
+the user to ensure that dependencies either don't exist or can be
+safely ignored. This is useful, for instance, for HPC scenarios or
+even a user that would like to periodically checkpoint a long-running
+batch job.
+
+An additional system call, a la madvise(), is planned, so that tasks
+can advise the kernel how to handle specific resources. For instance,
+a task could ask to skip a memory area at checkpoint to save space,
+or to use a preset file descriptor at restart instead of restoring it
+from the checkpoint image. It will provide the flexibility that is
+particularly useful to address the needs of a diverse crowd of users
+and use-cases.
+
+Syscall sys_checkpoint() is given a pid that indicates the top of the
+hierarchy, a file descriptor to store the image, and flags. The code
+serializes internal user- and kernel-state and writes it out to the
+file descriptor. The resulting image is stream-able. The processes are
+expected to be frozen for the duration of the checkpoint.
+
+In general, a checkpoint consists of 5 steps:
+1. Pre-dump
+2. Freeze the container/subtree
+3. Save tasks' and kernel state <-- sys_checkpoint()
+4. Thaw (or kill) the container/subtree
+5. Post-dump
+
+Step 3 is done by calling sys_checkpoint(). Steps 1 and 5 are an
+optimization to reduce application downtime. In particular, "pre-dump"
+works before freezing the container, e.g. the pre-copy for live
+migration, and "post-dump" works after the container resumes
+execution, e.g. write-back the data to secondary storage.
+
+The kernel exports a relatively opaque 'blob' of data to userspace
+which can then be handed to the new kernel at restart time. The
+'blob' contains data and state of select portions of kernel structures
+such as VMAs and mm_structs, as well as copies of the actual memory
+that the tasks use. Any changes in this blob's format between kernel
+revisions can be handled by an in-userspace conversion program.
+
+To restart, userspace first creates a process hierarchy that matches
+that of the checkpoint, and each task calls sys_restart(). The syscall
+reads the saved kernel state from a file descriptor, and re-creates
+the resources that the tasks need to resume execution. The restart
+code is executed by each task that is restored in the new hierarchy to
+reconstruct its own state.
+
+In general, a restart consists of 3 steps:
+1. Create hierarchy
+2. Restore tasks' and kernel state <-- sys_restart()
+3. Resume userspace (or freeze tasks)
+
+Because the process hierarchy is created in userspace during restart,
+the restarting tasks have the flexibility to prepare before calling
+sys_restart().
+
+
+Checkpoint image format
+=======================
+
+The checkpoint image format is built of records that consist of a
+pre-header identifying its contents, followed by a payload. This
+format allows userspace tools to easily parse and skip through the
+image without requiring intimate knowledge of the data. It will also
+be handy to enable parallel checkpointing in the future where multiple
+threads interleave data from multiple processes into a single stream.
+
+The pre-header is defined by 'struct ckpt_hdr' as follows: @type
+identifies the type of the payload, @len tells its length in bytes
+including the pre-header.
+
+struct ckpt_hdr {
+ __s32 type;
+ __s32 len;
+};
+
+The pre-header must be the first component in all other headers. For
+instance, the task data is saved in 'struct ckpt_hdr_task', which
+looks something like this:
+
+struct ckpt_hdr_task {
+ struct ckpt_hdr h;
+ __u32 pid;
+ ...
+};
+
+THE IMAGE FORMAT IS EXPECTED TO CHANGE over time as more features are
+supported, or as existing features change in the kernel and require
+their representation to be adjusted. Any such changes will be handled by
+in-userspace conversion tools.
+
+The general format of the checkpoint image is as follows:
+* Image header
+* Container configuration
+* Task hierarchy
+* Tasks' state
+* Image trailer
+
+The image always begins with a general header that holds a magic
+number, an architecture identifier (little endian format), a format
+version number (@rev), followed by information about the kernel
+(currently version and UTS data). It also holds the time of the
+checkpoint and the flags given to sys_checkpoint(). This header is
+followed by an arch-specific header.
+
+The container configuration section contains information that is
+global to the container. Security (LSM) configuration is one example.
+Network configuration and container-wide mounts may also go here, so
+that the userspace restart coordinator can re-create a suitable
+environment.
+
+The task hierarchy comes next so that userspace tools can read it
+early (even from a stream) and re-create the restarting tasks. This is
+basically an array of all checkpointed tasks, and their relationships
+(parent, siblings, threads, etc).
+
+Then the state of all tasks is saved, in the order that they appear in
+the tasks array above. For each state, we save data like task_struct,
+namespaces, open files, memory layout, memory contents, cpu state,
+signals and signal handlers, etc. For resources that are shared among
+multiple processes, we first checkpoint said resource (and only once),
+and in the task data we give a reference to it. More about shared
+resources below.
+
+Finally, the image always ends with a trailer that holds a (different)
+magic number, serving for sanity check.
+
+
+Shared objects
+==============
+
+Many resources may be shared by multiple tasks (e.g. file descriptors,
+memory address space, etc), or even have multiple references from
+other resources (e.g. a single inode that represents two ends of a
+pipe).
+
+Shared objects are tracked using a hash table (objhash) to ensure that
+they are only checkpointed or restored once. To handle a shared
+object, it is first looked up in the hash table, to determine if it is
+the first encounter or a recurring appearance. The hash table itself
+is not saved as part of the checkpoint image: it is constructed
+dynamically during both checkpoint and restart, and discarded at the
+end of the operation.
+
+During checkpoint, when a shared object is encountered for the first
+time, it is inserted to the hash table, indexed by its kernel address.
+It is assigned an identifier (@objref) in order of appearance, and
+then its state is saved. Subsequent lookups of that object in the hash
+will yield that entry, in which case only the @objref is saved, as
+opposed to the entire state of the object.
+
+During restart, shared objects are indexed by their @objref as given
+during the checkpoint. On the first appearance of each shared object,
+a new resource will be created and its state restored from the image.
+Then the object is added to the hash table. Subsequent lookups of the
+same unique identifier in the hash table will yield that entry, and
+then the existing object instance is reused instead of creating
+a new one.
+
+The hash grabs a reference to each object that is inserted, and
+maintains this reference for the entire lifetime of the hash. Thus,
+it is always safe to reference an object that is stored in the hash.
+The hash is "one-way" in the sense that objects that are added are
+never deleted from the hash until the hash is discarded. This, in
+turn, happens only when the checkpoint (or restart) terminates.
+
+Shared objects are thus saved when they are first seen, and _before_
+the parent object that uses them. Therefore by the time the parent
+object needs them, they should already be in the objhash. The one
+exception is when more than a single shared resource will be restarted
+at once (e.g. like the two ends of a pipe, or all the namespaces in an
+nsproxy). In this case the parent object is dumped first followed by
+the individual sub-resources.
+
+The checkpoint image is stream-able, meaning that restarting from it
+may not require lseek(). This is enforced at checkpoint time, by
+carefully selecting the order of shared objects, to respect the rule
+that an object is always saved before the objects that refer to it.
+
+
+Memory contents format
+======================
+
+The memory contents of a given memory address space (->mm) is dumped
+as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
+This header details the vma properties, and a reference to a file
+(if file backed) or an inode (or shared memory) object.
+
+The vma header is followed by the actual contents - but only those
+pages that need to be saved, i.e. dirty pages. They are written in
+chunks of data, where each chunk contains a header that indicates
+the number of pages in the chunk, followed by an array of virtual
+addresses and then an array of actual page contents. The last chunk
+holds zero pages.
+
+To illustrate this, consider a single simple task with two vmas: one
+is file mapped with two dumped pages, and the other is anonymous with
+three dumped pages. The memory dump will look like this:
+
+ ckpt_hdr + ckpt_hdr_vma
+ ckpt_hdr_pgarr (nr_pages = 2)
+ addr1, addr2
+ page1, page2
+ ckpt_hdr_pgarr (nr_pages = 0)
+ ckpt_hdr + ckpt_hdr_vma
+ ckpt_hdr_pgarr (nr_pages = 3)
+ addr3, addr4, addr5
+ page3, page4, page5
+ ckpt_hdr_pgarr (nr_pages = 0)
+
+
+Error handling
+==============
+
+Both checkpoint and restart operations may fail due to a variety of
+reasons. Using a simple, single return value from the system call is
+insufficient to report the reason of a failure.
+
+Instead, both sys_checkpoint() and sys_restart() accept an additional
+argument - a file descriptor to which the kernel writes diagnostic
+and debugging information. Both the checkpoint and restart userspace
+utilities have options to specify a filename to store this log.
+
+In addition, checkpoint provides informative status report upon
+failure in the checkpoint image in the form of (one or more) error
+objects, 'struct ckpt_hdr_err'. An error object consists of a
+mandatory pre-header followed by a null character ('\0'), and then a
+string that describes the error. By default, if an error occurs, this
+will be the last object written to the checkpoint image.
+
+Upon failure, the caller can examine the image (e.g. with 'ckptinfo')
+and extract the detailed error message. The leading '\0' is useful if
+one wants to seek back from the end of the checkpoint image, instead
+of parsing the entire image separately.
+
+
+Security
+========
+
+The main question is whether sys_checkpoint() and sys_restart()
+require privileged or unprivileged operation.
+
+Early versions checked capable(CAP_SYS_ADMIN) assuming that we would
+attempt to remove the need for privilege, so that all users could
+safely use it. Arnd Bergmann pointed out that it'd make more sense to
+let unprivileged users use them now, so that we'll be more careful
+about the security as patches roll in.
+
+Checkpoint: the main concern is whether a task that performs the
+checkpoint of another task has sufficient privileges to access its
+state. We address this by requiring that the checkpointer task will be
+able to ptrace the target task, by means of ptrace_may_access() with
+access mode.
+
+Restart: the main concern is that we may allow an unprivileged user to
+feed the kernel with random data. To this end, the restart works in a
+way that does not skip the usual security checks. Task credentials,
+i.e. euid, reuid, and LSM security contexts currently come from the
+caller, not the checkpoint image. As credentials are restored too,
+the ability of a task that calls sys_restart() to setresuid/setresgid
+to those values must be checked.
+
+Keeping the restart procedure to operate within the limits of the
+caller's credentials means that there are various scenarios that cannot
+be supported. For instance, a setuid program that opened a protected
+log file and then dropped privileges will fail the restart, because
+the user won't have enough credentials to reopen the file. In these
+cases, we should probably treat restarting like inserting a kernel
+module: surely the user can cause havoc by providing incorrect data,
+but then again we must trust the root account.
+
+So that's why we don't want CAP_SYS_ADMIN required up-front. That way
+we will be forced to more carefully review each of those features.
+However, this can be controlled with a sysctl-variable.
+
+
+Kernel interfaces
+=================
+
+* To checkpoint a vma, the 'struct vm_operations_struct' needs to
+ provide a method ->checkpoint:
+ int checkpoint(struct ckpt_ctx *, struct vma_struct *)
+ Restart requires a matching (exported) restore:
+ int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
+
+* To checkpoint a file, the 'struct file_operations' needs to provide
+ the methods ->checkpoint and ->collect:
+ int checkpoint(struct ckpt_ctx *, struct file *)
+ int collect(struct ckpt_ctx *, struct file *)
+ Restart requires a matching (exported) restore:
+ int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
+ For most file systems, generic_file_{checkpoint,restore}() can be
+ used.
+
+* To checkpoint a socket, the 'struct proto_ops' needs to provide
+ the methods ->checkpoint, ->collect and ->restore:
+ int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+ int collect(struct ckpt_ctx *ctx, struct socket *sock);
+ int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
+
diff --git a/Documentation/checkpoint/self_checkpoint.c b/Documentation/checkpoint/self_checkpoint.c
new file mode 100644
index 0000000..27dba0d
--- /dev/null
+++ b/Documentation/checkpoint/self_checkpoint.c
@@ -0,0 +1,69 @@
+/*
+ * self_checkpoint.c: demonstrate self-checkpoint
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+/* Invoke the checkpoint syscall directly; CHECKPOINT_FD_NONE is passed in the 4th (logfd) position. */
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_checkpoint, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+#define OUTFILE "/tmp/cr-self.out"
+/* Append a counter to OUTFILE once a second and checkpoint this process to stdout when i == 2. */
+int main(int argc, char *argv[])
+{
+ pid_t pid = getpid();
+ FILE *file;
+ int i, ret;
+ /* release fds 0 and 2; fopen() below presumably lands on fd 0 (lowest free fd), which dup2(0, 2) relies on - TODO confirm */
+ close(0);
+ close(2);
+
+ unlink(OUTFILE);
+ file = fopen(OUTFILE, "w+");
+ if (!file) {
+ perror("open");
+ exit(1);
+ }
+ if (dup2(0, 2) < 0) {
+ perror("dup2");
+ exit(1);
+ }
+
+ fprintf(file, "hello, world!\n");
+ fflush(file);
+
+ for (i = 0; i < 1000; i++) {
+ sleep(1);
+ fprintf(file, "count %d\n", i);
+ fflush(file);
+ /* only the iteration with i == 2 takes a checkpoint; all others just count */
+ if (i != 2)
+ continue;
+ ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+ if (ret < 0) {
+ fprintf(file, "ckpt: %s\n", strerror(errno));
+ exit(2);
+ }
+
+ fprintf(file, "checkpoint ret: %d\n", ret);
+ fflush(file);
+ }
+
+ return 0;
+}
diff --git a/Documentation/checkpoint/self_restart.c b/Documentation/checkpoint/self_restart.c
new file mode 100644
index 0000000..647ce51
--- /dev/null
+++ b/Documentation/checkpoint/self_restart.c
@@ -0,0 +1,40 @@
+/*
+ * self_restart.c: demonstrate self-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define _GNU_SOURCE /* or _BSD_SOURCE or _SVID_SOURCE */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+/* Invoke the restart syscall directly; CHECKPOINT_FD_NONE is passed in the 4th (logfd) position. */
+static inline int restart(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_restart, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+/* Restart this process from a self-checkpoint image read from stdin (RESTART_TASKSELF). */
+int main(int argc, char *argv[])
+{
+ pid_t pid = getpid();
+ int ret;
+
+ ret = restart(pid, STDIN_FILENO, RESTART_TASKSELF);
+ if (ret < 0)
+ perror("restart");
+ /* like execve(), restart() only comes back here when it did not take effect */
+ printf("should not reach here !\n");
+
+ return 1;
+}
diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
new file mode 100644
index 0000000..c6fc045
--- /dev/null
+++ b/Documentation/checkpoint/usage.txt
@@ -0,0 +1,247 @@
+
+ How to use Checkpoint-Restart
+ =========================================
+
+
+API
+===
+
+The API consists of three new system calls:
+
+* long checkpoint(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Checkpoint a (sub-)container whose root task is identified by @pid,
+ to the open file indicated by @fd. If @logfd isn't -1, it indicates
+ an open file to which error and debug messages are written. @flags
+ may be one or more of:
+ - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
+ (other values are not allowed).
+
+ Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
+ it returns from a restart, and -1 if an error occurs. The ckptid will
+ uniquely identify a checkpoint image, for as long as the checkpoint
+ is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
+ partial checkpoint, residing in kernel memory).
+
+* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Restart a process hierarchy from a checkpoint image that is read from
+ the blob stored in the file indicated by @fd. If @logfd isn't -1, it
+ indicates an open file to which error and debug messages are written.
+ @flags will have future meaning (must be 0 for now). @pid indicates
+ the root of the hierarchy as seen in the coordinator's pid-namespace,
+ and is expected to be a child of the coordinator. @flags may be one
+ or more of:
+ - RESTART_TASKSELF : (self) restart of a single process
+ - RESTART_FROZEN : processes remain frozen once restart completes
+ - RESTART_GHOST : process is a ghost (placeholder for a pid)
+ (Note that this argument may mean 'ckptid' to identify an in-kernel
+ checkpoint image, with some @flags in the future).
+
+ Returns: -1 if an error occurs, 0 on success when restarting from a
+ "self" checkpoint, and return value of system call at the time of the
+ checkpoint when restarting from an "external" checkpoint.
+
+ (If a process was frozen for checkpoint while in userspace, it will
+ resume running in userspace exactly where it was interrupted. If it
+ was frozen while in kernel doing a syscall, it will return what the
+ syscall returned when interrupted/completed, and proceed from there
+ as if it had only been frozen and then thawed. Finally, if it did a
+ self-checkpoint, it will resume to the first instruction after the
+ call to checkpoint(2), having returned 0, to indicate whether the
+ return is from the checkpoint or a restart).
+
+* int clone_with_pids(unsigned long clone_flags, void *news,
+ int *parent_tidptr, int *child_tidptr,
+ struct target_pid_set *pid_set)
+
+ struct target_pid_set {
+ int num_pids;
+ pid_t *target_pids;
+ }
+
+ Container restart requires that a task have the same pid it had when
+ it was checkpointed. When containers are nested the tasks within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ clone_with_pids(), intended for use during restart, is similar to
+ clone(), except that it takes a 'target_pid_set' parameter. This
+ parameter lets caller choose specific pid numbers for the child
+ process, in the process's active and ancestor pid namespaces.
+
+ Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for
+ now, to prevent unprivileged processes from misusing this interface.
+
+ If a target-pid is 0, the kernel continues to assign a pid for the
+ process in that namespace. If a requested pid is taken, the system
+ call fails with -EBUSY. If 'pid_set.num_pids' exceeds the current
+ nesting level of pid namespaces, the system call fails with -EINVAL.
+
+
+Sysctl/proc
+===========
+
+/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
+ controls whether c/r operation is allowed for unprivileged users
+
+
+Operation
+=========
+
+The granularity of a checkpoint usually is a process hierarchy. The
+'pid' argument is interpreted in the caller's pid namespace. So to
+checkpoint a container whose init task (pid 1 in that pidns) appears
+as pid 3497 in the caller's pidns, the caller must use pid 3497. Passing
+pid 1 will attempt to checkpoint the caller's container, and if the
+caller isn't privileged and init is owned by root, it will fail.
+
+Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
+which does not refer to a container's init task, then sys_checkpoint()
+would return -EINVAL.
+
+We assume that during checkpoint and restart the container state is
+quiescent. During checkpoint, this means that all affected tasks are
+frozen (or otherwise stopped). During restart, this means that all
+affected tasks are executing the sys_restart() call. In both cases, if
+there are other tasks possibly sharing state with the container, they
+must not modify it during the operation. It is the responsibility of
+the caller to follow this requirement.
+
+If the assumption that all tasks are frozen and that there is no other
+sharing doesn't hold - then the results of the operation are undefined
+(just as, e.g. not calling execve() immediately after vfork() produces
+undefined results). In particular, either checkpoint will fail, or it
+may produce a checkpoint image that can't be restarted, or (unlikely)
+the restart may produce a container whose state does not match that of
+the original container.
+
+
+User tools
+==========
+
+* checkpoint(1): a tool to perform a checkpoint of a container/subtree
+* restart(1): a tool to restart a container/subtree
+* ckptinfo: a tool to examine a checkpoint image
+
+It is best to use the dedicated user tools for checkpoint and restart.
+
+If you insist, then here is a code snippet that illustrates how a
+checkpoint is initiated by a process inside a container - the logic is
+similar to fork():
+ ...
+ ckptid = checkpoint(0, ...);
+ switch (ckptid) {
+ case -1:
+ perror("checkpoint failed");
+ break;
+ default:
+ fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ckptid);
+ /* proceed with execution after checkpoint */
+ ...
+ break;
+ case 0:
+ fprintf(stderr, "returned after restart\n");
+ /* proceed with action required following a restart */
+ ...
+ break;
+ }
+ ...
+
+And to initiate a restart, the process in an empty container can use
+logic similar to execve():
+ ...
+ if (restart(pid, ...) < 0)
+ perror("restart failed");
+ /* only get here if restart failed */
+ ...
+
+Note, that the code also supports "self" checkpoint, where a process
+can checkpoint itself. This mode does not capture the relationships of
+the task with other tasks, or any shared resources. It is useful for
+applications that wish to be able to save and restore their state.
+They will either not use (or care about) shared resources, or they
+will be aware of the operations and adapt suitably after a restart.
+The code above can also be used for "self" checkpoint.
+
+
+You may find the following sample programs useful:
+
+* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
+* self_checkpoint.c: a simple test program doing self-checkpoint
+* self_restart.c: restarts a (self-) checkpoint image from stdin
+
+See also the utilities 'checkpoint' and 'restart' (from user-cr).
+
+
+"External" checkpoint
+=====================
+
+To do "external" checkpoint, you need to first freeze that other task
+using the freezer cgroup.
+
+Restart does not preserve the original PID yet, (because we haven't
+solved yet the fork-with-specific-pid issue). In a real scenario, you
+probably want to first create a new namespace, and have the init
+task there call 'sys_restart()'.
+
+I tested it this way:
+ $ ./test &
+ [1] 3493
+
+ $ echo 3493 > /cgroup/0/tasks
+ $ echo FROZEN > /cgroup/0/freezer.state
+ $ ./checkpoint 3493 > ckpt.image
+
+ $ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+ $ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+ $ echo THAWED > /cgroup/0/freezer.state
+
+ $ ./self_restart < ckpt.image
+Now compare the output of the two output files.
+
+
+"Self" checkpoint
+=================
+
+To do self-checkpoint, you can incorporate the code from
+self_checkpoint.c into your application.
+
+Here is how to test the self-checkpoint:
+ $ ./self_checkpoint > self.image &
+ [1] 3512
+
+ $ sleep 3
+ $ mv /tmp/cr-self.out /tmp/cr-self.out.orig
+ $ cp /tmp/cr-self.out.orig /tmp/cr-self.out
+
+ $ cat /tmp/cr-self.out
+ hello, world!
+ count 0
+ count 1
+ count 2
+ checkpoint ret: 1
+ count 3
+ ...
+
+ $ sed -i 's/count/xxxxx/g' /tmp/cr-self.out
+
+ $ ./self_restart < self.image &
+
+Now compare the output of the two output files.
+ $ cat /tmp/cr-self.out
+ hello, world!
+ xxxxx 0
+ xxxxx 1
+ xxxxx 2
+ checkpoint ret: 0
+ count 3
+ ...
+
+
+Note how in test.c we close stdin, stdout, stderr - that's because
+currently we only support regular files (not ttys/ptys).
+
+If you check the output of ps, you'll see that "self_restart" changed
+its name to "test" or "self_checkpoint", as expected.
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH v21 020/100] c/r: documentation
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
@ 2010-05-06 20:27 ` Randy Dunlap
2010-05-07 6:54 ` Oren Laadan
0 siblings, 1 reply; 25+ messages in thread
From: Randy Dunlap @ 2010-05-06 20:27 UTC (permalink / raw)
To: Oren Laadan
Cc: Andrew Morton, containers, linux-kernel, Serge Hallyn,
Matt Helsley, Pavel Emelyanov, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
On Sat, 1 May 2010 10:15:02 -0400 Oren Laadan wrote:
> Covers application checkpoint/restart, overall design, interfaces,
> usage, shared objects, and and checkpoint image format.
>
> Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> Acked-by: Serge E. Hallyn <serue@us.ibm.com>
> Tested-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
> Documentation/checkpoint/checkpoint.c | 38 +++
> Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
> Documentation/checkpoint/self_checkpoint.c | 69 +++++
> Documentation/checkpoint/self_restart.c | 40 +++
> Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
> 5 files changed, 764 insertions(+), 0 deletions(-)
> create mode 100644 Documentation/checkpoint/checkpoint.c
> create mode 100644 Documentation/checkpoint/readme.txt
> create mode 100644 Documentation/checkpoint/self_checkpoint.c
> create mode 100644 Documentation/checkpoint/self_restart.c
> create mode 100644 Documentation/checkpoint/usage.txt
> diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
> new file mode 100644
> index 0000000..4fa5560
> --- /dev/null
> +++ b/Documentation/checkpoint/readme.txt
> @@ -0,0 +1,370 @@
> +
...
> +In contrast, when checkpointing a subtree of a container it is up to
> +the user to ensure that dependencies either don't exist or can be
> +safely ignored. This is useful, for instance, for HPC scenarios or
> +even a user that would like to periodically checkpoint a long-running
who
> +batch job.
> +
...
> +
> +Checkpoint image format
> +=======================
> +
...
> +
> +The container configuration section containers information that is
contains
> +global to the container. Security (LSM) configuration is one example.
> +Network configuration and container-wide mounts may also go here, so
> +that the userspace restart coordinator can re-create a suitable
> +environment.
> +
...
> +
> +Then the state of all tasks is saved, in the order that they appear in
> +the tasks array above. For each state, we save data like task_struct,
> +namespaces, open files, memory layout, memory contents, cpu state,
CPU (throughout, please)
> +signals and signal handlers, etc. For resources that are shared among
> +multiple processes, we first checkpoint said resource (and only once),
> +and in the task data we give a reference to it. More about shared
> +resources below.
> +
...
> +
> +Shared objects
> +==============
> +
> +Many resources may be shared by multiple tasks (e.g. file descriptors,
> +memory address space, etc), or even have multiple references from
etc.),
> +other resources (e.g. a single inode that represents two ends of a
> +pipe).
> +
...
> +Memory contents format
> +======================
> +
> +The memory contents of a given memory address space (->mm) is dumped
are (I think)
> +as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
> +This header details the vma properties, and a reference to a file
> +(if file backed) or an inode (or shared memory) object.
> +
> +The vma header is followed by the actual contents - but only those
> +pages that need to be saved, i.e. dirty pages. They are written in
> +chunks of data, where each chunks contains a header that indicates
chunk
> +that number of pages in the chunk, followed by an array of virtual
the
> +addresses and then an array of actual page contents. The last chunk
> +holds zero pages.
> +
...
> +Kernel interfaces
> +=================
> +
> +* To checkpoint a vma, the 'struct vm_operations_struct' needs to
> + provide a method ->checkpoint:
> + int checkpoint(struct ckpt_ctx *, struct vma_struct *)
> + Restart requires a matching (exported) restore:
> + int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
> +
> +* To checkpoint a file, the 'struct file_operations' needs to provide
> + the methods ->checkpoint and ->collect:
> + int checkpoint(struct ckpt_ctx *, struct file *)
> + int collect(struct ckpt_ctx *, struct file *)
> + Restart requires a matching (exported) restore:
> + int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
> + For most file systems, generic_file_{checkpoint,restore}() can be
> + used.
> +
> +* To checkpoint a socket, the 'struct proto_ops' needs to provide
To checkpoint/restart a socket,
> + the methods ->checkpoint, ->collect and ->restore:
> + int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
> + int collect(struct ckpt_ctx *ctx, struct socket *sock);
> + int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
> diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
> new file mode 100644
> index 0000000..c6fc045
> --- /dev/null
> +++ b/Documentation/checkpoint/usage.txt
> @@ -0,0 +1,247 @@
> +
> + How to use Checkpoint-Restart
> + =========================================
> +
> +
> +API
> +===
> +
> +The API consists of three new system calls:
> +
> +* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
flags,
> +
> + Checkpoint a (sub-)container whose root task is identified by @pid,
> + to the open file indicated by @fd. If @logfd isn't -1, it indicates
> + an open file to which error and debug messages are written. @flags
> + may be one or more of:
> + - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
> + (other value are not allowed).
> +
> + Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
> + it returns from a restart, and -1 if an error occurs. The ckptid will
> + uniquely identify a checkpoint image, for as long as the checkpoint
> + is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
> + partial checkpoint, residing in kernel memory).
> +
> +* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
> +
> + Restart a process hierarchy from a checkpoint image that is read from
> + the blob stored in the file indicated by @fd. If @logfd isn't -1, it
> + indicates an open file to which error and debug messages are written.
> + @flags will have future meaning (must be 0 for now). @pid indicates
> + the root of the hierarchy as seen in the coordinator's pid-namespace,
> + and is expected to be a child of the coordinator. @flags may be one
> + or more of:
> + - RESTART_TASKSELF : (self) restart of a single process
> + - RESTART_FROEZN : processes remain frozen once restart completes
FROZEN ?
> + - RESTART_GHOST : process is a ghost (placeholder for a pid)
about @flags: Above says both of these:
a) @flags will have future meaning (must be 0 for now)
b) @flags may be one or more of:
so please decide which one it is ;)
> + (Note that this argument may mean 'ckptid' to identify an in-kernel
> + checkpoint image, with some @flags in the future).
> +
> + Returns: -1 if an error occurs, 0 on success when restarting from a
> + "self" checkpoint, and return value of system call at the time of the
> + checkpoint when restarting from an "external" checkpoint.
> +
...
> +
> +Sysctl/proc
> +===========
> +
> +/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
> + controls whether c/r operation is allowed for unprivileged users
C/R
> +
> +
> +Operation
> +=========
> +
> +The granularity of a checkpoint usually is a process hierarchy. The
> +'pid' argument is interpreted in the caller's pid namespace. So to
> +checkpoint a container whose init task (pid 1 in that pidns) appears
> +as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
> +pid 1 will attempt to checkpoint the caller's container, and if the
> +caller isn't privileged and init is owned by root, it will fail.
> +
> +Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
> +which does not refer to a container's init task, then sys_checkpoint()
> +would return -EINVAL.
returns -EINVAL.
...
> +
> +
> +User tools
> +==========
> +
> +* checkpoint(1): a tool to perform a checkpoint of a container/subtree
> +* restart(1): a tool to restart a container/subtree
> +* ckptinfo: a tool to examine a checkpoint image
> +
> +It is best to use the dedicated user tools for checkpoint and restart.
> +
> +If you insist, then here is a code snippet that illustrates how a
> +checkpoint is initiated by a process inside a container - the logic is
> +similar to fork():
> + ...
> + ckptid = checkpoint(0, ...);
> + switch (crid) {
(ckptid) ?
> + case -1:
> + perror("checkpoint failed");
> + break;
> + default:
> + fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
s/ret/ckptid/ ?
> + /* proceed with execution after checkpoint */
> + ...
> + break;
> + case 0:
> + fprintf(stderr, "returned after restart\n");
> + /* proceed with action required following a restart */
> + ...
> + break;
> + }
> + ...
> +
> +And to initiate a restart, the process in an empty container can use
> +logic similar to execve():
> + ...
> + if (restart(pid, ...) < 0)
> + perror("restart failed");
> + /* only get here if restart failed */
> + ...
> +
> +Note, that the code also supports "self" checkpoint, where a process
Note that
> +can checkpoint itself. This mode does not capture the relationships of
> +the task with other tasks, or any shared resources. It is useful for
> +application that wish to be able to save and restore their state.
applications
> +They will either not use (or care about) shared resources, or they
> +will be aware of the operations and adapt suitably after a restart.
> +The code above can also be used for "self" checkpoint.
> +
> +
> +You may find the following sample programs useful:
> +
> +* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
checkpoints
> +* self_checkpoint.c: a simple test program doing self-checkpoint
> +* self_restart.c: restarts a (self-) checkpoint image from stdin
> +
> +See also the utilities 'checkpoint' and 'restart' (from user-cr).
> +
> +
> +"External" checkpoint
> +=====================
> +
> +To do "external" checkpoint, you need to first freeze that other task
> +either using the freezer cgroup.
eh? cannot parse that.
> +
> +Restart does not preserve the original PID yet, (because we haven't
> +solved yet the fork-with-specific-pid issue). In a real scenario, you
> +probably want to first create a new names space, and have the init
namespace,
> +task there call 'sys_restart()'.
> +
> +I tested it this way:
...
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH v21 020/100] c/r: documentation
2010-05-06 20:27 ` Randy Dunlap
@ 2010-05-07 6:54 ` Oren Laadan
0 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-07 6:54 UTC (permalink / raw)
To: Randy Dunlap
Cc: Andrew Morton, containers, linux-kernel, Serge Hallyn,
Matt Helsley, Pavel Emelyanov, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
Thanks for reading carefully through and pointing out
glitches and inconsistencies. I'll fix it for next post.
Oren.
On 05/06/2010 04:27 PM, Randy Dunlap wrote:
> On Sat, 1 May 2010 10:15:02 -0400 Oren Laadan wrote:
>
>> Covers application checkpoint/restart, overall design, interfaces,
>> usage, shared objects, and and checkpoint image format.
>>
>> Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
>> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
>> Acked-by: Serge E. Hallyn <serue@us.ibm.com>
>> Tested-by: Serge E. Hallyn <serue@us.ibm.com>
>> ---
>> Documentation/checkpoint/checkpoint.c | 38 +++
>> Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
>> Documentation/checkpoint/self_checkpoint.c | 69 +++++
>> Documentation/checkpoint/self_restart.c | 40 +++
>> Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
>> 5 files changed, 764 insertions(+), 0 deletions(-)
>> create mode 100644 Documentation/checkpoint/checkpoint.c
>> create mode 100644 Documentation/checkpoint/readme.txt
>> create mode 100644 Documentation/checkpoint/self_checkpoint.c
>> create mode 100644 Documentation/checkpoint/self_restart.c
>> create mode 100644 Documentation/checkpoint/usage.txt
>
>> diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
>> new file mode 100644
>> index 0000000..4fa5560
>> --- /dev/null
>> +++ b/Documentation/checkpoint/readme.txt
>> @@ -0,0 +1,370 @@
>> +
> ...
>> +In contrast, when checkpointing a subtree of a container it is up to
>> +the user to ensure that dependencies either don't exist or can be
>> +safely ignored. This is useful, for instance, for HPC scenarios or
>> +even a user that would like to periodically checkpoint a long-running
>
> who
>
>> +batch job.
>> +
> ...
>
>> +
>> +Checkpoint image format
>> +=======================
>> +
> ...
>
>> +
>> +The container configuration section containers information that is
>
> contains
>
>> +global to the container. Security (LSM) configuration is one example.
>> +Network configuration and container-wide mounts may also go here, so
>> +that the userspace restart coordinator can re-create a suitable
>> +environment.
>> +
> ...
>
>> +
>> +Then the state of all tasks is saved, in the order that they appear in
>> +the tasks array above. For each state, we save data like task_struct,
>> +namespaces, open files, memory layout, memory contents, cpu state,
>
> CPU (throughout, please)
>
>> +signals and signal handlers, etc. For resources that are shared among
>> +multiple processes, we first checkpoint said resource (and only once),
>> +and in the task data we give a reference to it. More about shared
>> +resources below.
>> +
> ...
>
>> +
>> +Shared objects
>> +==============
>> +
>> +Many resources may be shared by multiple tasks (e.g. file descriptors,
>> +memory address space, etc), or even have multiple references from
>
> etc.),
>
>> +other resources (e.g. a single inode that represents two ends of a
>> +pipe).
>> +
> ...
>
>> +Memory contents format
>> +======================
>> +
>> +The memory contents of a given memory address space (->mm) is dumped
>
> are (I think)
>
>> +as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
>> +This header details the vma properties, and a reference to a file
>> +(if file backed) or an inode (or shared memory) object.
>> +
>> +The vma header is followed by the actual contents - but only those
>> +pages that need to be saved, i.e. dirty pages. They are written in
>> +chunks of data, where each chunks contains a header that indicates
>
> chunk
>
>> +that number of pages in the chunk, followed by an array of virtual
>
> the
>
>> +addresses and then an array of actual page contents. The last chunk
>> +holds zero pages.
>> +
> ...
>
>> +Kernel interfaces
>> +=================
>> +
>> +* To checkpoint a vma, the 'struct vm_operations_struct' needs to
>> + provide a method ->checkpoint:
>> + int checkpoint(struct ckpt_ctx *, struct vma_struct *)
>> + Restart requires a matching (exported) restore:
>> + int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
>> +
>> +* To checkpoint a file, the 'struct file_operations' needs to provide
>> + the methods ->checkpoint and ->collect:
>> + int checkpoint(struct ckpt_ctx *, struct file *)
>> + int collect(struct ckpt_ctx *, struct file *)
>> + Restart requires a matching (exported) restore:
>> + int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
>> + For most file systems, generic_file_{checkpoint,restore}() can be
>> + used.
>> +
>> +* To checkpoint a socket, the 'struct proto_ops' needs to provide
>
> To checkpoint/restart a socket,
>
>> + the methods ->checkpoint, ->collect and ->restore:
>> + int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
>> + int collect(struct ckpt_ctx *ctx, struct socket *sock);
>> + int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
>
>
>> diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
>> new file mode 100644
>> index 0000000..c6fc045
>> --- /dev/null
>> +++ b/Documentation/checkpoint/usage.txt
>> @@ -0,0 +1,247 @@
>> +
>> + How to use Checkpoint-Restart
>> + =========================================
>> +
>> +
>> +API
>> +===
>> +
>> +The API consists of three new system calls:
>> +
>> +* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
>
> flags,
>
>> +
>> + Checkpoint a (sub-)container whose root task is identified by @pid,
>> + to the open file indicated by @fd. If @logfd isn't -1, it indicates
>> + an open file to which error and debug messages are written. @flags
>> + may be one or more of:
>> + - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
>> + (other value are not allowed).
>> +
>> + Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
>> + it returns from a restart, and -1 if an error occurs. The ckptid will
>> + uniquely identify a checkpoint image, for as long as the checkpoint
>> + is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
>> + partial checkpoint, residing in kernel memory).
>> +
>> +* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
>> +
>> + Restart a process hierarchy from a checkpoint image that is read from
>> + the blob stored in the file indicated by @fd. If @logfd isn't -1, it
>> + indicates an open file to which error and debug messages are written.
>> + @flags will have future meaning (must be 0 for now). @pid indicates
>> + the root of the hierarchy as seen in the coordinator's pid-namespace,
>> + and is expected to be a child of the coordinator. @flags may be one
>> + or more of:
>> + - RESTART_TASKSELF : (self) restart of a single process
>> + - RESTART_FROEZN : processes remain frozen once restart completes
>
> FROZEN ?
>
>> + - RESTART_GHOST : process is a ghost (placeholder for a pid)
>
> about @flags: Above says both of these:
> a) @flags will have future meaning (must be 0 for now)
> b) @flags may be one or more of:
>
> so please decide which one it is ;)
>
>> + (Note that this argument may mean 'ckptid' to identify an in-kernel
>> + checkpoint image, with some @flags in the future).
>> +
>> + Returns: -1 if an error occurs, 0 on success when restarting from a
>> + "self" checkpoint, and return value of system call at the time of the
>> + checkpoint when restarting from an "external" checkpoint.
>> +
> ...
>> +
>> +Sysctl/proc
>> +===========
>> +
>> +/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
>> + controls whether c/r operation is allowed for unprivileged users
>
> C/R
>
>> +
>> +
>> +Operation
>> +=========
>> +
>> +The granularity of a checkpoint usually is a process hierarchy. The
>> +'pid' argument is interpreted in the caller's pid namespace. So to
>> +checkpoint a container whose init task (pid 1 in that pidns) appears
>> +as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
>> +pid 1 will attempt to checkpoint the caller's container, and if the
>> +caller isn't privileged and init is owned by root, it will fail.
>> +
>> +Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
>> +which does not refer to a container's init task, then sys_checkpoint()
>> +would return -EINVAL.
>
> returns -EINVAL.
>
> ...
>
>> +
>> +
>> +User tools
>> +==========
>> +
>> +* checkpoint(1): a tool to perform a checkpoint of a container/subtree
>> +* restart(1): a tool to restart a container/subtree
>> +* ckptinfo: a tool to examine a checkpoint image
>> +
>> +It is best to use the dedicated user tools for checkpoint and restart.
>> +
>> +If you insist, then here is a code snippet that illustrates how a
>> +checkpoint is initiated by a process inside a container - the logic is
>> +similar to fork():
>> + ...
>> + ckptid = checkpoint(0, ...);
>> + switch (crid) {
>
> (ckptid) ?
>
>> + case -1:
>> + perror("checkpoint failed");
>> + break;
>> + default:
>> + fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>
> s/ret/ckptid/ ?
>
>> + /* proceed with execution after checkpoint */
>> + ...
>> + break;
>> + case 0:
>> + fprintf(stderr, "returned after restart\n");
>> + /* proceed with action required following a restart */
>> + ...
>> + break;
>> + }
>> + ...
>> +
>> +And to initiate a restart, the process in an empty container can use
>> +logic similar to execve():
>> + ...
>> + if (restart(pid, ...) < 0)
>> + perror("restart failed");
>> + /* only get here if restart failed */
>> + ...
>> +
>> +Note, that the code also supports "self" checkpoint, where a process
>
> Note that
>
>> +can checkpoint itself. This mode does not capture the relationships of
>> +the task with other tasks, or any shared resources. It is useful for
>> +application that wish to be able to save and restore their state.
>
> applications
>
>> +They will either not use (or care about) shared resources, or they
>> +will be aware of the operations and adapt suitably after a restart.
>> +The code above can also be used for "self" checkpoint.
>> +
>> +
>> +You may find the following sample programs useful:
>> +
>> +* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
>
> checkpoints
>
>> +* self_checkpoint.c: a simple test program doing self-checkpoint
>> +* self_restart.c: restarts a (self-) checkpoint image from stdin
>> +
>> +See also the utilities 'checkpoint' and 'restart' (from user-cr).
>> +
>> +
>> +"External" checkpoint
>> +=====================
>> +
>> +To do "external" checkpoint, you need to first freeze that other task
>> +either using the freezer cgroup.
>
> eh? cannot parse that.
>
>> +
>> +Restart does not preserve the original PID yet, (because we haven't
>> +solved yet the fork-with-specific-pid issue). In a real scenario, you
>> +probably want to first create a new names space, and have the init
>
> namespace,
>
>> +task there call 'sys_restart()'.
>> +
>> +I tested it this way:
>
> ...
>
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH v21 021/100] c/r: create syscalls: sys_checkpoint, sys_restart
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (7 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
8 siblings, 0 replies; 25+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-api, x86, linux-s390,
linuxppc-dev, Dave Hansen
Create trivial sys_checkpoint and sys_restart system calls. They will
make it possible to checkpoint and restart an entire container, to and
from a checkpoint image file descriptor.
The syscalls take a pid, a file descriptor (for the image file) and
flags as arguments. The pid identifies the top-most (root) task in the
process tree, e.g. the container init: for sys_checkpoint the first
argument identifies the pid of the target container/subtree; for
sys_restart it will identify the pid of the restarting root task.
A checkpoint, much like a process coredump, dumps the state of multiple
processes at once, including the state of the container. The checkpoint
image is written to (and read from) the file descriptor directly from
the kernel. This way the data is generated and then pushed out naturally
as resources and tasks are scanned to save their state. This is the
approach taken by, e.g., Zap and OpenVZ.
By using a return value and not a file descriptor, we can distinguish
between a return from checkpoint, a return from restart (in case of a
checkpoint that includes self, i.e. a task checkpointing its own
container, or itself), and an error condition, in a manner analogous
to a fork() call.
We don't use copy_from_user()/copy_to_user() because it requires
holding the entire image in user space, and does not make sense for
restart. Also, we don't use a pipe, pseudo-fs file and the like,
because they work by generating data on demand as the user pulls it
(unless the entire image is buffered in the kernel) and would require
more complex logic. They also would significantly complicate
checkpoint that includes self.
Changelog[v21-rc3]:
- Reorganize code: move checkpoint/* to kernel/checkpoint/*
Changelog[v19-rc1]:
- Add 'int logfd' to prototype of sys_{checkpoint,restart}
Changelog[v18]:
- [John Dykstra] Fix no-dot-config-targets pattern in linux/Makefile
Changelog[v17]:
- Move checkpoint closer to namespaces (kconfig)
- Kill "Enable" in c/r config option
Changelog[v16]:
- Change sys_restart() first argument to be 'pid_t pid'
Changelog[v14]:
- Change CONFIG_CHECKPOINT_RESTART to CONFIG_CHECKPOINT (Ingo)
- Remove line 'def_bool n' (default is already 'n')
- Add CHECKPOINT_SUPPORT in Kconfig (Nathan Lynch)
Changelog[v5]:
- Config is 'def_bool n' by default
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
Makefile | 2 +-
arch/x86/Kconfig | 4 +++
arch/x86/include/asm/unistd_32.h | 4 ++-
arch/x86/kernel/syscall_table_32.S | 2 +
include/linux/syscalls.h | 4 +++
init/Kconfig | 2 +
kernel/Makefile | 1 +
kernel/checkpoint/Kconfig | 14 +++++++++++
kernel/checkpoint/Makefile | 5 ++++
kernel/checkpoint/sys.c | 45 ++++++++++++++++++++++++++++++++++++
kernel/sys_ni.c | 4 +++
11 files changed, 85 insertions(+), 2 deletions(-)
create mode 100644 kernel/checkpoint/Kconfig
create mode 100644 kernel/checkpoint/Makefile
create mode 100644 kernel/checkpoint/sys.c
diff --git a/Makefile b/Makefile
index fa1db90..93be4e1 100644
--- a/Makefile
+++ b/Makefile
@@ -409,7 +409,7 @@ endif
# of make so .config is not included in this case either (for *config).
no-dot-config-targets := clean mrproper distclean \
- cscope TAGS tags help %docs check% \
+ cscope TAGS tags help %docs checkstack \
include/linux/version.h headers_% \
kernelrelease kernelversion
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685..0874484 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,10 @@ config STACKTRACE_SUPPORT
config HAVE_LATENCYTOP_SUPPORT
def_bool y
+config CHECKPOINT_SUPPORT
+ bool
+ default y if X86_32
+
config MMU
def_bool y
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index e543b0e..007d7cd 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -344,10 +344,12 @@
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
#define __NR_eclone 338
+#define __NR_checkpoint 339
+#define __NR_restart 340
#ifdef __KERNEL__
-#define NR_syscalls 339
+#define NR_syscalls 341
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0c92570..2d5a6b0 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,3 +338,5 @@ ENTRY(sys_call_table)
.long sys_perf_event_open
.long sys_recvmmsg
.long ptregs_eclone
+ .long sys_checkpoint
+ .long sys_restart /* 340 */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 057929b..d1d1703 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -834,6 +834,10 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
struct timespec __user *, const sigset_t __user *,
size_t);
+asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags,
+ int logfd);
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags,
+ int logfd);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
diff --git a/init/Kconfig b/init/Kconfig
index bd8174f..2345902 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,8 @@ config NET_NS
Allow user space to create what appear to be multiple instances
of the network stack.
+source "kernel/checkpoint/Kconfig"
+
config BLK_DEV_INITRD
bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
depends on BROKEN || !FRV
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1..1b78cca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -105,6 +105,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint/
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/checkpoint/Kconfig b/kernel/checkpoint/Kconfig
new file mode 100644
index 0000000..ef7d406
--- /dev/null
+++ b/kernel/checkpoint/Kconfig
@@ -0,0 +1,14 @@
+# Architectures should define CHECKPOINT_SUPPORT when they have
+# implemented the hooks for processor state etc. needed by the
+# core checkpoint/restart code.
+
+config CHECKPOINT
+ bool "Checkpoint/restart (EXPERIMENTAL)"
+ depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
+ help
+ Application checkpoint/restart is the ability to save the
+ state of a running application so that it can later resume
+ its execution from the time at which it was checkpointed.
+
+ Turning this option on will enable checkpoint and restart
+ functionality in the kernel.
diff --git a/kernel/checkpoint/Makefile b/kernel/checkpoint/Makefile
new file mode 100644
index 0000000..8a32c6f
--- /dev/null
+++ b/kernel/checkpoint/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for linux checkpoint/restart.
+#
+
+obj-$(CONFIG_CHECKPOINT) += sys.o
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
new file mode 100644
index 0000000..a81750a
--- /dev/null
+++ b/kernel/checkpoint/sys.c
@@ -0,0 +1,45 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+
+/**
+ * sys_checkpoint - checkpoint a container
+ * @pid: pid of the container init(1) process
+ * @fd: file to which to dump the checkpoint image
+ * @flags: checkpoint operation flags
+ * @logfd: fd to which to dump debug and error messages
+ *
+ * Returns positive identifier on success, 0 when returning from restart
+ * or negative value on error
+ */
+SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
+ unsigned long, flags, int, logfd)
+{
+ return -ENOSYS;
+}
+
+/**
+ * sys_restart - restart a container
+ * @pid: pid of task root (in coordinator's namespace), or 0
+ * @fd: file from which to read the checkpoint image
+ * @flags: restart operation flags
+ * @logfd: fd to which to dump debug and error messages
+ *
+ * Returns negative value on error, or otherwise returns in the realm
+ * of the original checkpoint
+ */
+SYSCALL_DEFINE4(restart, pid_t, pid, int, fd,
+ unsigned long, flags, int, logfd)
+{
+ return -ENOSYS;
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea7..0206aca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
/* performance counters: */
cond_syscall(sys_perf_event_open);
+
+/* checkpoint/restart */
+cond_syscall(sys_checkpoint);
+cond_syscall(sys_restart);
--
1.6.3.3
^ permalink raw reply related [flat|nested] 25+ messages in thread