* [v13][PATCH 01/12] Factor out code to allocate pidmap page
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-25 18:56 ` Sukadev Bhattiprolu
2009-11-25 18:57 ` [v13][PATCH 02/12] Have alloc_pidmap() return actual error code Sukadev Bhattiprolu
` (6 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:56 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 01/12] Factor out code to allocate pidmap page
To simplify alloc_pidmap(), move code to allocate a pid map page to a
separate function.
Changelog[v3]:
- Earlier version of patchset called alloc_pidmap_page() from two
places. But now it's called from only one place. Even so, moving
this code out into a separate function simplifies alloc_pidmap().
Changelog[v2]:
- (Matt Helsley, Dave Hansen) Have alloc_pidmap_page() return
-ENOMEM on error instead of -1.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
kernel/pid.c | 45 ++++++++++++++++++++++++++++++---------------
1 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d..7d4bb6e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,9 +122,35 @@ static void free_pidmap(struct upid *upid)
atomic_inc(&map->nr_free);
}
+static int alloc_pidmap_page(struct pidmap *map)
+{
+ void *page;
+
+ if (likely(map->page))
+ return 0;
+
+ page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ /*
+ * Free the page if someone raced with us installing it:
+ */
+ spin_lock_irq(&pidmap_lock);
+ if (map->page)
+ kfree(page);
+ else
+ map->page = page;
+ spin_unlock_irq(&pidmap_lock);
+
+ if (unlikely(!map->page))
+ return -ENOMEM;
+
+ return 0;
+}
+
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ int rc;
struct pidmap *map;
pid = last + 1;
@@ -134,21 +160,10 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (map->page)
- kfree(page);
- else
- map->page = page;
- spin_unlock_irq(&pidmap_lock);
- if (unlikely(!map->page))
- break;
- }
+ rc = alloc_pidmap_page(map);
+ if (rc)
+ break;
+
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 02/12] Have alloc_pidmap() return actual error code
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-25 18:56 ` [v13][PATCH 01/12] Factor out code to allocate pidmap page Sukadev Bhattiprolu
@ 2009-11-25 18:57 ` Sukadev Bhattiprolu
2009-11-25 18:58 ` [v13][PATCH 03/12] Define set_pidmap() function Sukadev Bhattiprolu
` (5 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:57 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 02/12] Have alloc_pidmap() return actual error code
alloc_pidmap() can fail either because all pid numbers are in use or
because memory allocation failed. With support for setting a specific
pid number, alloc_pidmap() would also fail if either the given pid
number is invalid or in use.
Rather than have callers assume -ENOMEM, have alloc_pidmap() return
the actual error.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
kernel/fork.c | 5 +++--
kernel/pid.c | 14 +++++++++-----
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c4..8053c10 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1156,10 +1156,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns);
- if (!pid)
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
+ }
if (clone_flags & CLONE_NEWPID) {
retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
diff --git a/kernel/pid.c b/kernel/pid.c
index 7d4bb6e..c4d9914 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -150,7 +150,7 @@ static int alloc_pidmap_page(struct pidmap *map)
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
- int rc;
+ int rc = -EAGAIN;
struct pidmap *map;
pid = last + 1;
@@ -189,12 +189,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
} else {
map = &pid_ns->pidmap[0];
offset = RESERVED_PIDS;
- if (unlikely(last == offset))
+ if (unlikely(last == offset)) {
+ rc = -EAGAIN;
break;
+ }
}
pid = mk_pid(pid_ns, map, offset);
}
- return -1;
+ return rc;
}
int next_pidmap(struct pid_namespace *pid_ns, int last)
@@ -263,8 +265,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
- if (!pid)
+ if (!pid) {
+ pid = ERR_PTR(-ENOMEM);
goto out;
+ }
tmp = ns;
for (i = ns->level; i >= 0; i--) {
@@ -299,7 +303,7 @@ out_free:
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
- pid = NULL;
+ pid = ERR_PTR(nr);
goto out;
}
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 03/12] Define set_pidmap() function
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-25 18:56 ` [v13][PATCH 01/12] Factor out code to allocate pidmap page Sukadev Bhattiprolu
2009-11-25 18:57 ` [v13][PATCH 02/12] Have alloc_pidmap() return actual error code Sukadev Bhattiprolu
@ 2009-11-25 18:58 ` Sukadev Bhattiprolu
2009-11-25 18:59 ` [v13][PATCH 06/12] Check invalid clone flags Sukadev Bhattiprolu
` (4 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:58 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 03/12] Define set_pidmap() function
Define a set_pidmap() interface which is like alloc_pidmap(), except that
the caller specifies the pid number to be assigned.
Changelog[v13]:
- Don't let do_alloc_pidmap return 0 if it failed to find a pid.
Changelog[v9]:
- Completely rewrote this patch based on Eric Biederman's code.
Changelog[v7]:
- [Eric Biederman] Generalize alloc_pidmap() to take a range of pids.
Changelog[v6]:
- Separate target_pid > 0 case to minimize the number of checks needed.
Changelog[v3]:
- (Eric Biederman): Avoid set_pidmap() function. Added couple of
checks for target_pid in alloc_pidmap() itself.
Changelog[v2]:
- (Serge Hallyn) Check for 'pid < 0' in set_pidmap().(Code
actually checks for 'pid <= 0' for completeness).
Signed-off-by: Sukadev Bhattiprolu <sukadev-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
kernel/pid.c | 46 +++++++++++++++++++++++++++++++++++-----------
1 files changed, 35 insertions(+), 11 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index c4d9914..c50a711 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -147,28 +147,29 @@ static int alloc_pidmap_page(struct pidmap *map)
return 0;
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int do_alloc_pidmap(struct pid_namespace *pid_ns, int last, int min,
+ int max)
{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ int i, offset, max_scan, pid;
int rc = -EAGAIN;
struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
- pid = RESERVED_PIDS;
+ pid = min;
offset = pid & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+ max_scan = (max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
rc = alloc_pidmap_page(map);
if (rc)
break;
+ rc = -EAGAIN;
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- pid_ns->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
@@ -179,26 +180,49 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
* bitmap block and the final block was the same
* as the starting point, pid is before last_pid.
*/
- } while (offset < BITS_PER_PAGE && pid < pid_max &&
+ } while (offset < BITS_PER_PAGE && pid < max &&
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ if (map < &pid_ns->pidmap[(max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset)) {
- rc = -EAGAIN;
+ offset = min;
+ if (unlikely(last == offset))
break;
- }
}
pid = mk_pid(pid_ns, map, offset);
}
return rc;
}
+static int alloc_pidmap(struct pid_namespace *pid_ns)
+{
+ int nr;
+
+ nr = do_alloc_pidmap(pid_ns, pid_ns->last_pid, RESERVED_PIDS, pid_max);
+ if (nr >= 0)
+ pid_ns->last_pid = nr;
+ return nr;
+}
+
+static int set_pidmap(struct pid_namespace *pid_ns, int target)
+{
+ if (!target)
+ return alloc_pidmap(pid_ns);
+
+ if (target >= pid_max)
+ return -EINVAL;
+
+ if ((target < 0) || (target < RESERVED_PIDS &&
+ pid_ns->last_pid >= RESERVED_PIDS))
+ return -EINVAL;
+
+ return do_alloc_pidmap(pid_ns, target - 1, target, target + 1);
+}
+
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 06/12] Check invalid clone flags
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
` (2 preceding siblings ...)
2009-11-25 18:58 ` [v13][PATCH 03/12] Define set_pidmap() function Sukadev Bhattiprolu
@ 2009-11-25 18:59 ` Sukadev Bhattiprolu
2009-11-25 18:59 ` [v13][PATCH 07/12] Define do_fork_with_pids() Sukadev Bhattiprolu
` (3 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:59 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 06/12] Check invalid clone flags
As pointed out by Oren Laadan, we want to ensure that unused bits in the
clone-flags remain unused and available for future use. To ensure this, define
a mask of clone-flags and check the flags in the clone() system calls.
Changelog[v9]:
- Include the unused clone-flag (CLONE_UNUSED) to VALID_CLONE_FLAGS
to avoid breaking any applications that may have set it. IOW, this
patch/check only applies to clone-flags bits 33 and higher.
Changelog[v8]:
- New patch in set
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
include/linux/sched.h | 12 ++++++++++++
kernel/fork.c | 3 +++
2 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60..a4d2c23 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,18 @@
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+#define CLONE_UNUSED 0x00001000 /* Can be reused ? */
+
+#define VALID_CLONE_FLAGS (CSIGNAL | CLONE_VM | CLONE_FS | CLONE_FILES |\
+ CLONE_SIGHAND | CLONE_UNUSED | CLONE_PTRACE |\
+ CLONE_VFORK | CLONE_PARENT | CLONE_THREAD |\
+ CLONE_NEWNS | CLONE_SYSVSEM | CLONE_SETTLS |\
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |\
+ CLONE_DETACHED | CLONE_UNTRACED |\
+ CLONE_CHILD_SETTID | CLONE_STOPPED |\
+ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER |\
+ CLONE_NEWPID | CLONE_NEWNET | CLONE_IO)
+
/*
* Scheduling policies
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 72c76a1..317adcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -982,6 +982,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
struct task_struct *p;
int cgroup_callbacks_done = 0;
+ if (clone_flags & ~VALID_CLONE_FLAGS)
+ return ERR_PTR(-EINVAL);
+
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 07/12] Define do_fork_with_pids()
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
` (3 preceding siblings ...)
2009-11-25 18:59 ` [v13][PATCH 06/12] Check invalid clone flags Sukadev Bhattiprolu
@ 2009-11-25 18:59 ` Sukadev Bhattiprolu
2009-11-25 18:59 ` [v13][PATCH 09/12] Implement sys_eclone for x86_64 Sukadev Bhattiprolu
` (2 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:59 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 07/12] Define do_fork_with_pids()
do_fork_with_pids() is the same as do_fork(), except that it takes two
additional parameters, 'num_pids' and 'upids'. These parameters, currently
unused, specify the set of target pids of the process in each of its pid
namespaces.
Changelog[v7]:
- Drop 'struct pid_set' object and pass in 'pid_t *target_pids'
instead of 'struct pid_set *'.
Changelog[v6]:
- (Nathan Lynch, Arnd Bergmann, H. Peter Anvin, Linus Torvalds)
Change 'pid_set.pids' to a 'pid_t pids[]' so size of 'struct pid_set'
is constant across architectures.
- (Nathan Lynch) Change 'pid_set.num_pids' to 'unsigned int'.
Changelog[v4]:
- Rename 'struct target_pid_set' to 'struct pid_set' since it may
be useful in other contexts.
Changelog[v3]:
- Fix "long-line" warning from checkpatch.pl
Changelog[v2]:
- To facilitate moving architecture-independent code to kernel/fork.c
pass in 'struct target_pid_set __user *' to do_fork_with_pids()
rather than 'pid_t *' (next patch moves the arch-independent
code to kernel/fork.c)
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
include/linux/sched.h | 3 +++
kernel/fork.c | 17 +++++++++++++++--
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a4d2c23..85e971a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2153,6 +2153,9 @@ extern int disallow_signal(int);
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
+ unsigned long, int __user *, int __user *,
+ unsigned int, pid_t __user *);
struct task_struct *fork_idle(int);
extern void set_task_comm(struct task_struct *tsk, char *from);
diff --git a/kernel/fork.c b/kernel/fork.c
index 317adcf..2610a64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1377,12 +1377,14 @@ struct task_struct * __cpuinit fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long do_fork_with_pids(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ unsigned int num_pids,
+ pid_t __user *upids)
{
struct task_struct *p;
int trace = 0;
@@ -1485,6 +1487,17 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return do_fork_with_pids(clone_flags, stack_start, regs, stack_size,
+ parent_tidptr, child_tidptr, 0, NULL);
+}
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 09/12] Implement sys_eclone for x86_64
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
` (4 preceding siblings ...)
2009-11-25 18:59 ` [v13][PATCH 07/12] Define do_fork_with_pids() Sukadev Bhattiprolu
@ 2009-11-25 18:59 ` Sukadev Bhattiprolu
2009-11-25 19:00 ` [v13][PATCH 11/12] Implement sys_eclone for powerpc Sukadev Bhattiprolu
2009-11-25 19:02 ` [v13][PATCH 12/12] Document sys_eclone Sukadev Bhattiprolu
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 18:59 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 09/12] Implement sys_eclone for x86_64
Implement sys_eclone() system call for x86_64.
This is based on earlier code from Dave Hansen. Modified to share code
between x86 and x86_64 kernels.
Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
arch/x86/include/asm/unistd_64.h | 3 +++
arch/x86/kernel/entry_64.S | 1 +
arch/x86/kernel/process_64.c | 7 +++++++
3 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0a..d2ffc89 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,9 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
#define __NR_perf_event_open 298
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_eclone 299
+__SYSCALL(__NR_eclone, stub_eclone)
+
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f..6d60cd1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -698,6 +698,7 @@ END(\label)
PTREGSCALL stub_vfork, sys_vfork, %rdi
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
+ PTREGSCALL stub_eclone, sys_eclone, %r8
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbc..2c306b9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -540,6 +540,13 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
+asmlinkage long
+sys_eclone(unsigned int flags_low, struct clone_args * __user uca,
+ int args_size, pid_t * __user pids, struct pt_regs *regs)
+{
+ return do_eclone_common(regs, flags_low, uca, args_size, pids);
+}
+
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 11/12] Implement sys_eclone for powerpc
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
` (5 preceding siblings ...)
2009-11-25 18:59 ` [v13][PATCH 09/12] Implement sys_eclone for x86_64 Sukadev Bhattiprolu
@ 2009-11-25 19:00 ` Sukadev Bhattiprolu
2009-11-25 19:02 ` [v13][PATCH 12/12] Document sys_eclone Sukadev Bhattiprolu
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 19:00 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Subject: [v13][PATCH 11/12] Implement sys_eclone for powerpc
Wired up for both ppc32 and ppc64, but tested only with the latter.
Changelog:
Nov 17: (serge) remove redundant flags_high check, and
don't fold it into flags.
Signed-off-by: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
arch/powerpc/include/asm/syscalls.h | 6 ++++
arch/powerpc/include/asm/systbl.h | 1 +
arch/powerpc/include/asm/unistd.h | 3 +-
arch/powerpc/kernel/entry_32.S | 8 +++++
arch/powerpc/kernel/entry_64.S | 5 +++
arch/powerpc/kernel/process.c | 52 +++++++++++++++++++++++++++++++++++
6 files changed, 74 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index eb8eb40..1674544 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -24,6 +24,12 @@ asmlinkage int sys_execve(unsigned long a0, unsigned long a1,
asmlinkage int sys_clone(unsigned long clone_flags, unsigned long usp,
int __user *parent_tidp, void __user *child_threadptr,
int __user *child_tidp, int p6, struct pt_regs *regs);
+asmlinkage int sys_eclone(unsigned long flags_low,
+ struct clone_args __user *args,
+ size_t args_size,
+ pid_t __user *pids,
+ unsigned long p5, unsigned long p6,
+ struct pt_regs *regs);
asmlinkage int sys_fork(unsigned long p1, unsigned long p2,
unsigned long p3, unsigned long p4, unsigned long p5,
unsigned long p6, struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index c7d671a..a7f67ee 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -326,3 +326,4 @@ SYSCALL_SPU(perf_event_open)
COMPAT_SYS_SPU(preadv)
COMPAT_SYS_SPU(pwritev)
COMPAT_SYS(rt_tgsigqueueinfo)
+PPC_SYS(eclone)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f6ca761..37357a2 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -345,10 +345,11 @@
#define __NR_preadv 320
#define __NR_pwritev 321
#define __NR_rt_tgsigqueueinfo 322
+#define __NR_eclone 323
#ifdef __KERNEL__
-#define __NR_syscalls 323
+#define __NR_syscalls 324
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 1175a85..579f1da 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -586,6 +586,14 @@ ppc_clone:
stw r0,_TRAP(r1) /* register set saved */
b sys_clone
+ .globl ppc_eclone
+ppc_eclone:
+ SAVE_NVGPRS(r1)
+ lwz r0,_TRAP(r1)
+ rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */
+ stw r0,_TRAP(r1) /* register set saved */
+ b sys_eclone
+
.globl ppc_swapcontext
ppc_swapcontext:
SAVE_NVGPRS(r1)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 9763267..febdca1 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -344,6 +344,11 @@ _GLOBAL(ppc_clone)
bl .sys_clone
b syscall_exit
+_GLOBAL(ppc_eclone)
+ bl .save_nvgprs
+ bl .sys_eclone
+ b syscall_exit
+
_GLOBAL(ppc32_swapcontext)
bl .save_nvgprs
bl .compat_sys_swapcontext
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index c930ac3..3874477 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -900,6 +900,58 @@ int sys_clone(unsigned long clone_flags, unsigned long usp,
return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
}
+int sys_eclone(unsigned long clone_flags_low,
+ struct clone_args __user *uclone_args,
+ size_t size,
+ pid_t __user *upids,
+ unsigned long p5, unsigned long p6,
+ struct pt_regs *regs)
+{
+ struct clone_args kclone_args;
+ unsigned long stack_base;
+ int __user *parent_tidp;
+ int __user *child_tidp;
+ unsigned long stack_sz;
+ unsigned int nr_pids;
+ unsigned long flags;
+ unsigned long usp;
+ int rc;
+
+ CHECK_FULL_REGS(regs);
+
+ rc = fetch_clone_args_from_user(uclone_args, size, &kclone_args);
+ if (rc)
+ return rc;
+
+ stack_sz = kclone_args.child_stack_size;
+ stack_base = kclone_args.child_stack;
+
+ /* powerpc doesn't do anything useful with the stack size */
+ if (stack_sz)
+ return -EINVAL;
+
+ /* Use stack_base as the child sp if set, else keep the parent's sp. */
+ usp = regs->gpr[1];
+ if (stack_base)
+ usp = stack_base;
+
+ flags = clone_flags_low;
+
+ nr_pids = kclone_args.nr_pids;
+
+ parent_tidp = (int __user *)kclone_args.parent_tid_ptr;
+ child_tidp = (int __user *)kclone_args.child_tid_ptr;
+
+#ifdef CONFIG_PPC64
+ if (test_thread_flag(TIF_32BIT)) {
+ parent_tidp = TRUNC_PTR(parent_tidp);
+ child_tidp = TRUNC_PTR(child_tidp);
+ }
+#endif
+ return do_fork_with_pids(flags, usp, regs, stack_sz,
+ parent_tidp, child_tidp, nr_pids, upids);
+}
+
int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
unsigned long p4, unsigned long p5, unsigned long p6,
struct pt_regs *regs)
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [v13][PATCH 12/12] Document sys_eclone
[not found] ` <20091125185543.GA30858-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
` (6 preceding siblings ...)
2009-11-25 19:00 ` [v13][PATCH 11/12] Implement sys_eclone for powerpc Sukadev Bhattiprolu
@ 2009-11-25 19:02 ` Sukadev Bhattiprolu
7 siblings, 0 replies; 14+ messages in thread
From: Sukadev Bhattiprolu @ 2009-11-25 19:02 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oren Laadan,
serue-r/Jw6+rmf7HQT0dZR+AlfA, Eric W. Biederman, Alexey Dobriyan,
Pavel Emelyanov, hpa-YMNOUZJC4hwAvxtiuMwx3w, Nathan Lynch,
haveblue-r/Jw6+rmf7HQT0dZR+AlfA, Matt Helsley, arnd-r2nGTMty4D4,
roland-H+wXaHxf7aLQT0dZR+AlfA,
mtk.manpages-gM/Ye1E23mwN+BqQ9rBEUg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Containers
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [v13][PATCH 12/12] Document sys_eclone
This gives a brief overview of the eclone() system call. We should
eventually describe more details in the existing clone(2) man page or in
a new man page.
Changelog[v13]:
- [Nathan Lynch, Serge Hallyn] Rename ->child_stack_base to
->child_stack and ensure ->child_stack_size is 0 on architectures
that don't need it.
- [Arnd Bergmann] Remove ->reserved1 field
- [Louis Rilling, Dave Hansen] Combine the two asm statements in the
example into one and use a memory constraint to avoid unnecessary copies.
Changelog[v12]:
- [Serge Hallyn] Fix/simplify stack-setup in the example code
- [Serge Hallyn, Oren Laadan] Rename syscall to eclone()
Changelog[v11]:
- [Dave Hansen] Move clone_args validation checks to arch-independent
code.
- [Oren Laadan] Make args_size a parameter to system call and remove
it from 'struct clone_args'
- [Oren Laadan] Fix some typos and clarify the order of pids in the
@pids parameter.
Changelog[v10]:
- Rename clone3() to clone_with_pids() and fix some typos.
- Modify example to show usage with the ptregs implementation.
Changelog[v9]:
- [Pavel Machek]: Fix an inconsistency and rename new file to
Documentation/clone3.
- [Roland McGrath, H. Peter Anvin] Updates to description and
example to reflect new prototype of clone3() and the updated/
renamed 'struct clone_args'.
Changelog[v8]:
- clone2() is already in use in IA64. Rename syscall to clone3()
- Add notes to say that we return -EINVAL if invalid clone flags
are specified or if the reserved fields are not 0.
Changelog[v7]:
- Rename clone_with_pids() to clone2()
- Changes to reflect new prototype of clone2() (using clone_struct).
Signed-off-by: Sukadev Bhattiprolu <sukadev-8jLBTbqmX/OZamtmwQBW5tBPR1lH4CV8@public.gmane.org>
Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
Documentation/eclone | 348 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 348 insertions(+), 0 deletions(-)
create mode 100644 Documentation/eclone
diff --git a/Documentation/eclone b/Documentation/eclone
new file mode 100644
index 0000000..c2f1b4b
--- /dev/null
+++ b/Documentation/eclone
@@ -0,0 +1,348 @@
+
+struct clone_args {
+ u64 clone_flags_high;
+ u64 child_stack;
+ u64 child_stack_size;
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+ u32 nr_pids;
+ u32 reserved0;
+};
+
+
+sys_eclone(u32 flags_low, struct clone_args * __user cargs, int cargs_size,
+ pid_t * __user pids)
+
+ In addition to doing everything that clone() system call does, the
+ eclone() system call:
+
+ - allows additional clone flags (31 of 32 bits in the flags
+ parameter to clone() are in use)
+
+ - allows user to specify a pid for the child process in its
+ active and ancestor pid namespaces.
+
+ This system call is meant to be used when restarting an application
+ from a checkpoint. Such restart requires that the processes in the
+ application have the same pids they had when the application was
+ checkpointed. When containers are nested, the processes within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ The @flags_low parameter is identical to the 'clone_flags' parameter
+ in existing clone() system call.
+
+ The fields in 'struct clone_args' are meant to be used as follows:
+
+ u64 clone_flags_high:
+
+ When eclone() supports more than 32 flags, the additional bits
+ in the clone_flags should be specified in this field. This
+ field is currently unused and must be set to 0.
+
+ u64 child_stack;
+ u64 child_stack_size;
+
+ These two fields correspond to the 'child_stack' fields in
+ clone() and clone2() (on IA64) system calls. The usage of
+ these two fields depends on the processor architecture.
+
+ Most architectures use ->child_stack to pass-in a stack-pointer
+ itself and don't need the ->child_stack_size field. On these
+ architectures the ->child_stack_size field must be 0.
+
+ Some architectures, eg IA64, use ->child_stack to pass-in the
+ base of the region allocated for stack. These architectures
+ must pass in the size of the stack-region in ->child_stack_size.
+
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+
+ These two fields correspond to the 'parent_tid_ptr' and
+ 'child_tid_ptr' fields in the clone() system call.
+
+ u32 nr_pids;
+
+ nr_pids specifies the number of pids in the @pids array
+ parameter to eclone() (see below). nr_pids should not exceed
+ the current nesting level of the calling process (i.e. if the
+ process is in init_pid_ns, nr_pids cannot exceed 1, if process is
+ in a pid namespace that is a child of init-pid-ns, nr_pids
+ cannot exceed 2, and so on).
+
+ u32 reserved0;
+
+ This field is intended to extend the functionality of
+ eclone() in the future, while preserving backward
+ compatibility.
+ It must be set to 0 for now.
+
+ The @cargs_size parameter specifies the sizeof(struct clone_args) and
+ is intended to enable extending this structure in the future, while
+ preserving backward compatibility. For now, this field must be set
+ to the sizeof(struct clone_args) and this size must match the kernel's
+ view of the structure.
+
+ The @pids parameter defines the set of pids that should be assigned to
+ the child process in its active and ancestor pid namespaces. The
+ descendant pid namespaces do not matter since a process does not have a
+ pid in descendant namespaces, unless the process is in a new pid
+ namespace in which case the process is a container-init (and must have
+ the pid 1 in that namespace).
+
+ See CLONE_NEWPID section of clone(2) man page for details about pid
+ namespaces.
+
+ If a pid in the @pids list is 0, the kernel will assign the next
+ available pid in the pid namespace.
+
+ If a pid in the @pids list is non-zero, the kernel tries to assign
+ the specified pid in that namespace. If that pid is already in use
+ by another process, the system call fails (see EBUSY below).
+
+ The order of pids in @pids is oldest in pids[0] to youngest pid
+ namespace in pids[nr_pids-1]. If the number of pids specified in the
+ @pids list is fewer than the nesting level of the process, the pids
+ are applied from the youngest namespace, i.e. if the process is nested in
+ a level-6 pid namespace and @pids only specifies 3 pids, the 3 pids
+ are applied to levels 6, 5 and 4. Levels 0 through 3 are assumed to
+ have a pid of '0' (the kernel will assign a pid in those namespaces).
+
+ On success, the system call returns the pid of the child process in
+ the parent's active pid namespace.
+
+ On failure, eclone() returns -1 and sets 'errno' to one of the following
+ values (the child process is not created).
+
+ EPERM Caller does not have the CAP_SYS_ADMIN privilege needed to
+ specify the pids in this call (if pids are not specified,
+ CAP_SYS_ADMIN is not required).
+
+ EINVAL The number of pids specified in 'clone_args.nr_pids' exceeds
+ the current nesting level of parent process
+
+ EINVAL Not all specified clone-flags are valid.
+
+ EINVAL The reserved fields in the clone_args argument are not 0.
+
+ EINVAL The child_stack_size field is not 0 (on architectures that
+ pass in a stack pointer in ->child_stack field)
+
+ EBUSY A requested pid is in use by another process in that namespace.
+
+---
+/*
+ * Example eclone() usage - Create a child process with pid CHILD_TID1 in
+ * the current pid namespace. The child gets the usual "random" pid in any
+ * ancestor pid namespaces.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include <unistd.h>
+#include <wait.h>
+#include <sys/syscall.h>
+
+#define __NR_eclone 337
+#define CLONE_NEWPID 0x20000000
+#define CLONE_CHILD_SETTID 0x01000000
+#define CLONE_PARENT_SETTID 0x00100000
+#define CLONE_UNUSED 0x00001000
+
+#define STACKSIZE 8192
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+struct clone_args {
+ u64 clone_flags_high;
+ u64 child_stack;
+ u64 child_stack_size;
+
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+
+ u32 nr_pids;
+
+ u32 reserved0;
+};
+
+#define exit _exit
+
+/*
+ * Following eclone() is based on code posted by Oren Laadan at:
+ * https://lists.linux-foundation.org/pipermail/containers/2009-June/018463.html
+ */
+#if defined(__i386__) && defined(__NR_eclone)
+
+int eclone(u32 flags_low, struct clone_args *clone_args, int args_size,
+ int *pids)
+{
+ long retval;
+
+ __asm__ __volatile__(
+ "movl %3, %%ebx\n\t" /* flags_low -> 1st (ebx) */
+ "movl %4, %%ecx\n\t" /* clone_args -> 2nd (ecx)*/
+ "movl %5, %%edx\n\t" /* args_size -> 3rd (edx) */
+ "movl %6, %%edi\n\t" /* pids -> 4th (edi)*/
+
+ "pushl %%ebp\n\t" /* save value of ebp */
+ "int $0x80\n\t" /* Linux/i386 system call */
+ "testl %0,%0\n\t" /* check return value */
+ "jne 1f\n\t" /* jump if parent */
+
+ "popl %%esi\n\t" /* get subthread function */
+ "call *%%esi\n\t" /* start subthread function */
+ "movl %2,%0\n\t"
+ "int $0x80\n" /* exit system call: exit subthread */
+ "1:\n\t"
+ "popl %%ebp\t" /* restore parent's ebp */
+
+ :"=a" (retval)
+
+ :"0" (__NR_eclone),
+ "i" (__NR_exit),
+ "m" (flags_low),
+ "m" (clone_args),
+ "m" (args_size),
+ "m" (pids)
+ );
+
+ if (retval < 0) {
+ errno = -retval;
+ retval = -1;
+ }
+ return retval;
+}
+
+/*
+ * Allocate a stack for the clone-child and arrange to have the child
+ * execute @child_fn with @child_arg as the argument.
+ */
+void *setup_stack(int (*child_fn)(void *), void *child_arg, int size)
+{
+ void *stack_base;
+ void **stack_top;
+
+ stack_base = malloc(size + size);
+ if (!stack_base) {
+ perror("malloc()");
+ exit(1);
+ }
+
+ stack_top = (void **)((char *)stack_base + (size - 4));
+ *--stack_top = child_arg;
+ *--stack_top = child_fn;
+
+ return stack_top;
+}
+#endif
+
+/* gettid() is a bit more useful than getpid() when messing with clone() */
+int gettid()
+{
+ int rc;
+
+ rc = syscall(__NR_gettid, 0, 0, 0);
+ if (rc < 0) {
+ printf("rc %d, errno %d\n", rc, errno);
+ exit(1);
+ }
+ return rc;
+}
+
+#define CHILD_TID1 377
+#define CHILD_TID2 1177
+#define CHILD_TID3 2799
+
+struct clone_args clone_args;
+void *child_arg = &clone_args;
+int child_tid;
+
+int do_child(void *arg)
+{
+ struct clone_args *cs = (struct clone_args *)arg;
+ int ctid;
+
+ /* Verify we pushed the arguments correctly on the stack... */
+ if (arg != child_arg) {
+ printf("Child: Incorrect child arg pointer, expected %p,"
+ "actual %p\n", child_arg, arg);
+ exit(1);
+ }
+
+ /* ... and that we got the thread-id we expected */
+ ctid = *((int *)(unsigned long)cs->child_tid_ptr);
+ if (ctid != CHILD_TID1) {
+ printf("Child: Incorrect child tid, expected %d, actual %d\n",
+ CHILD_TID1, ctid);
+ exit(1);
+ } else {
+ printf("Child got the expected tid, %d\n", gettid());
+ }
+ sleep(2);
+
+ printf("[%d, %d]: Child exiting\n", getpid(), ctid);
+ exit(0);
+}
+
+static int do_clone(int (*child_fn)(void *), void *child_arg,
+ unsigned int flags_low, int nr_pids, pid_t *pids_list)
+{
+ int rc;
+ void *stack;
+ struct clone_args *ca = &clone_args;
+ int args_size;
+
+ stack = setup_stack(child_fn, child_arg, STACKSIZE);
+
+ memset(ca, 0, sizeof(*ca));
+
+ ca->child_stack = (u64)(unsigned long)stack;
+ ca->child_stack_size = (u64)0;
+ ca->child_tid_ptr = (u64)(unsigned long)&child_tid;
+ ca->nr_pids = nr_pids;
+
+ args_size = sizeof(struct clone_args);
+ rc = eclone(flags_low, ca, args_size, pids_list);
+
+ printf("[%d, %d]: eclone() returned %d, error %d\n", getpid(), gettid(),
+ rc, errno);
+ return rc;
+}
+
+/*
+ * Multiple pid_t values in pids_list[] here are just for illustration.
+ * The test case creates a child in the current pid namespace and uses only
+ * the first value, CHILD_TID1.
+ */
+pid_t pids_list[] = { CHILD_TID1, CHILD_TID2, CHILD_TID3 };
+int main()
+{
+ int rc, pid, status;
+ unsigned long flags;
+ int nr_pids = 1;
+
+ flags = SIGCHLD|CLONE_CHILD_SETTID;
+
+ pid = do_clone(do_child, &clone_args, flags, nr_pids, pids_list);
+
+ printf("[%d, %d]: Parent waiting for %d\n", getpid(), gettid(), pid);
+
+ rc = waitpid(pid, &status, __WALL);
+ if (rc < 0) {
+ printf("waitpid(): rc %d, error %d\n", rc, errno);
+ } else {
+ printf("[%d, %d]: child %d:\n\t wait-status 0x%x\n", getpid(),
+ gettid(), rc, status);
+
+ if (WIFEXITED(status)) {
+ printf("\t EXITED, %d\n", WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ printf("\t SIGNALED, %d\n", WTERMSIG(status));
+ }
+ }
+ return 0;
+}
--
1.6.0.4
^ permalink raw reply related [flat|nested] 14+ messages in thread