* [PATCH linux-cr] implement s390 eclone syscall
@ 2009-11-13 5:24 serue-r/Jw6+rmf7HQT0dZR+AlfA
[not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-13 5:24 UTC (permalink / raw)
To: containers-qjLDD68F18O7TbgM5vRIOg
From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
This patch implements the s390 hook for sys_eclone.
The user-space clone-with-pids glue for s390 (clone_s390x.c
from the user-cr package) is now:
/* Pids requested for the new task; handed to clone_with_pids() by its caller. */
struct pid_set {
int num_pids; /* copied into clone_args.nr_pids by clone_with_pids() */
pid_t *pids; /* passed through as the pids argument of the eclone syscall */
};
/*
 * do_eclone - issue the eclone system call and, in the child, run the
 * function that clone_with_pids() stored on the new stack.
 *
 * Register use, as set up below:
 *   r1 = __NR_eclone, r2 = flags (and the syscall result on return),
 *   r3 = args (struct clone_args *), r4 = sz, r5 = pids.
 *
 * A non-zero result in r2 is handed back unchanged as the value of the
 * macro.  When r2 is 0 (the child), 0(r15) holds fn and 8(r15) holds its
 * argument, exactly as clone_with_pids() pushed them; the child calls
 * fn(arg) on a fresh 160-byte frame and then issues svc 1 (exit), so it
 * never falls out of the asm statement.
 *
 * "cc" is listed as clobbered because ltgr sets the condition code.
 */
#define do_eclone(flags, pids, args, sz) \
( { \
	register unsigned long int __r1 asm ("1") = (unsigned long int)(__NR_eclone); \
	register unsigned long int __r2 asm ("2") = (unsigned long int)(flags); \
	register unsigned long int __r3 asm ("3") = (unsigned long int)(args); \
	register unsigned long int __r4 asm ("4") = (unsigned long int)(sz); \
	register unsigned long int __r5 asm ("5") = (unsigned long int)(pids); \
	register long int __result asm ("2"); \
	__asm__ __volatile__( \
		" svc 0\n"		/* do __NR_eclone syscall */ \
		" ltgr %%r2,%%r2\n"	/* returned 0? */ \
		" jnz 1f\n"		/* if not goto label 1 */ \
		" lg %%r3,0(%%r15)\n"	/* get fn off stack into r3 for basr */ \
		" lg %%r2,8(%%r15)\n"	/* get fnarg off stack into arg 1 (r2) */ \
		" lgr %%r1,%%r15\n"	/* tmp store old stack pointer */ \
		" aghi %%r15,-160\n"	/* move the stack */ \
		" stg %%r1,0(%%r15)\n"	/* and save old stack pointer */ \
		" basr %%r14,%%r3\n"	/* call fn(arg) */ \
		" svc 1\n"		/* call exit */ \
		" 1:\n" \
		: "=d" (__result) \
		: "d" (__r1), "0" (__r2), "d" (__r3), "d" (__r4), "d" (__r5) \
		: "cc", "memory"); \
	__result; \
} )
int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
struct pid_set *target_pids, void *arg)
{
struct clone_args clone_args, *ca = &clone_args;
u64 *s;
memset(ca, 0, sizeof(struct clone_args));
ca->nr_pids = target_pids->num_pids;
if (!child_stack) {
/* we could pass in null and then in eclone not
* call exit if child_stack was null, but we'll
* just malloc here */
int sz = 4*getpagesize();
child_stack = malloc(sz);
if (!child_stack)
return -ENOMEM;
child_stack += sz; /* we'll decrement before assigning */
}
ca->child_stack = (u64) child_stack;
s = (u64 *) ca->child_stack;
*--s = (u64) arg;
*--s = (u64) fn;
ca->child_stack -= 16;
return do_eclone(flags, target_pids->pids, ca,
sizeof(struct clone_args));
}
Changelog:
Nov 12: switch to latest (Nov 12) eclone format
Nov 10: use orig_gpr2, not gprs[2] for input arg 1
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
arch/s390/include/asm/unistd.h | 3 +-
arch/s390/kernel/compat_linux.c | 47 ++++++++++++++++++++++++++++++++++++++
arch/s390/kernel/process.c | 48 +++++++++++++++++++++++++++++++++++++++
arch/s390/kernel/syscalls.S | 1 +
4 files changed, 98 insertions(+), 1 deletions(-)
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index cb5232d..cbf6c7c 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -269,7 +269,8 @@
#define __NR_pwritev 329
#define __NR_rt_tgsigqueueinfo 330
#define __NR_perf_event_open 331
-#define NR_syscalls 332
+#define __NR_eclone 332
+#define NR_syscalls 333
/*
* There are some system calls that are not present on 64 bit, some
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 0debcec..c3dc6bd 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -762,6 +762,53 @@ asmlinkage long sys32_write(unsigned int fd, char __user * buf, size_t count)
return sys_write(fd, buf, count);
}
+asmlinkage long sys32_eclone(void) /* eclone entry for compat tasks; arguments are read from the saved registers */
+{
+ int rc;
+ struct pt_regs *regs = task_pt_regs(current);
+ int args_size;
+ struct clone_args kca; /* filled in by fetch_clone_args_from_user() */
+ unsigned long flags;
+ int __user *parent_tid_ptr;
+ int __user *child_tid_ptr;
+ unsigned long __user child_stack; /* NOTE(review): __user on a non-pointer looks unintended - confirm */
+ unsigned long stack_size;
+ unsigned int flags_low;
+ struct clone_args __user *uca;
+ pid_t __user *pids;
+
+ flags_low = regs->orig_gpr2 & 0xffffffffUL; /* arg 1 (flags) is taken from orig_gpr2, not gprs[2] */
+ uca = compat_ptr(regs->gprs[3]); /* arg 2: user clone_args */
+ args_size = regs->gprs[4] & 0xffffffffUL; /* arg 3: size of the user clone_args */
+ pids = compat_ptr(regs->gprs[5]); /* arg 4: user pid array */
+
+ rc = fetch_clone_args_from_user(uca, args_size, &kca);
+ if (rc)
+ return rc;
+
+ if (kca.clone_flags_high) /* any high flag bit is rejected */
+ return -EINVAL;
+ flags = flags_low;
+ parent_tid_ptr = (int __user *) kca.parent_tid_ptr; /* NOTE(review): not passed through compat_ptr() like uca/pids - confirm */
+ child_tid_ptr = (int __user *) kca.child_tid_ptr;
+
+ stack_size = (unsigned long) kca.child_stack_size;
+ if (stack_size) /* a non-zero child_stack_size is rejected */
+ return -EINVAL;
+
+ child_stack = (unsigned long) kca.child_stack;
+ if (!child_stack) /* no stack supplied: fall back to the caller's gprs[15] */
+ child_stack = regs->gprs[15];
+
+ /*
+ * TODO: On 32-bit systems, clone_flags is passed in as 32-bit value
+ * to several functions. Need to convert clone_flags to 64-bit.
+ */
+ return do_fork_with_pids(flags, child_stack, regs, stack_size,
+ parent_tid_ptr, child_tid_ptr, kca.nr_pids,
+ pids);
+}
+
/*
* 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64.
* These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE}
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 5417eb5..51f11a1 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -241,6 +241,54 @@ SYSCALL_DEFINE4(clone, unsigned long, newsp, unsigned long, clone_flags,
parent_tidptr, child_tidptr);
}
+SYSCALL_DEFINE0(eclone) /* takes no C arguments; they are read from the saved registers below */
+{
+ int rc;
+ struct pt_regs *regs = task_pt_regs(current);
+ int args_size;
+ struct clone_args kca; /* filled in by fetch_clone_args_from_user() */
+ unsigned long flags;
+ int __user *parent_tid_ptr;
+ int __user *child_tid_ptr;
+ unsigned long __user child_stack; /* NOTE(review): __user on a non-pointer looks unintended - confirm */
+ unsigned long stack_size;
+ unsigned int flags_low;
+ struct clone_args __user *uca;
+ pid_t __user *pids;
+
+ flags_low = regs->orig_gpr2; /* arg 1 (flags) is taken from orig_gpr2, not gprs[2] */
+ uca = (struct clone_args __user *) regs->gprs[3]; /* arg 2: user clone_args */
+ args_size = regs->gprs[4]; /* arg 3: size of the user clone_args */
+ pids = (pid_t __user *) regs->gprs[5]; /* arg 4: user pid array */
+
+ rc = fetch_clone_args_from_user(uca, args_size, &kca);
+ if (rc)
+ return rc;
+
+ if (kca.clone_flags_high) /* any high flag bit is rejected */
+ return -EINVAL;
+
+ flags = flags_low;
+ parent_tid_ptr = (int __user *) kca.parent_tid_ptr;
+ child_tid_ptr = (int __user *) kca.child_tid_ptr;
+
+ stack_size = (unsigned long) kca.child_stack_size;
+ if (stack_size) /* a non-zero child_stack_size is rejected */
+ return -EINVAL;
+
+ child_stack = (unsigned long) kca.child_stack;
+ if (!child_stack) /* no stack supplied: fall back to the caller's gprs[15] */
+ child_stack = regs->gprs[15];
+
+ /*
+ * TODO: On 32-bit systems, clone_flags is passed in as 32-bit value
+ * to several functions. Need to convert clone_flags to 64-bit.
+ */
+ return do_fork_with_pids(flags, child_stack, regs, stack_size,
+ parent_tid_ptr, child_tid_ptr, kca.nr_pids,
+ pids);
+}
+
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 30eca07..fb8708d 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -340,3 +340,4 @@ SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper)
SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper)
SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */
SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper)
+SYSCALL(sys_eclone,sys_eclone,sys32_eclone) /* compat slot: sys32_eclone reads its arguments from pt_regs itself */
--
1.6.1
^ permalink raw reply related [flat|nested] 17+ messages in thread[parent not found: <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* [PATCH user-cr 1/2] use v13 of eclone in clone_s390x.c [not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-13 5:24 ` serue-r/Jw6+rmf7HQT0dZR+AlfA 2009-11-13 5:24 ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids serue-r/Jw6+rmf7HQT0dZR+AlfA 2009-11-16 23:36 ` [PATCH linux-cr] implement s390 eclone syscall Nathan Lynch 2 siblings, 0 replies; 17+ messages in thread From: serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-13 5:24 UTC (permalink / raw) To: containers-qjLDD68F18O7TbgM5vRIOg From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> --- clone_s390x.c | 100 ++++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 68 insertions(+), 32 deletions(-) diff --git a/clone_s390x.c b/clone_s390x.c index dada822..945a98d 100644 --- a/clone_s390x.c +++ b/clone_s390x.c @@ -13,17 +13,20 @@ #define _GNU_SOURCE #include <unistd.h> +#include <string.h> +#include <malloc.h> #include <errno.h> #include <sys/types.h> #include <sys/syscall.h> #include <asm/unistd.h> +#define __NR_eclone 332 /* - * libc doesn't support clone_with_pid() yet... + * libc doesn't support eclone() yet... * below is arch-dependent code to use the syscall */ #include <linux/checkpoint.h> -#if defined(__NR_clone_with_pids) +#if defined(__NR_eclone) /* this really belongs to some kernel header ! 
*/ struct pid_set { @@ -31,42 +34,75 @@ struct pid_set { pid_t *pids; }; -/* (see: http://lkml.indiana.edu/hypermail/linux/kernel/9604.3/0204.html) */ +typedef unsigned long long u64; +typedef unsigned int u32; +typedef int pid_t; +struct clone_args { + u64 clone_flags_high; + u64 child_stack; + u64 child_stack_size; + u64 parent_tid_ptr; + u64 child_tid_ptr; -#define do_clone_with_pids(stack, flags, ptid, ctid, setp) ({ \ - register unsigned long int __r2 asm ("2") = (unsigned long int)(stack);\ - register unsigned long int __r3 asm ("3") = (unsigned long int)(flags);\ - register unsigned long int __r4 asm ("4") = (unsigned long int)(ptid); \ - register unsigned long int __r5 asm ("5") = (unsigned long int)(ctid); \ - register unsigned long int __r6 asm ("6") = (unsigned long int)(NULL); \ - register unsigned long int __r7 asm ("7") = (unsigned long int)(setp); \ - register unsigned long int __result asm ("2"); \ - __asm__ __volatile__( \ - " lghi %%r1,%7\n" \ - " svc 0\n" \ - : "=d" (__result) \ - : "0" (__r2), "d" (__r3), \ - "d" (__r4), "d" (__r5), "d" (__r6), "d" (__r7), \ - "i" (__NR_clone_with_pids) \ - : "1", "cc", "memory" \ - ); \ - __result; \ - }) + u32 nr_pids; + + u32 reserved0; + u64 reserved1; +}; + +#define do_eclone(flags, pids, args, sz) \ +( { \ + register unsigned long int __r1 asm ("1") = (unsigned long int)(__NR_eclone); \ + register unsigned long int __r2 asm ("2") = (unsigned long int)(flags); \ + register unsigned long int __r3 asm ("3") = (unsigned long int)(args); \ + register unsigned long int __r4 asm ("4") = (unsigned long int)(sz); \ + register unsigned long int __r5 asm ("5") = (unsigned long int)(pids); \ + register long int __result asm ("2"); \ + __asm__ __volatile__( \ + " svc 0\n" /* do __NR_eclone syscall */ \ + " ltgr %%r2,%%r2\n" /* returned 0? 
*/ \ + " jnz 1f\n" /* if not goto label 1 */ \ + " lg %%r3,0(%%r15)\n" /* get fnarg off stack into arg 1 */ \ + " lg %%r2,8(%%r15)\n" /* get fn off stack int r3 basr*/ \ + " lgr %%r1,%%r15\n" /* tmp store old stack pointer */ \ + " aghi %%r15,-160\n" /* move the stack */ \ + " stg %%r1,0(%%r15)\n" /* and save old stack pointer */ \ + " basr %%r14,%%r3\n" /* call fn(arg) */ \ + " svc 1\n" /* call exit */ \ + " 1:\n" \ + : "=d" (__result) \ + : "d" (__r1), "0" (__r2), "d" (__r3), "d" (__r4), "d" (__r5) \ + : "memory"); \ + __result; \ +} ) + int clone_with_pids(int (*fn)(void *), void *child_stack, int flags, struct pid_set *target_pids, void *arg) { - long retval; - retval = do_clone_with_pids(child_stack, flags, NULL, NULL, - target_pids); + struct clone_args clone_args, *ca = &clone_args; + u64 *s; + + memset(ca, 0, sizeof(struct clone_args)); + ca->nr_pids = target_pids->num_pids; + if (!child_stack) { + /* we could pass in null and then in eclone not + * call exit if child_stack was null, but we'll + * just malloc here */ + int sz = 4*getpagesize(); + child_stack = malloc(sz); + if (!child_stack) + return -ENOMEM; + child_stack += sz; /* we'll decrement before assigning */ + } + ca->child_stack = (u64) child_stack; + s = (u64 *) ca->child_stack; + *--s = (u64) arg; + *--s = (u64) fn; + ca->child_stack -= 16; - if (retval < 0) { - errno = -retval; - return -1; - } else if (retval == 0) { - return fn(arg); - } else - return retval; + return do_eclone(flags, target_pids->pids, ca, + sizeof(struct clone_args)); } #endif /* !defined(__NR_clone_with_pids) */ -- 1.6.1.1 ^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-11-13 5:24 ` [PATCH user-cr 1/2] use v13 of eclone in clone_s390x.c serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-13 5:24 ` serue-r/Jw6+rmf7HQT0dZR+AlfA [not found] ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-11-16 23:36 ` [PATCH linux-cr] implement s390 eclone syscall Nathan Lynch 2 siblings, 1 reply; 17+ messages in thread From: serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-13 5:24 UTC (permalink / raw) To: containers-qjLDD68F18O7TbgM5vRIOg From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> One of the concerns with clone-with-pids is whether the stack handling is all correct and robust enough to withstand real usage. Little testcases playing with pid values are also necessary, but can't replace really using clone-with-pids to start a shell from which to keep working. This patch tweaks the old ns_exec.c namespace manipulation program to add a -z option to specify a pid. So you can: nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns mount -t proc proc /proc # mount private /proc echo $$ 1 nsexeccwp -z /bin/bash # start a shell with pid 999 echo $$ 999 Signed-off-by: Serge E. 
Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> --- Makefile | 5 +- clone.h | 54 +++++++++ nsexeccwp.c | 352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 410 insertions(+), 1 deletions(-) create mode 100644 clone.h create mode 100644 nsexeccwp.c diff --git a/Makefile b/Makefile index 181cc1c..32a6893 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG) # install dir INSTALL_DIR = /bin -PROGS = checkpoint restart ckptinfo +PROGS = checkpoint restart ckptinfo nsexeccwp # other cleanup OTHER = ckptinfo_types.c @@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread ifneq ($(SUBARCH),) restart: clone_$(SUBARCH).o restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID +nsexeccwp: clone_$(SUBARCH).o +nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID endif # on powerpc, need also assembly file ifeq ($(SUBARCH),ppc) restart: clone_$(SUBARCH)_.o +nsexeccwp: clone_$(SUBARCH)_.o endif # ckptinfo dependencies diff --git a/clone.h b/clone.h new file mode 100644 index 0000000..3569a45 --- /dev/null +++ b/clone.h @@ -0,0 +1,54 @@ +#ifndef CLONE_H +#define CLONE_H +/* + * Copyright (C) 2007 IBM Corporation + * + * Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ * + */ +#include <sys/syscall.h> + +#ifndef HAVE_UNSHARE + +#if __i386__ +# define __NR_unshare 310 +#elif __x86_64__ +# define __NR_unshare 272 +#elif __ia64__ +# define __NR_unshare 1296 +#elif __s390x__ +# define __NR_unshare 303 +#elif __powerpc__ +# define __NR_unshare 282 +#else +# error "Architecture not supported" +#endif + +#endif /* HAVE_UNSHARE */ + +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 +#endif + +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif + +#ifndef CLONE_NEWPID +#define CLONE_NEWPID 0x20000000 +#endif + +#ifndef CLONE_NEWNET +#define CLONE_NEWNET 0x40000000 +#endif + +#endif /* CLONE_H */ diff --git a/nsexeccwp.c b/nsexeccwp.c new file mode 100644 index 0000000..453fb8c --- /dev/null +++ b/nsexeccwp.c @@ -0,0 +1,352 @@ +/* + * Copyright 2008,2009 IBM Corp. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sched.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> +#include <errno.h> +#include <libgen.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "clone.h" + +struct pid_set { + int num_pids; + pid_t *pids; +}; + +typedef unsigned long long u64; +typedef unsigned int u32; +typedef int pid_t; +struct clone_args { + u64 clone_flags_high; + + u64 child_stack_base; + u64 child_stack_size; + + u64 parent_tid_ptr; + u64 child_tid_ptr; + + u32 nr_pids; + + u32 reserved0; + u64 reserved1; +}; +/* (until it's supported by libc) from clone_ARCH.c */ +extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags, + struct pid_set *target_pids, void *arg); + +extern pid_t getpgid(pid_t pid); +extern pid_t getsid(pid_t pid); + +static const char* procname; + +static void usage(const char *name) +{ + printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]" + "[command [arg ..]]\n", name); + printf("\n"); + printf(" -h this message\n"); + 
printf("\n"); + printf(" -z <pid> use clone_with_pids and specify chosen pid\n"); + printf(" Note that -z and -p are not compatible\n"); + printf(" -c use 'clone' rather than 'unshare' system call\n"); + printf(" -g launch in new cgroup\n"); + printf(" -m mount namespace\n"); + printf(" -n network namespace\n"); + printf(" -u utsname namespace\n"); + printf(" -U userid namespace\n"); + printf(" -i ipc namespace\n"); + printf(" -P <pid-file> File in which to write global pid of cinit\n"); + printf(" -p pid namespace\n"); + printf(" -f <flag> extra clone flags\n"); + printf("\n"); + printf("(C) Copyright IBM Corp. 2006\n"); + printf("\n"); + exit(1); +} + +static int string_to_ul(const char *str, unsigned long int *res) +{ + char *tail; + long long int r; + + if (!*str) + return -1; + + errno = 0; + + r = strtol(str, &tail, 16); + + /* + * according to strtol(3), if errno is set or tail does no point + * to the ending '\0', the conversion failed. + */ + if (errno || *tail) + return -1; + + *res = r; + return 0; +} + +/* + * Copied following opentty() from Fedora's util-linux rpm + * I just changed the "FATAL" message below from syslog() + * to printf + */ +static void +opentty(const char * tty) { + int i, fd, flags; + + fd = open(tty, O_RDWR | O_NONBLOCK); + if (fd == -1) { + printf("FATAL: can't reopen tty: %s", strerror(errno)); + sleep(1); + exit(1); + } + + flags = fcntl(fd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + + for (i = 0; i < fd; i++) + close(i); + for (i = 0; i < 3; i++) + if (fd != i) + dup2(fd, i); + if (fd >= 3) + close(fd); +} +// Code copy end + +int do_newcgrp = 0; + +int load_cgroup_dir(char *dest, int len) +{ + FILE *f = fopen("/proc/mounts", "r"); + char buf[200]; + char *name, *path, *fsname, *options, *p1, *p2, *s; + if (!f) + return 0; + while (fgets(buf, 200, f)) { + name = strtok_r(buf, " ", &p1); + path = strtok_r(NULL, " ", &p1); + fsname = strtok_r(NULL, " ", &p1); + options = strtok_r(NULL, " ", &p1); + if 
(strcmp(fsname, "cgroup") != 0) + continue; + + /* make sure the freezer is composed */ + s = strtok_r(options, ",", &p2); + while (s && strcmp(s, "freezer") != 0) + s = strtok_r(NULL, ",", &p2); + if (!s) + continue; + strncpy(dest, path, len); + fclose(f); + return 1; + } + fclose(f); + printf("Freezer not mounted\n"); + return 0; +} + +int move_to_new_cgroup(int newcgroup) +{ + char cgroupname[150], cgroupbase[100], tasksfname[200]; + FILE *fout; + int ret; + + if (!load_cgroup_dir(cgroupbase, 100)) + return 0; + + snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup); + ret = mkdir(cgroupname, 0755); + if (ret) + return 0; + snprintf(tasksfname, 200, "%s/tasks", cgroupname); + fout = fopen(tasksfname, "w"); + if (!fout) + return 0; + fprintf(fout, "%d\n", getpid()); + fclose(fout); + return 1; +} + +int pipefd[2]; + +/* gah. opentty will close the pipefd */ +int check_newcgrp(void) +{ + int ret, newgroup; + char buf[20]; + + if (!do_newcgrp) + return 0; + + close(pipefd[1]); + ret = read(pipefd[0], buf, 20); + close(pipefd[0]); + if (ret == -1) { + perror("read"); + return 1; + } + newgroup = atoi(buf); + if (!move_to_new_cgroup(newgroup)) + return 1; + do_newcgrp = 0; + return 0; +} + +int do_child(void *vargv) +{ + char **argv = (char **)vargv; + + if (check_newcgrp()) + return 1; + + execve(argv[0], argv, __environ); + perror("execve"); + return 1; +} + +void write_pid(char *pid_file, int pid) +{ + FILE *fp; + + if (!pid_file) + return; + + fp = fopen(pid_file, "w"); + if (!fp) { + perror("fopen, pid_file"); + exit(1); + } + fprintf(fp, "%d", pid); + fflush(fp); + fclose(fp); +} + +int main(int argc, char *argv[]) +{ + int c; + unsigned long flags = 0, eflags = 0; + char ttyname[256]; + int status; + int ret, use_clone = 0; + int pid; + char *pid_file = NULL; + struct pid_set pid_set; + int chosen_pid = 0; + + pid_set.num_pids = 1; + pid_set.pids = &chosen_pid; + + procname = basename(argv[0]); + + memset(ttyname, '\0', sizeof(ttyname)); + 
readlink("/proc/self/fd/0", ttyname, sizeof(ttyname)); + + while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) { + switch (c) { + case 'g': do_newcgrp = getpid(); break; + case 'm': flags |= CLONE_NEWNS; break; + case 'c': use_clone = 1; break; + case 'P': pid_file = optarg; break; + case 'u': flags |= CLONE_NEWUTS; break; + case 'i': flags |= CLONE_NEWIPC; break; + case 'U': flags |= CLONE_NEWUSER; break; + case 'n': flags |= CLONE_NEWNET; break; + case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID; break; + case 'z': chosen_pid = atoi(optarg); break; + case 'f': if (!string_to_ul(optarg, &eflags)) { + flags |= eflags; + break; + } + case 'h': + default: + usage(procname); + } + }; + + if (chosen_pid) { + use_clone = 1; + if (flags & CLONE_NEWPID) { + printf("Error: can't use CLONE_NEWPID and pick a pid\n"); + exit(1); + } + } + argv = &argv[optind]; + argc = argc - optind; + + if (do_newcgrp) { + ret = pipe(pipefd); + if (ret) { + perror("pipe"); + return -1; + } + do_newcgrp = pipefd[0]; + } + + if (use_clone) { + int stacksize = 4*getpagesize(); + void *stack = malloc(stacksize); + + if (!stack) { + perror("malloc"); + return -1; + } + + printf("about to clone with %lx\n", flags); + if (chosen_pid) + printf("Will choose pid %d\n", chosen_pid); + flags |= SIGCHLD; + pid = clone_with_pids(do_child, stack, flags, &pid_set, + (void *)argv); + if (pid == -1) { + perror("clone"); + return -1; + } + } else { + if ((pid = fork()) == 0) { + // Child. 
+ //print_my_info(procname, ttyname); + + if (check_newcgrp()) + return 1; + opentty(ttyname); + + printf("about to unshare with %lx\n", flags); + ret = unshare(flags); + if (ret < 0) { + perror("unshare"); + return 1; + } + + return do_child((void*)argv); + } + + } + if (pid != -1 && do_newcgrp) { + char buf[20]; + snprintf(buf, 20, "%d", pid); + close(pipefd[0]); + write(pipefd[1], buf, strlen(buf)+1); + close(pipefd[1]); + } + + write_pid(pid_file, pid); + + if ((ret = waitpid(pid, &status, __WALL)) < 0) + printf("waitpid() returns %d, errno %d\n", ret, errno); + + exit(0); +} -- 1.6.1.1 ^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-13 21:08 ` Serge E. Hallyn 2009-11-15 22:45 ` Nathan Lynch 2009-11-16 14:45 ` Serge E. Hallyn 2 siblings, 0 replies; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-13 21:08 UTC (permalink / raw) To: containers-qjLDD68F18O7TbgM5vRIOg Quoting serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org): ... > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > + (void *)argv); > + if (pid == -1) { > + perror("clone"); > + return -1; > + } Come on Serge, what crappy code! The clone_with_pids() wrapper used in user-cr doesn't set errno, so this is messed up on failure. Shape up! -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-11-13 21:08 ` Serge E. Hallyn @ 2009-11-15 22:45 ` Nathan Lynch [not found] ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2009-11-16 14:45 ` Serge E. Hallyn 2 siblings, 1 reply; 17+ messages in thread From: Nathan Lynch @ 2009-11-15 22:45 UTC (permalink / raw) To: serue-r/Jw6+rmf7HQT0dZR+AlfA; +Cc: containers-qjLDD68F18O7TbgM5vRIOg On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > + if (use_clone) { > + int stacksize = 4*getpagesize(); > + void *stack = malloc(stacksize); > + > + if (!stack) { > + perror("malloc"); > + return -1; > + } > + > + printf("about to clone with %lx\n", flags); > + if (chosen_pid) > + printf("Will choose pid %d\n", chosen_pid); > + flags |= SIGCHLD; > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > + (void *)argv); The stack argument should be adjusted with the usual stack += stacksize - 1 or similar, right? ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2009-11-16 11:12 ` Serge E. Hallyn [not found] ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-16 11:12 UTC (permalink / raw) To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > > + if (use_clone) { > > + int stacksize = 4*getpagesize(); > > + void *stack = malloc(stacksize); > > + > > + if (!stack) { > > + perror("malloc"); > > + return -1; > > + } > > + > > + printf("about to clone with %lx\n", flags); > > + if (chosen_pid) > > + printf("Will choose pid %d\n", chosen_pid); > > + flags |= SIGCHLD; > > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > > + (void *)argv); > > The stack argument should be adjusted with the usual stack += stacksize > - 1 or similar, right? the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the x86 one by Suka also) does this implicitly, by doing: s = child_stack; *--s = arg; *--s = fn; child_stack -= 16 -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-15 23:49 ` Nathan Lynch [not found] ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Nathan Lynch @ 2009-11-15 23:49 UTC (permalink / raw) To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote: > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > > > + if (use_clone) { > > > + int stacksize = 4*getpagesize(); > > > + void *stack = malloc(stacksize); > > > + > > > + if (!stack) { > > > + perror("malloc"); > > > + return -1; > > > + } > > > + > > > + printf("about to clone with %lx\n", flags); > > > + if (chosen_pid) > > > + printf("Will choose pid %d\n", chosen_pid); > > > + flags |= SIGCHLD; > > > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > > > + (void *)argv); > > > > The stack argument should be adjusted with the usual stack += stacksize > > - 1 or similar, right? > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the > x86 one by Suka also) does this implicitly, by doing: > > s = child_stack; > *--s = arg; > *--s = fn; > child_stack -= 16 That's setting up arguments for the function to run in the child, and afaict that code assumes the value of child_stack is the _end_ of the stack region. The code I quoted above is passing the beginning of the region (the return value from malloc). On powerpc the segfaults went away when I made the following change. 
diff --git a/nsexeccwp.c b/nsexeccwp.c index a71d9a4..92eb092 100644 --- a/nsexeccwp.c +++ b/nsexeccwp.c @@ -309,8 +309,8 @@ int main(int argc, char *argv[]) if (chosen_pid) printf("Will choose pid %d\n", chosen_pid); flags |= SIGCHLD; - pid = clone_with_pids(do_child, stack, flags, &pid_set, - (void *)argv); + pid = clone_with_pids(do_child, stack + stacksize - 1, + flags, &pid_set, (void *)argv); ^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2009-11-16 18:26 ` Serge E. Hallyn [not found] ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-16 18:26 UTC (permalink / raw) To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote: > > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > > > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > > > > + if (use_clone) { > > > > + int stacksize = 4*getpagesize(); > > > > + void *stack = malloc(stacksize); > > > > + > > > > + if (!stack) { > > > > + perror("malloc"); > > > > + return -1; > > > > + } > > > > + > > > > + printf("about to clone with %lx\n", flags); > > > > + if (chosen_pid) > > > > + printf("Will choose pid %d\n", chosen_pid); > > > > + flags |= SIGCHLD; > > > > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > > > > + (void *)argv); > > > > > > The stack argument should be adjusted with the usual stack += stacksize > > > - 1 or similar, right? > > > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the > > x86 one by Suka also) does this implicitly, by doing: > > > > s = child_stack; > > *--s = arg; > > *--s = fn; > > child_stack -= 16 > > That's setting up arguments for the function to run in the child, and > afaict that code assumes the value of child_stack is the _end_ of the > stack region. Yes. > The code I quoted above is passing the beginning of the > region (the return value from malloc). Holy cow, that was a snafu in my switching to sending (stack_base,stack_size) for the previous version, and then back again. It was meant to send stack_base+stack_size now. 
I say 'holy cow' because it doesn't segfault on s390x. And it certainly should! > On powerpc the segfaults went away when I made the following change. > > diff --git a/nsexeccwp.c b/nsexeccwp.c > index a71d9a4..92eb092 100644 > --- a/nsexeccwp.c > +++ b/nsexeccwp.c > @@ -309,8 +309,8 @@ int main(int argc, char *argv[]) > if (chosen_pid) > printf("Will choose pid %d\n", chosen_pid); > flags |= SIGCHLD; > - pid = clone_with_pids(do_child, stack, flags, &pid_set, > - (void *)argv); > + pid = clone_with_pids(do_child, stack + stacksize - 1, > + flags, &pid_set, (void *)argv); Yes I don't think the -1 should be needed, but certainly the +stacksize is. thanks, -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-16 23:18 ` Nathan Lynch [not found] ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Nathan Lynch @ 2009-11-16 23:18 UTC (permalink / raw) To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg On Mon, 2009-11-16 at 12:26 -0600, Serge E. Hallyn wrote: > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > > On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote: > > > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > > > > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > > > > > + if (use_clone) { > > > > > + int stacksize = 4*getpagesize(); > > > > > + void *stack = malloc(stacksize); > > > > > + > > > > > + if (!stack) { > > > > > + perror("malloc"); > > > > > + return -1; > > > > > + } > > > > > + > > > > > + printf("about to clone with %lx\n", flags); > > > > > + if (chosen_pid) > > > > > + printf("Will choose pid %d\n", chosen_pid); > > > > > + flags |= SIGCHLD; > > > > > + pid = clone_with_pids(do_child, stack, flags, &pid_set, > > > > > + (void *)argv); > > > > > > > > The stack argument should be adjusted with the usual stack += stacksize > > > > - 1 or similar, right? > > > > > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the > > > x86 one by Suka also) does this implicitly, by doing: > > > > > > s = child_stack; > > > *--s = arg; > > > *--s = fn; > > > child_stack -= 16 > > > > That's setting up arguments for the function to run in the child, and > > afaict that code assumes the value of child_stack is the _end_ of the > > stack region. > > Yes. > > > The code I quoted above is passing the beginning of the > > region (the return value from malloc). 
> > Holy cow, that was a snafu in my switching to sending (stack_base,stack_size) > for the previous version, and then back again. It was meant to send > stack_base+stack_size now. Okay, here's the violence I've committed against your code to get eclone working on powerpc (tested 32-bit userspace against 64-bit kernel). ./nsexeccwp -z 300 /bin/bash -c 'echo $$' [debugging cruft elided] 300 This is meant not for inclusion but for discussion at this point. I made some changes that will certainly break the builds for other architectures. Note that I have generic code initializing clone_args with the true stack base and size and passing that to the architecture code. The architecture code (e.g. clone_ppc.c) is responsible for calculating the stack pointer to pass to the kernel. The architecture code is also responsible for clearing clone_args.child_stack_size and updating clone_args.child_stack, adjusting for alignment and arguments if appropriate. In this way, we can accommodate ia64 and parisc and keep platform details in platform-specific code. 
clone_ppc.c | 54 +++++++++++++++++++++++++++++++++++ clone_ppc_.S | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- eclone.h | 25 ++++++++++++++++ nsexeccwp.c | 42 ++++++++++++---------------- 4 files changed, 182 insertions(+), 27 deletions(-) diff --git a/clone_ppc.c b/clone_ppc.c index 49797fd..9e19fae 100644 --- a/clone_ppc.c +++ b/clone_ppc.c @@ -10,14 +10,25 @@ #define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> #include <unistd.h> #include <errno.h> #include <sys/types.h> #include <sys/syscall.h> #include <asm/unistd.h> +#include "eclone.h" + struct target_pid_set; +struct pid_set { + size_t nr_pids; + pid_t *pids; +}; + + extern int __clone_with_pids(int (*fn)(void *arg), void *child_stack , int flags, @@ -56,3 +67,46 @@ int clone_with_pids(int (*fn)(void *), void *child_stack, int flags, } #endif + +extern int __eclone(int (*fn)(void *arg), + void *child_sp, + int flags, + void *fn_arg, + struct clone_args *args, + size_t args_size, + pid_t *pids); + +int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low, + struct clone_args *clone_args, pid_t *pids) +{ + struct clone_args my_args; + unsigned long child_sp; + int newpid; + + if (clone_args->child_stack) + child_sp = clone_args->child_stack + + clone_args->child_stack_size - 1; + else + child_sp = 0; + + my_args = *clone_args; + my_args.child_stack = child_sp; + my_args.child_stack_size = 0; + + printf("%s: child_sp = %p\n", __func__, (void *)child_sp); + + newpid = __eclone(fn, + (void *)child_sp, + clone_flags_low, + fn_arg, + &my_args, + sizeof(my_args), + pids); + + if (newpid < 0) { + errno = -newpid; + newpid = -1; + } + + return newpid; +} diff --git a/clone_ppc_.S b/clone_ppc_.S index cb3e053..b777b2d 100644 --- a/clone_ppc_.S +++ b/clone_ppc_.S @@ -11,6 +11,14 @@ #include <asm/unistd.h> #include "powerpc_asm.h" +#ifndef __NR_clone_with_pids +#define __NR_clone_with_pids 325 +#endif + +#ifndef __NR_eclone +#define __NR_eclone 323 +#endif + /* int 
[r3] clone_with_pids(int (*fn)(void *arg) [r3], * void *child_stack [r4], * int flags [r5], @@ -29,10 +37,10 @@ .globl __clone_with_pids __clone_with_pids: -/* No argument validation. */ + /* No argument validation. */ -/* Set up parent's stack frame. */ -stwu r1,-32(r1) + /* Set up parent's stack frame. */ + stwu r1,-32(r1) /* Save non-volatiles (r28-r31) which we plan to use. */ stmw r28,16(r1) @@ -88,3 +96,77 @@ parent: neg r3,r3 blr +/* int [r3] eclone(int (*fn)(void *arg) [r3], + * void *child_sp [r4], + * int flags [r5], + * void *fn_arg [r6], + * struct clone_args *args [r7], + * size_t args_size [r8], + * pid_t *pids [r9]); + * Creates a child task with the pids specified by pids. + * Returns to parent only, child execution and exit is handled here. + * On error, returns negated errno. On success, returns the pid of the child + * created. + */ + +.globl __eclone +__eclone: + + /* No argument validation. */ + + /* Set up parent's stack frame. */ + stwu r1,-32(r1) + + /* Save non-volatiles (r28-r31) which we plan to use. */ + stmw r28,16(r1) + + /* Set up child's stack frame. */ + clrrwi r4,r4,4 + li r0,0 + stw r0,-16(r4) + + /* Save fn, stack pointer, flags, and fn_arg across system call. */ + mr r28,r3 + mr r29,r4 + mr r30,r5 + mr r31,r6 + + /* Set up arguments for system call. */ + mr r3,r5 /* flags */ + mr r4,r7 /* clone_args */ + mr r5,r8 /* clone_args' size */ + mr r6,r9 /* pids */ + + /* Do the system call */ + li r0,__NR_eclone + sc + + /* Parent or child? */ + cmpwi cr1,r3,0 + crandc 4*cr1+eq,4*cr1+eq,4*cr0+so + bne cr1,eclone_parent + + /* Child. Call fn. */ + mtctr r28 + mr r3,r31 + bctrl + + /* Assume result of fn in r3 and exit. */ + li r0,__NR_exit + sc + +eclone_parent: + /* Restore non-volatiles. */ + lmw r28,16(r1) + + addi r1,r1,32 + + /* Return to caller on success. */ + bnslr + + /* Handle error. Negate the return value to signal an error + * to the caller, which must set errno. 
+ */ + neg r3,r3 + blr + diff --git a/eclone.h b/eclone.h new file mode 100644 index 0000000..601a621 --- /dev/null +++ b/eclone.h @@ -0,0 +1,25 @@ +#ifndef _ECLONE_H_ +#define _ECLONE_H_ + +#include <stdint.h> + +struct clone_args { + uint64_t clone_flags_high; + uint64_t child_stack; + uint64_t child_stack_size; + uint64_t parent_tid_ptr; + uint64_t child_tid_ptr; + + uint32_t nr_pids; + + uint32_t reserved0; + uint64_t reserved1; +}; + +/* arch-dependent code implements this interface */ +extern int eclone(int (*fn)(void *), void *fn_arg, + int clone_flags_low, + struct clone_args *clone_args, + pid_t *pids); + +#endif diff --git a/nsexeccwp.c b/nsexeccwp.c index a71d9a4..b80b78e 100644 --- a/nsexeccwp.c +++ b/nsexeccwp.c @@ -17,29 +17,13 @@ #include <sys/wait.h> #include "clone.h" +#include "eclone.h" struct pid_set { int num_pids; pid_t *pids; }; -typedef unsigned long long u64; -typedef unsigned int u32; -typedef int pid_t; -struct clone_args { - u64 clone_flags_high; - - u64 child_stack_base; - u64 child_stack_size; - - u64 parent_tid_ptr; - u64 child_tid_ptr; - - u32 nr_pids; - - u32 reserved0; - u64 reserved1; -}; /* (until it's supported by libc) from clone_ARCH.c */ extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags, struct pid_set *target_pids, void *arg); @@ -210,6 +194,9 @@ int do_child(void *vargv) { char **argv = (char **)vargv; + printf("%s(%p)/%lu\n", __func__, vargv, (unsigned long)getpid()); + fflush(NULL); + if (check_newcgrp()) return 1; @@ -237,6 +224,7 @@ void write_pid(char *pid_file, int pid) int main(int argc, char *argv[]) { + int i; int c; unsigned long flags = 0, eflags = 0; char ttyname[256]; @@ -244,11 +232,8 @@ int main(int argc, char *argv[]) int ret, use_clone = 0; int pid; char *pid_file = NULL; - struct pid_set pid_set; - int chosen_pid = 0; - - pid_set.num_pids = 1; - pid_set.pids = &chosen_pid; + size_t nr_pids = 1; + pid_t chosen_pid = 0; procname = basename(argv[0]); @@ -287,6 +272,9 @@ int main(int 
argc, char *argv[]) argv = &argv[optind]; argc = argc - optind; + for (i = 0; i < argc; i++) + printf("argv[%d] = '%s'\n", i, argv[i]); + if (do_newcgrp) { ret = pipe(pipefd); if (ret) { @@ -297,6 +285,7 @@ int main(int argc, char *argv[]) } if (use_clone) { + struct clone_args clone_args; int stacksize = 4*getpagesize(); void *stack = malloc(stacksize); @@ -305,12 +294,17 @@ int main(int argc, char *argv[]) return -1; } + memset(&clone_args, 0, sizeof(clone_args)); + clone_args.child_stack = (unsigned long)stack; + clone_args.child_stack_size = stacksize; + clone_args.nr_pids = nr_pids; + printf("about to clone with %lx\n", flags); if (chosen_pid) printf("Will choose pid %d\n", chosen_pid); + printf("argv = %p\n", argv); flags |= SIGCHLD; - pid = clone_with_pids(do_child, stack, flags, &pid_set, - (void *)argv); + pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid); if (pid == -1) { perror("clone"); return -1; ^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2009-11-17 4:05 ` Serge E. Hallyn 0 siblings, 0 replies; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-17 4:05 UTC (permalink / raw) To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > Okay, here's the violence I've committed against your code to get eclone > working on powerpc (tested 32-bit userspace against 64-bit kernel). > > ./nsexeccwp -z 300 /bin/bash -c 'echo $$' > [debugging cruft elided] > 300 > > This is meant not for inclusion but for discussion at this point. I > made some changes that will certainly break the builds for other > architectures. > > Note that I have generic code initializing clone_args with the true > stack base and size and passing that to the architecture code. The > architecture code (e.g. clone_ppc.c) is responsible for calculating the > stack pointer to pass to the kernel. The architecture code is also > responsible for clearing clone_args.child_stack_size and updating > clone_args.child_stack, adjusting for alignment and arguments if > appropriate. In this way, we can accommodate ia64 and parisc and keep > platform details in platform-specific code. ... > diff --git a/clone_ppc.c b/clone_ppc.c > index 49797fd..9e19fae 100644 > --- a/clone_ppc.c > +++ b/clone_ppc.c > @@ -10,14 +10,25 @@ > > #define _GNU_SOURCE > > +#include <stdint.h> > +#include <stdio.h> > +#include <string.h> > #include <unistd.h> > #include <errno.h> > #include <sys/types.h> > #include <sys/syscall.h> > #include <asm/unistd.h> > > +#include "eclone.h" > + > struct target_pid_set; > > +struct pid_set { > + size_t nr_pids; > + pid_t *pids; > +}; You shouldn't need the pid_set any more right? ... 
> @@ -305,12 +294,17 @@ int main(int argc, char *argv[]) > return -1; > } > > + memset(&clone_args, 0, sizeof(clone_args)); > + clone_args.child_stack = (unsigned long)stack; > + clone_args.child_stack_size = stacksize; > + clone_args.nr_pids = nr_pids; > + > printf("about to clone with %lx\n", flags); > if (chosen_pid) > printf("Will choose pid %d\n", chosen_pid); > + printf("argv = %p\n", argv); > flags |= SIGCHLD; > - pid = clone_with_pids(do_child, stack, flags, &pid_set, > - (void *)argv); > + pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid); > if (pid == -1) { > perror("clone"); > return -1; Yup, of course I agree with switching to a clean eclone passing the clone_args and no struct pid_set, i was just trying to minimize (to 0 :) the changes required for now in restart.c. If you don't mind sending the patch to update restart.c as well as this (minus some debugging) when you're ready, I'll port clone_s390x.c to your precise api. thanks, -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-11-13 21:08 ` Serge E. Hallyn 2009-11-15 22:45 ` Nathan Lynch @ 2009-11-16 14:45 ` Serge E. Hallyn 2 siblings, 0 replies; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-16 14:45 UTC (permalink / raw) To: Nathan T Lynch, containers-qjLDD68F18O7TbgM5vRIOg Subject: [PATCH 1/1] nsexeccwp bugfixes 1. As Nathan pointed out, I was passing in stack bottom, not stack top. Our clone_with_pids() helper in user-cr/clone_${ARCH}.c just accepts stack top. 2. The clone_with_pids() helper returns -errno on error, it doesn't set errno. Handle that right. Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> --- nsexeccwp.c | 4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/nsexeccwp.c b/nsexeccwp.c index 453fb8c..d4bf00c 100644 --- a/nsexeccwp.c +++ b/nsexeccwp.c @@ -304,6 +304,7 @@ int main(int argc, char *argv[]) perror("malloc"); return -1; } + stack += stacksize - 1; printf("about to clone with %lx\n", flags); if (chosen_pid) @@ -311,7 +312,8 @@ int main(int argc, char *argv[]) flags |= SIGCHLD; pid = clone_with_pids(do_child, stack, flags, &pid_set, (void *)argv); - if (pid == -1) { + if (pid < 0) { + errno = -pid; perror("clone"); return -1; } -- 1.6.1.1 ^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH linux-cr] implement s390 eclone syscall [not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 2009-11-13 5:24 ` [PATCH user-cr 1/2] use v13 of eclone in clone_s390x.c serue-r/Jw6+rmf7HQT0dZR+AlfA 2009-11-13 5:24 ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-16 23:36 ` Nathan Lynch [not found] ` <1258414596.4031.1058.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2 siblings, 1 reply; 17+ messages in thread From: Nathan Lynch @ 2009-11-16 23:36 UTC (permalink / raw) To: serue-r/Jw6+rmf7HQT0dZR+AlfA; +Cc: containers-qjLDD68F18O7TbgM5vRIOg On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > > This patch implements the s390 hook for sys_eclone. Here's powerpc (this is on top of serge's eclone-v13-s390x.2 branch). From 9c7ee027d1519a68308b20f5216a49eb43656ff6 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org> Date: Mon, 16 Nov 2009 16:37:29 -0600 Subject: [PATCH] implement eclone for powerpc Wired up for both ppc32 and ppc64, but tested only with the latter. 
Signed-off-by: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org> --- arch/powerpc/include/asm/syscalls.h | 6 ++++ arch/powerpc/include/asm/systbl.h | 1 + arch/powerpc/include/asm/unistd.h | 3 +- arch/powerpc/kernel/entry_32.S | 8 +++++ arch/powerpc/kernel/entry_64.S | 5 +++ arch/powerpc/kernel/process.c | 56 +++++++++++++++++++++++++++++++++++ 6 files changed, 78 insertions(+), 1 deletions(-) diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index eb8eb40..1674544 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -24,6 +24,12 @@ asmlinkage int sys_execve(unsigned long a0, unsigned long a1, asmlinkage int sys_clone(unsigned long clone_flags, unsigned long usp, int __user *parent_tidp, void __user *child_threadptr, int __user *child_tidp, int p6, struct pt_regs *regs); +asmlinkage int sys_eclone(unsigned long flags_low, + struct clone_args __user *args, + size_t args_size, + pid_t __user *pids, + unsigned long p5, unsigned long p6, + struct pt_regs *regs); asmlinkage int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4, unsigned long p5, unsigned long p6, struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index c7d671a..a7f67ee 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -326,3 +326,4 @@ SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) +PPC_SYS(eclone) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index f6ca761..37357a2 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -345,10 +345,11 @@ #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 +#define __NR_eclone 323 #ifdef __KERNEL__ -#define __NR_syscalls 323 +#define __NR_syscalls 324 #define __NR__exit __NR_exit #define NR_syscalls 
__NR_syscalls diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1175a85..579f1da 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -586,6 +586,14 @@ ppc_clone: stw r0,_TRAP(r1) /* register set saved */ b sys_clone + .globl ppc_eclone +ppc_eclone: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ + stw r0,_TRAP(r1) /* register set saved */ + b sys_eclone + .globl ppc_swapcontext ppc_swapcontext: SAVE_NVGPRS(r1) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index f9fd54b..1d6077e 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -344,6 +344,11 @@ _GLOBAL(ppc_clone) bl .sys_clone b syscall_exit +_GLOBAL(ppc_eclone) + bl .save_nvgprs + bl .sys_eclone + b syscall_exit + _GLOBAL(ppc32_swapcontext) bl .save_nvgprs bl .compat_sys_swapcontext diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2ec1eae..42d08cb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -900,6 +900,62 @@ int sys_clone(unsigned long clone_flags, unsigned long usp, return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); } +int sys_eclone(unsigned long clone_flags_low, + struct clone_args __user *uclone_args, + size_t size, + pid_t __user *upids, + unsigned long p5, unsigned long p6, + struct pt_regs *regs) +{ + struct clone_args kclone_args; + unsigned long stack_base; + int __user *parent_tidp; + int __user *child_tidp; + unsigned long stack_sz; + unsigned int nr_pids; + unsigned long flags; + unsigned long usp; + int rc; + + CHECK_FULL_REGS(regs); + + rc = fetch_clone_args_from_user(uclone_args, size, &kclone_args); + if (rc) + return rc; + + stack_sz = kclone_args.child_stack_size; + stack_base = kclone_args.child_stack; + + /* powerpc doesn't do anything useful with the stack size */ + if (stack_sz) + return -EINVAL; + + /* Interpret stack_base as the child 
sp if it is set. */ + usp = regs->gpr[1]; + if (stack_base) + usp = stack_base; + + /* High flags unused as yet */ + if (kclone_args.clone_flags_high) + return -EINVAL; + + flags = clone_flags_low | (kclone_args.clone_flags_high << 32); + + nr_pids = kclone_args.nr_pids; + + parent_tidp = (int __user *)kclone_args.parent_tid_ptr; + child_tidp = (int __user *)kclone_args.child_tid_ptr; + +#ifdef CONFIG_PPC64 + if (test_thread_flag(TIF_32BIT)) { + parent_tidp = TRUNC_PTR(parent_tidp); + child_tidp = TRUNC_PTR(child_tidp); + } +#endif + return do_fork_with_pids(flags, stack_base, regs, stack_sz, + parent_tidp, child_tidp, nr_pids, upids); +} + int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4, unsigned long p5, unsigned long p6, struct pt_regs *regs) -- 1.6.0.6 ^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <1258414596.4031.1058.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH linux-cr] implement s390 eclone syscall [not found] ` <1258414596.4031.1058.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2009-11-17 4:03 ` Serge E. Hallyn [not found] ` <20091117040321.GA32461-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-17 4:03 UTC (permalink / raw) To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org): > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote: > > From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > > > > This patch implements the s390 hook for sys_eclone. > > Here's powerpc (this is on top of serge's eclone-v13-s390x.2 branch). Pushed to branch eclone-v13-s390x.ppc of git://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux-cr.git thanks, -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <20091117040321.GA32461-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* [PATCH linux-cr] fix warnings in i386 sys_eclone [not found] ` <20091117040321.GA32461-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-17 21:33 ` Nathan Lynch 0 siblings, 0 replies; 17+ messages in thread From: Nathan Lynch @ 2009-11-17 21:33 UTC (permalink / raw) To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg With your eclone-v13-s390x.ppc.2 branch I get: arch/x86/kernel/process_32.c: In function ‘sys_eclone’: arch/x86/kernel/process_32.c:479: warning: cast to pointer from integer of different size arch/x86/kernel/process_32.c:480: warning: cast to pointer from integer of different size arch/x86/kernel/process_32.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 161ae4e..2bb8c1e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -476,8 +476,8 @@ int sys_eclone(struct pt_regs *regs) * flags = (kca.clone_flags_high << 32) | flags_low; */ flags = flags_low; - parent_tid_ptr = (int *)kca.parent_tid_ptr; - child_tid_ptr = (int *)kca.child_tid_ptr; + parent_tid_ptr = (int *)(unsigned long)kca.parent_tid_ptr; + child_tid_ptr = (int *)(unsigned long)kca.child_tid_ptr; stack_size = (unsigned long)kca.child_stack_size; if (stack_size) _______________________________________________ Containers mailing list Containers@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH user-cr 1/2] use Suka's v11 api
@ 2009-11-10 16:58 Serge E. Hallyn
[not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Serge E. Hallyn @ 2009-11-10 16:58 UTC (permalink / raw)
To: Linux Containers; +Cc: Nathan T Lynch
This patch:
1. changes restart to pass the right values to
clone-with-pids.
2. updates the clone_s390x.c to work with the
new kernel.
All tests under cr_tests/ pass with this patch.
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
clone_s390x.c | 92 +++++++++++++++++++++++++++++++++++++--------------------
restart.c | 14 +++++----
2 files changed, 68 insertions(+), 38 deletions(-)
diff --git a/clone_s390x.c b/clone_s390x.c
index dada822..71cf52f 100644
--- a/clone_s390x.c
+++ b/clone_s390x.c
@@ -14,6 +14,7 @@
#include <unistd.h>
#include <errno.h>
+#include <string.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
@@ -25,48 +26,75 @@
#include <linux/checkpoint.h>
#if defined(__NR_clone_with_pids)
-/* this really belongs to some kernel header ! */
struct pid_set {
int num_pids;
pid_t *pids;
};
-/* (see: http://lkml.indiana.edu/hypermail/linux/kernel/9604.3/0204.html) */
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+struct clone_args {
+ u64 clone_flags_high;
-#define do_clone_with_pids(stack, flags, ptid, ctid, setp) ({ \
- register unsigned long int __r2 asm ("2") = (unsigned long int)(stack);\
- register unsigned long int __r3 asm ("3") = (unsigned long int)(flags);\
- register unsigned long int __r4 asm ("4") = (unsigned long int)(ptid); \
- register unsigned long int __r5 asm ("5") = (unsigned long int)(ctid); \
- register unsigned long int __r6 asm ("6") = (unsigned long int)(NULL); \
- register unsigned long int __r7 asm ("7") = (unsigned long int)(setp); \
- register unsigned long int __result asm ("2"); \
- __asm__ __volatile__( \
- " lghi %%r1,%7\n" \
- " svc 0\n" \
- : "=d" (__result) \
- : "0" (__r2), "d" (__r3), \
- "d" (__r4), "d" (__r5), "d" (__r6), "d" (__r7), \
- "i" (__NR_clone_with_pids) \
- : "1", "cc", "memory" \
- ); \
- __result; \
- })
+ u64 child_stack_base;
+ u64 child_stack_size;
-int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+
+ u32 nr_pids;
+
+ u32 reserved0;
+ u64 reserved1;
+};
+
+#define do_cwp(flags, pids, args, sz) \
+( { \
+ register unsigned long int __r1 asm ("1") = (unsigned long int)(__NR_clone_with_pids); \
+ register unsigned long int __r2 asm ("2") = (unsigned long int)(flags); \
+ register unsigned long int __r3 asm ("3") = (unsigned long int)(args); \
+ register unsigned long int __r4 asm ("4") = (unsigned long int)(sz); \
+ register unsigned long int __r5 asm ("5") = (unsigned long int)(pids); \
+ register long int __result asm ("2"); \
+ __asm__ __volatile__( \
+ " svc 0\n" /* do __NR_cwp syscall */ \
+ " ltgr %%r2,%%r2\n" /* returned 0? */ \
+ " jnz 1f\n" /* if not goto label 1 */ \
+ " lg %%r3,0(%%r15)\n" /* get fn off stack into r3 for basr */ \
+ " lg %%r2,8(%%r15)\n" /* get fnarg off stack into arg 1 (r2) */ \
+ " lgr %%r1,%%r15\n" /* tmp store old stack pointer */ \
+ " aghi %%r15,-160\n" /* move the stack */ \
+ " stg %%r1,0(%%r15)\n" /* and save old stack pointer */ \
+ " basr %%r14,%%r3\n" /* call fn(arg) */ \
+ " svc 1\n" /* call exit */ \
+ " 1:\n" \
+ : "=d" (__result) \
+ : "d" (__r1), "0" (__r2), "d" (__r3), "d" (__r4), "d" (__r5) \
+ : "memory"); \
+ __result; \
+} )
+
+int clone_with_pids(int (*fn)(void *), void *child_stack,
+ unsigned long stack_size, unsigned long flags,
struct pid_set *target_pids, void *arg)
{
- long retval;
- retval = do_clone_with_pids(child_stack, flags, NULL, NULL,
- target_pids);
+ struct clone_args clone_args, *ca = &clone_args;
+ u64 *s;
+
+ memset(ca, 0, sizeof(struct clone_args));
+ ca->nr_pids = target_pids->num_pids;
+ ca->child_stack_size = stack_size - 16;
+ ca->child_stack_base = (u64) child_stack;
+ if (child_stack) {
+ s = (u64 *) (ca->child_stack_base + ca->child_stack_size);
+ *--s = (u64) arg;
+ *--s = (u64) fn;
+ ca->child_stack_size -= 16;
+ }
- if (retval < 0) {
- errno = -retval;
- return -1;
- } else if (retval == 0) {
- return fn(arg);
- } else
- return retval;
+ return do_cwp(flags, target_pids->pids, ca,
+ sizeof(struct clone_args));
}
#endif /* !defined(__NR_clone_with_pids) */
diff --git a/restart.c b/restart.c
index 35c54ea..ebc7bf8 100644
--- a/restart.c
+++ b/restart.c
@@ -43,10 +43,12 @@ struct pid_set {
/* (until it's supported by libc) from clone_ARCH.c */
#if defined(__NR_clone_with_pids) && defined(ARCH_HAS_CLONE_WITH_PID)
-extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+extern int clone_with_pids(int (*fn)(void *), void *child_stack,
+ unsigned long stack_size, int flags,
struct pid_set *target_pids, void *arg);
#else
-static int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+static int clone_with_pids(int (*fn)(void *), void *child_stack,
+ unsigned long stack_size, int flags,
struct pid_set *target_pids, void *arg)
{
return clone(fn, child_stack, flags, arg);
@@ -1749,18 +1751,17 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
{
struct pid_set pid_set;
char *stack_region;
- char *stack_start;
unsigned long flags = SIGCHLD;
+ unsigned long stack_size = PTHREAD_STACK_MIN;
pid_t pid = 0;
ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
- stack_region = malloc(PTHREAD_STACK_MIN);
+ stack_region = malloc(stack_size);
if (!stack_region) {
perror("stack malloc");
return -1;
}
- stack_start = stack_region + PTHREAD_STACK_MIN - 1;
pid_set.pids = &pid;
pid_set.num_pids = 1;
@@ -1788,7 +1789,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
else
child->real_parent = _getpid();
- pid = clone_with_pids(ckpt_fork_stub, stack_start, flags, &pid_set, child);
+ pid = clone_with_pids(ckpt_fork_stub, stack_region, stack_size - 16,
+ flags, &pid_set, child);
if (pid < 0) {
perror("clone");
free(stack_region);
--
1.6.1.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-10 16:59 ` Serge E. Hallyn [not found] ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-10 16:59 UTC (permalink / raw) To: Linux Containers; +Cc: Nathan T Lynch One of the concerns with clone-with-pids is whether the stack handling is all correct and robust enough to withstand real usage. Little testcases playing with pid values are also necessary, but can't replace really using clone-with-pids to start a shell from which to keep working. This patch tweaks the old ns_exec.c namespace manipulation program to add a -z option to specify a pid. So you can: nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns mount -t proc proc /proc # mount private /proc echo $$ 1 nsexeccwp -z 999 /bin/bash # start a shell with pid 999 echo $$ 999 Signed-off-by: Serge E. 
Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> --- Makefile | 5 +- clone.h | 54 +++++++++ nsexeccwp.c | 352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 410 insertions(+), 1 deletions(-) create mode 100644 clone.h create mode 100644 nsexeccwp.c diff --git a/Makefile b/Makefile index 181cc1c..32a6893 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG) # install dir INSTALL_DIR = /bin -PROGS = checkpoint restart ckptinfo +PROGS = checkpoint restart ckptinfo nsexeccwp # other cleanup OTHER = ckptinfo_types.c @@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread ifneq ($(SUBARCH),) restart: clone_$(SUBARCH).o restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID +nsexeccwp: clone_$(SUBARCH).o +nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID endif # on powerpc, need also assembly file ifeq ($(SUBARCH),ppc) restart: clone_$(SUBARCH)_.o +nsexeccwp: clone_$(SUBARCH)_.o endif # ckptinfo dependencies diff --git a/clone.h b/clone.h new file mode 100644 index 0000000..3569a45 --- /dev/null +++ b/clone.h @@ -0,0 +1,54 @@ +#ifndef CLONE_H +#define CLONE_H +/* + * Copyright (C) 2007 IBM Corporation + * + * Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ * + */ +#include <sys/syscall.h> + +#ifndef HAVE_UNSHARE + +#if __i386__ +# define __NR_unshare 310 +#elif __x86_64__ +# define __NR_unshare 272 +#elif __ia64__ +# define __NR_unshare 1296 +#elif __s390x__ +# define __NR_unshare 303 +#elif __powerpc__ +# define __NR_unshare 282 +#else +# error "Architecture not supported" +#endif + +#endif /* HAVE_UNSHARE */ + +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 +#endif + +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif + +#ifndef CLONE_NEWPID +#define CLONE_NEWPID 0x20000000 +#endif + +#ifndef CLONE_NEWNET +#define CLONE_NEWNET 0x40000000 +#endif + +#endif /* CLONE_H */ diff --git a/nsexeccwp.c b/nsexeccwp.c new file mode 100644 index 0000000..f14b8b0 --- /dev/null +++ b/nsexeccwp.c @@ -0,0 +1,352 @@ +/* + * Copyright 2008,2009 IBM Corp. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sched.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> +#include <errno.h> +#include <libgen.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "clone.h" + +struct pid_set { + int num_pids; + pid_t *pids; +}; + +typedef unsigned long long u64; +typedef unsigned int u32; +typedef int pid_t; +struct clone_args { + u64 clone_flags_high; + + u64 child_stack_base; + u64 child_stack_size; + + u64 parent_tid_ptr; + u64 child_tid_ptr; + + u32 nr_pids; + + u32 reserved0; + u64 reserved1; +}; +extern int clone_with_pids(int (*fn)(void *), void *child_stack, + unsigned long stack_size, unsigned long flags, + struct pid_set *target_pids, void *arg); + +extern pid_t getpgid(pid_t pid); +extern pid_t getsid(pid_t pid); + +static const char* procname; + +static void usage(const char *name) +{ + printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]" + "[command [arg ..]]\n", name); + printf("\n"); + printf(" -h this message\n"); + printf("\n"); + printf(" -z 
<pid> use clone_with_pids and specify chosen pid\n"); + printf(" Note that -z and -p are not compatible\n"); + printf(" -c use 'clone' rather than 'unshare' system call\n"); + printf(" -g launch in new cgroup\n"); + printf(" -m mount namespace\n"); + printf(" -n network namespace\n"); + printf(" -u utsname namespace\n"); + printf(" -U userid namespace\n"); + printf(" -i ipc namespace\n"); + printf(" -P <pid-file> File in which to write global pid of cinit\n"); + printf(" -p pid namespace\n"); + printf(" -f <flag> extra clone flags\n"); + printf("\n"); + printf("(C) Copyright IBM Corp. 2006\n"); + printf("\n"); + exit(1); +} + +static int string_to_ul(const char *str, unsigned long int *res) +{ + char *tail; + long long int r; + + if (!*str) + return -1; + + errno = 0; + + r = strtol(str, &tail, 16); + + /* + * according to strtol(3), if errno is set or tail does no point + * to the ending '\0', the conversion failed. + */ + if (errno || *tail) + return -1; + + *res = r; + return 0; +} + +/* + * Copied following opentty() from Fedora's util-linux rpm + * I just changed the "FATAL" message below from syslog() + * to printf + */ +static void +opentty(const char * tty) { + int i, fd, flags; + + fd = open(tty, O_RDWR | O_NONBLOCK); + if (fd == -1) { + printf("FATAL: can't reopen tty: %s", strerror(errno)); + sleep(1); + exit(1); + } + + flags = fcntl(fd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(fd, F_SETFL, flags); + + for (i = 0; i < fd; i++) + close(i); + for (i = 0; i < 3; i++) + if (fd != i) + dup2(fd, i); + if (fd >= 3) + close(fd); +} +// Code copy end + +int do_newcgrp = 0; + +int load_cgroup_dir(char *dest, int len) +{ + FILE *f = fopen("/proc/mounts", "r"); + char buf[200]; + char *name, *path, *fsname, *options, *p1, *p2, *s; + if (!f) + return 0; + while (fgets(buf, 200, f)) { + name = strtok_r(buf, " ", &p1); + path = strtok_r(NULL, " ", &p1); + fsname = strtok_r(NULL, " ", &p1); + options = strtok_r(NULL, " ", &p1); + if (strcmp(fsname, "cgroup") != 0) + 
continue; + + /* make sure the freezer is composed */ + s = strtok_r(options, ",", &p2); + while (s && strcmp(s, "freezer") != 0) + s = strtok_r(NULL, ",", &p2); + if (!s) + continue; + strncpy(dest, path, len); + fclose(f); + return 1; + } + fclose(f); + printf("Freezer not mounted\n"); + return 0; +} + +int move_to_new_cgroup(int newcgroup) +{ + char cgroupname[150], cgroupbase[100], tasksfname[200]; + FILE *fout; + int ret; + + if (!load_cgroup_dir(cgroupbase, 100)) + return 0; + + snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup); + ret = mkdir(cgroupname, 0755); + if (ret) + return 0; + snprintf(tasksfname, 200, "%s/tasks", cgroupname); + fout = fopen(tasksfname, "w"); + if (!fout) + return 0; + fprintf(fout, "%d\n", getpid()); + fclose(fout); + return 1; +} + +int pipefd[2]; + +/* gah. opentty will close the pipefd */ +int check_newcgrp(void) +{ + int ret, newgroup; + char buf[20]; + + if (!do_newcgrp) + return 0; + + close(pipefd[1]); + ret = read(pipefd[0], buf, 20); + close(pipefd[0]); + if (ret == -1) { + perror("read"); + return 1; + } + newgroup = atoi(buf); + if (!move_to_new_cgroup(newgroup)) + return 1; + do_newcgrp = 0; + return 0; +} + +int do_child(void *vargv) +{ + char **argv = (char **)vargv; + + if (check_newcgrp()) + return 1; + + execve(argv[0], argv, __environ); + perror("execve"); + return 1; +} + +void write_pid(char *pid_file, int pid) +{ + FILE *fp; + + if (!pid_file) + return; + + fp = fopen(pid_file, "w"); + if (!fp) { + perror("fopen, pid_file"); + exit(1); + } + fprintf(fp, "%d", pid); + fflush(fp); + fclose(fp); +} + +int main(int argc, char *argv[]) +{ + int c; + unsigned long flags = 0, eflags = 0; + char ttyname[256]; + int status; + int ret, use_clone = 0; + int pid; + char *pid_file = NULL; + struct pid_set pid_set; + int chosen_pid = 0; + + pid_set.num_pids = 1; + pid_set.pids = &chosen_pid; + + procname = basename(argv[0]); + + memset(ttyname, '\0', sizeof(ttyname)); + readlink("/proc/self/fd/0", ttyname, 
sizeof(ttyname)); + + while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) { + switch (c) { + case 'g': do_newcgrp = getpid(); break; + case 'm': flags |= CLONE_NEWNS; break; + case 'c': use_clone = 1; break; + case 'P': pid_file = optarg; break; + case 'u': flags |= CLONE_NEWUTS; break; + case 'i': flags |= CLONE_NEWIPC; break; + case 'U': flags |= CLONE_NEWUSER; break; + case 'n': flags |= CLONE_NEWNET; break; + case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID; break; + case 'z': chosen_pid = atoi(optarg); break; + case 'f': if (!string_to_ul(optarg, &eflags)) { + flags |= eflags; + break; + } + case 'h': + default: + usage(procname); + } + }; + + if (chosen_pid) { + use_clone = 1; + if (flags & CLONE_NEWPID) { + printf("Error: can't use CLONE_NEWPID and pick a pid\n"); + exit(1); + } + } + argv = &argv[optind]; + argc = argc - optind; + + if (do_newcgrp) { + ret = pipe(pipefd); + if (ret) { + perror("pipe"); + return -1; + } + do_newcgrp = pipefd[0]; + } + + if (use_clone) { + int stacksize = 4*getpagesize(); + void *stack = malloc(stacksize); + + if (!stack) { + perror("malloc"); + return -1; + } + + printf("about to clone with %lx\n", flags); + if (chosen_pid) + printf("Will choose pid %d\n", chosen_pid); + flags |= SIGCHLD; + pid = clone_with_pids(do_child, stack, stacksize, flags, + &pid_set, (void *)argv); + if (pid == -1) { + perror("clone"); + return -1; + } + } else { + if ((pid = fork()) == 0) { + // Child. 
+ //print_my_info(procname, ttyname); + + if (check_newcgrp()) + return 1; + opentty(ttyname); + + printf("about to unshare with %lx\n", flags); + ret = unshare(flags); + if (ret < 0) { + perror("unshare"); + return 1; + } + + return do_child((void*)argv); + } + + } + if (pid != -1 && do_newcgrp) { + char buf[20]; + snprintf(buf, 20, "%d", pid); + close(pipefd[0]); + write(pipefd[1], buf, strlen(buf)+1); + close(pipefd[1]); + } + + write_pid(pid_file, pid); + + if ((ret = waitpid(pid, &status, __WALL)) < 0) + printf("waitpid() returns %d, errno %d\n", ret, errno); + + exit(0); +} -- 1.6.1.1 ^ permalink raw reply related [flat|nested] 17+ messages in thread
[parent not found: <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-11-25 18:46 ` Oren Laadan [not found] ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 17+ messages in thread From: Oren Laadan @ 2009-11-25 18:46 UTC (permalink / raw) To: Serge E. Hallyn; +Cc: Linux Containers, Nathan T Lynch Ok, will add this to user-cr (v19-rc2). BTW, where is the original nsexec source maintained ? Oren. Serge E. Hallyn wrote: > One of the concerns with clone-with-pids is whether the > stack handling is all correct and robust enough to withstand > real usage. Little testcases playing with pid values are > also necessary, but can't replace really using clone-with-pids > to start a shell from which to keep working. > > This patch tweaks the old ns_exec.c namespace manipulation > program to add a -z option to specify a pid. So you can: > > nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns > mount -t proc proc /proc # mount private /proc > echo $$ > 1 > nsexeccwp -z 999 /bin/bash # start a shell with pid 999 > echo $$ > 999 > > Signed-off-by: Serge E.
Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > --- > Makefile | 5 +- > clone.h | 54 +++++++++ > nsexeccwp.c | 352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 410 insertions(+), 1 deletions(-) > create mode 100644 clone.h > create mode 100644 nsexeccwp.c > > diff --git a/Makefile b/Makefile > index 181cc1c..32a6893 100644 > --- a/Makefile > +++ b/Makefile > @@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG) > # install dir > INSTALL_DIR = /bin > > -PROGS = checkpoint restart ckptinfo > +PROGS = checkpoint restart ckptinfo nsexeccwp > > # other cleanup > OTHER = ckptinfo_types.c > @@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread > ifneq ($(SUBARCH),) > restart: clone_$(SUBARCH).o > restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID > +nsexeccwp: clone_$(SUBARCH).o > +nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID > endif > > # on powerpc, need also assembly file > ifeq ($(SUBARCH),ppc) > restart: clone_$(SUBARCH)_.o > +nsexeccwp: clone_$(SUBARCH)_.o > endif > > # ckptinfo dependencies > diff --git a/clone.h b/clone.h > new file mode 100644 > index 0000000..3569a45 > --- /dev/null > +++ b/clone.h > @@ -0,0 +1,54 @@ > +#ifndef CLONE_H > +#define CLONE_H > +/* > + * Copyright (C) 2007 IBM Corporation > + * > + * Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License as > + * published by the Free Software Foundation, version 2 of the > + * License. 
> + * > + */ > +#include <sys/syscall.h> > + > +#ifndef HAVE_UNSHARE > + > +#if __i386__ > +# define __NR_unshare 310 > +#elif __x86_64__ > +# define __NR_unshare 272 > +#elif __ia64__ > +# define __NR_unshare 1296 > +#elif __s390x__ > +# define __NR_unshare 303 > +#elif __powerpc__ > +# define __NR_unshare 282 > +#else > +# error "Architecture not supported" > +#endif > + > +#endif /* HAVE_UNSHARE */ > + > +#ifndef CLONE_NEWUTS > +#define CLONE_NEWUTS 0x04000000 > +#endif > + > +#ifndef CLONE_NEWIPC > +#define CLONE_NEWIPC 0x08000000 > +#endif > + > +#ifndef CLONE_NEWUSER > +#define CLONE_NEWUSER 0x10000000 > +#endif > + > +#ifndef CLONE_NEWPID > +#define CLONE_NEWPID 0x20000000 > +#endif > + > +#ifndef CLONE_NEWNET > +#define CLONE_NEWNET 0x40000000 > +#endif > + > +#endif /* CLONE_H */ > diff --git a/nsexeccwp.c b/nsexeccwp.c > new file mode 100644 > index 0000000..f14b8b0 > --- /dev/null > +++ b/nsexeccwp.c > @@ -0,0 +1,352 @@ > +/* > + * Copyright 2008,2009 IBM Corp. > + */ > + > +#include <stdio.h> > +#include <stdlib.h> > +#include <sched.h> > +#include <sys/syscall.h> > +#include <unistd.h> > +#include <signal.h> > +#include <string.h> > +#include <errno.h> > +#include <libgen.h> > +#include <fcntl.h> > +#include <sys/stat.h> > +#include <sys/types.h> > +#include <sys/wait.h> > + > +#include "clone.h" > + > +struct pid_set { > + int num_pids; > + pid_t *pids; > +}; > + > +typedef unsigned long long u64; > +typedef unsigned int u32; > +typedef int pid_t; > +struct clone_args { > + u64 clone_flags_high; > + > + u64 child_stack_base; > + u64 child_stack_size; > + > + u64 parent_tid_ptr; > + u64 child_tid_ptr; > + > + u32 nr_pids; > + > + u32 reserved0; > + u64 reserved1; > +}; > +extern int clone_with_pids(int (*fn)(void *), void *child_stack, > + unsigned long stack_size, unsigned long flags, > + struct pid_set *target_pids, void *arg); > + > +extern pid_t getpgid(pid_t pid); > +extern pid_t getsid(pid_t pid); > + > +static const char* procname; > + > +static 
void usage(const char *name) > +{ > + printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]" > + "[command [arg ..]]\n", name); > + printf("\n"); > + printf(" -h this message\n"); > + printf("\n"); > + printf(" -z <pid> use clone_with_pids and specify chosen pid\n"); > + printf(" Note that -z and -p are not compatible\n"); > + printf(" -c use 'clone' rather than 'unshare' system call\n"); > + printf(" -g launch in new cgroup\n"); > + printf(" -m mount namespace\n"); > + printf(" -n network namespace\n"); > + printf(" -u utsname namespace\n"); > + printf(" -U userid namespace\n"); > + printf(" -i ipc namespace\n"); > + printf(" -P <pid-file> File in which to write global pid of cinit\n"); > + printf(" -p pid namespace\n"); > + printf(" -f <flag> extra clone flags\n"); > + printf("\n"); > + printf("(C) Copyright IBM Corp. 2006\n"); > + printf("\n"); > + exit(1); > +} > + > +static int string_to_ul(const char *str, unsigned long int *res) > +{ > + char *tail; > + long long int r; > + > + if (!*str) > + return -1; > + > + errno = 0; > + > + r = strtol(str, &tail, 16); > + > + /* > + * according to strtol(3), if errno is set or tail does no point > + * to the ending '\0', the conversion failed. 
> + */ > + if (errno || *tail) > + return -1; > + > + *res = r; > + return 0; > +} > + > +/* > + * Copied following opentty() from Fedora's util-linux rpm > + * I just changed the "FATAL" message below from syslog() > + * to printf > + */ > +static void > +opentty(const char * tty) { > + int i, fd, flags; > + > + fd = open(tty, O_RDWR | O_NONBLOCK); > + if (fd == -1) { > + printf("FATAL: can't reopen tty: %s", strerror(errno)); > + sleep(1); > + exit(1); > + } > + > + flags = fcntl(fd, F_GETFL); > + flags &= ~O_NONBLOCK; > + fcntl(fd, F_SETFL, flags); > + > + for (i = 0; i < fd; i++) > + close(i); > + for (i = 0; i < 3; i++) > + if (fd != i) > + dup2(fd, i); > + if (fd >= 3) > + close(fd); > +} > +// Code copy end > + > +int do_newcgrp = 0; > + > +int load_cgroup_dir(char *dest, int len) > +{ > + FILE *f = fopen("/proc/mounts", "r"); > + char buf[200]; > + char *name, *path, *fsname, *options, *p1, *p2, *s; > + if (!f) > + return 0; > + while (fgets(buf, 200, f)) { > + name = strtok_r(buf, " ", &p1); > + path = strtok_r(NULL, " ", &p1); > + fsname = strtok_r(NULL, " ", &p1); > + options = strtok_r(NULL, " ", &p1); > + if (strcmp(fsname, "cgroup") != 0) > + continue; > + > + /* make sure the freezer is composed */ > + s = strtok_r(options, ",", &p2); > + while (s && strcmp(s, "freezer") != 0) > + s = strtok_r(NULL, ",", &p2); > + if (!s) > + continue; > + strncpy(dest, path, len); > + fclose(f); > + return 1; > + } > + fclose(f); > + printf("Freezer not mounted\n"); > + return 0; > +} > + > +int move_to_new_cgroup(int newcgroup) > +{ > + char cgroupname[150], cgroupbase[100], tasksfname[200]; > + FILE *fout; > + int ret; > + > + if (!load_cgroup_dir(cgroupbase, 100)) > + return 0; > + > + snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup); > + ret = mkdir(cgroupname, 0755); > + if (ret) > + return 0; > + snprintf(tasksfname, 200, "%s/tasks", cgroupname); > + fout = fopen(tasksfname, "w"); > + if (!fout) > + return 0; > + fprintf(fout, "%d\n", getpid()); > + 
fclose(fout); > + return 1; > +} > + > +int pipefd[2]; > + > +/* gah. opentty will close the pipefd */ > +int check_newcgrp(void) > +{ > + int ret, newgroup; > + char buf[20]; > + > + if (!do_newcgrp) > + return 0; > + > + close(pipefd[1]); > + ret = read(pipefd[0], buf, 20); > + close(pipefd[0]); > + if (ret == -1) { > + perror("read"); > + return 1; > + } > + newgroup = atoi(buf); > + if (!move_to_new_cgroup(newgroup)) > + return 1; > + do_newcgrp = 0; > + return 0; > +} > + > +int do_child(void *vargv) > +{ > + char **argv = (char **)vargv; > + > + if (check_newcgrp()) > + return 1; > + > + execve(argv[0], argv, __environ); > + perror("execve"); > + return 1; > +} > + > +void write_pid(char *pid_file, int pid) > +{ > + FILE *fp; > + > + if (!pid_file) > + return; > + > + fp = fopen(pid_file, "w"); > + if (!fp) { > + perror("fopen, pid_file"); > + exit(1); > + } > + fprintf(fp, "%d", pid); > + fflush(fp); > + fclose(fp); > +} > + > +int main(int argc, char *argv[]) > +{ > + int c; > + unsigned long flags = 0, eflags = 0; > + char ttyname[256]; > + int status; > + int ret, use_clone = 0; > + int pid; > + char *pid_file = NULL; > + struct pid_set pid_set; > + int chosen_pid = 0; > + > + pid_set.num_pids = 1; > + pid_set.pids = &chosen_pid; > + > + procname = basename(argv[0]); > + > + memset(ttyname, '\0', sizeof(ttyname)); > + readlink("/proc/self/fd/0", ttyname, sizeof(ttyname)); > + > + while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) { > + switch (c) { > + case 'g': do_newcgrp = getpid(); break; > + case 'm': flags |= CLONE_NEWNS; break; > + case 'c': use_clone = 1; break; > + case 'P': pid_file = optarg; break; > + case 'u': flags |= CLONE_NEWUTS; break; > + case 'i': flags |= CLONE_NEWIPC; break; > + case 'U': flags |= CLONE_NEWUSER; break; > + case 'n': flags |= CLONE_NEWNET; break; > + case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID; break; > + case 'z': chosen_pid = atoi(optarg); break; > + case 'f': if (!string_to_ul(optarg, &eflags)) { > + flags 
|= eflags; > + break; > + } > + case 'h': > + default: > + usage(procname); > + } > + }; > + > + if (chosen_pid) { > + use_clone = 1; > + if (flags & CLONE_NEWPID) { > + printf("Error: can't use CLONE_NEWPID and pick a pid\n"); > + exit(1); > + } > + } > + argv = &argv[optind]; > + argc = argc - optind; > + > + if (do_newcgrp) { > + ret = pipe(pipefd); > + if (ret) { > + perror("pipe"); > + return -1; > + } > + do_newcgrp = pipefd[0]; > + } > + > + if (use_clone) { > + int stacksize = 4*getpagesize(); > + void *stack = malloc(stacksize); > + > + if (!stack) { > + perror("malloc"); > + return -1; > + } > + > + printf("about to clone with %lx\n", flags); > + if (chosen_pid) > + printf("Will choose pid %d\n", chosen_pid); > + flags |= SIGCHLD; > + pid = clone_with_pids(do_child, stack, stacksize, flags, > + &pid_set, (void *)argv); > + if (pid == -1) { > + perror("clone"); > + return -1; > + } > + } else { > + if ((pid = fork()) == 0) { > + // Child. > + //print_my_info(procname, ttyname); > + > + if (check_newcgrp()) > + return 1; > + opentty(ttyname); > + > + printf("about to unshare with %lx\n", flags); > + ret = unshare(flags); > + if (ret < 0) { > + perror("unshare"); > + return 1; > + } > + > + return do_child((void*)argv); > + } > + > + } > + if (pid != -1 && do_newcgrp) { > + char buf[20]; > + snprintf(buf, 20, "%d", pid); > + close(pipefd[0]); > + write(pipefd[1], buf, strlen(buf)+1); > + close(pipefd[1]); > + } > + > + write_pid(pid_file, pid); > + > + if ((ret = waitpid(pid, &status, __WALL)) < 0) > + printf("waitpid() returns %d, errno %d\n", ret, errno); > + > + exit(0); > +} ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids [not found] ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-11-25 19:24 ` Serge E. Hallyn 0 siblings, 0 replies; 17+ messages in thread From: Serge E. Hallyn @ 2009-11-25 19:24 UTC (permalink / raw) To: Oren Laadan; +Cc: Linux Containers, Nathan T Lynch Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org): > > Ok, will add this to user-cr (v19-rc2). > > BTW, where is the original nsexec source maintained ? It isn't really 'maintained'. Used to be kept at lxc.sf.net, and right now a copy is in the cr_tests git tree. -serge ^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2009-11-25 19:24 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-13 5:24 [PATCH linux-cr] implement s390 eclone syscall serue-r/Jw6+rmf7HQT0dZR+AlfA
[not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-13 5:24 ` [PATCH user-cr 1/2] use v13 of eclone in clone_s390x.c serue-r/Jw6+rmf7HQT0dZR+AlfA
2009-11-13 5:24 ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids serue-r/Jw6+rmf7HQT0dZR+AlfA
[not found] ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-13 21:08 ` Serge E. Hallyn
2009-11-15 22:45 ` Nathan Lynch
[not found] ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-16 11:12 ` Serge E. Hallyn
[not found] ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-15 23:49 ` Nathan Lynch
[not found] ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-16 18:26 ` Serge E. Hallyn
[not found] ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-16 23:18 ` Nathan Lynch
[not found] ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-17 4:05 ` Serge E. Hallyn
2009-11-16 14:45 ` Serge E. Hallyn
2009-11-16 23:36 ` [PATCH linux-cr] implement s390 eclone syscall Nathan Lynch
[not found] ` <1258414596.4031.1058.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-17 4:03 ` Serge E. Hallyn
[not found] ` <20091117040321.GA32461-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-17 21:33 ` [PATCH linux-cr] fix warnings in i386 sys_eclone Nathan Lynch
-- strict thread matches above, loose matches on Subject: below --
2009-11-10 16:58 [PATCH user-cr 1/2] use Suka's v11 api Serge E. Hallyn
[not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-10 16:59 ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids Serge E. Hallyn
[not found] ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-25 18:46 ` Oren Laadan
[not found] ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-11-25 19:24 ` Serge E. Hallyn
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.