All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH user-cr 1/2] use Suka's v11 api
@ 2009-11-10 16:58 Serge E. Hallyn
       [not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-10 16:58 UTC (permalink / raw)
  To: Linux Containers; +Cc: Nathan T Lynch

This patch:
	1. changes restart to pass the right values to
		clone-with-pids.
	2. updates the clone_s390x.c to work with the
		new kernel.

All tests under cr_tests/ pass with this patch.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 clone_s390x.c |   92 +++++++++++++++++++++++++++++++++++++--------------------
 restart.c     |   14 +++++----
 2 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/clone_s390x.c b/clone_s390x.c
index dada822..71cf52f 100644
--- a/clone_s390x.c
+++ b/clone_s390x.c
@@ -14,6 +14,7 @@
 
 #include <unistd.h>
 #include <errno.h>
+#include <string.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <asm/unistd.h>
@@ -25,48 +26,75 @@
 #include <linux/checkpoint.h>
 #if defined(__NR_clone_with_pids)
 
-/* this really belongs to some kernel header ! */
 struct pid_set {
 	int num_pids;
 	pid_t *pids;
 };
 
-/* (see: http://lkml.indiana.edu/hypermail/linux/kernel/9604.3/0204.html) */
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+struct clone_args {
+	u64 clone_flags_high;
 
-#define do_clone_with_pids(stack, flags, ptid, ctid, setp) ({ \
-	register unsigned long int __r2 asm ("2") = (unsigned long int)(stack);\
-	register unsigned long int __r3 asm ("3") = (unsigned long int)(flags);\
-	register unsigned long int __r4 asm ("4") = (unsigned long int)(ptid); \
-	register unsigned long int __r5 asm ("5") = (unsigned long int)(ctid); \
-	register unsigned long int __r6 asm ("6") = (unsigned long int)(NULL); \
-	register unsigned long int __r7 asm ("7") = (unsigned long int)(setp); \
-	register unsigned long int __result asm ("2"); \
-	__asm__ __volatile__( \
-		" lghi %%r1,%7\n" \
-		" svc 0\n" \
-		: "=d" (__result) \
-		: "0" (__r2), "d" (__r3), \
-		  "d" (__r4), "d" (__r5), "d" (__r6), "d" (__r7), \
-		  "i" (__NR_clone_with_pids) \
-		: "1", "cc", "memory" \
-	); \
-		__result; \
-	})
+	u64 child_stack_base;
+	u64 child_stack_size;
 
-int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+	u64 parent_tid_ptr;
+	u64 child_tid_ptr;
+
+	u32 nr_pids;
+
+	u32 reserved0;
+	u64 reserved1;
+};
+
+#define do_cwp(flags, pids, args, sz) \
+( { \
+  register unsigned long int __r1 asm ("1") = (unsigned long int)(__NR_clone_with_pids); \
+  register unsigned long int __r2 asm ("2") = (unsigned long int)(flags); \
+  register unsigned long int __r3 asm ("3") = (unsigned long int)(args); \
+  register unsigned long int __r4 asm ("4") = (unsigned long int)(sz); \
+  register unsigned long int __r5 asm ("5") = (unsigned long int)(pids); \
+  register long int __result asm ("2"); \
+  __asm__ __volatile__( \
+	  " svc 0\n" /* do __NR_cwp syscall */ \
+	  " ltgr %%r2,%%r2\n" /* returned 0? */ \
+	  " jnz 1f\n" /* if not goto label 1 */ \
+	  " lg %%r3,0(%%r15)\n"   /* get fnarg off stack into arg 1 */ \
+	  " lg %%r2,8(%%r15)\n"   /* get fn off stack int r3 basr*/ \
+	  " lgr %%r1,%%r15\n" /* tmp store old stack pointer */ \
+	  " aghi %%r15,-160\n" /* move the stack */ \
+	  " stg %%r1,0(%%r15)\n" /* and save old stack pointer */ \
+	  " basr %%r14,%%r3\n" /* call fn(arg) */ \
+	  " svc 1\n"  /* call exit */ \
+	  " 1:\n" \
+	  : "=d" (__result) \
+	  : "d" (__r1), "0" (__r2), "d" (__r3), "d" (__r4), "d" (__r5) \
+	  : "memory"); \
+	__result; \
+} )
+
+int clone_with_pids(int (*fn)(void *), void *child_stack,
+			unsigned long stack_size, unsigned long flags,
 			struct pid_set *target_pids, void *arg)
 {
-	long retval;
-	retval = do_clone_with_pids(child_stack, flags, NULL, NULL,
-				    target_pids);
+	struct clone_args clone_args, *ca = &clone_args;
+	u64 *s;
+
+	memset(ca, 0, sizeof(struct clone_args));
+	ca->nr_pids = target_pids->num_pids;
+	ca->child_stack_size = stack_size - 16;
+	ca->child_stack_base = (u64) child_stack;
+	if (child_stack) {
+		s = (u64 *) (ca->child_stack_base + ca->child_stack_size);
+		*--s = (u64) arg;
+		*--s = (u64) fn;
+		ca->child_stack_size -= 16;
+	}
 
-	if (retval < 0) {
-		errno = -retval;
-		return -1;
-	} else if (retval == 0) {
-		return fn(arg);
-	} else
-		return retval;
+	return do_cwp(flags, target_pids->pids, ca,
+				    sizeof(struct clone_args));
 }
 
 #endif  /* !defined(__NR_clone_with_pids) */
diff --git a/restart.c b/restart.c
index 35c54ea..ebc7bf8 100644
--- a/restart.c
+++ b/restart.c
@@ -43,10 +43,12 @@ struct pid_set {
 
 /* (until it's supported by libc) from clone_ARCH.c */
 #if defined(__NR_clone_with_pids) && defined(ARCH_HAS_CLONE_WITH_PID)
-extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+extern int clone_with_pids(int (*fn)(void *), void *child_stack,
+			   unsigned long stack_size, int flags,
 			   struct pid_set *target_pids, void *arg);
 #else
-static int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+static int clone_with_pids(int (*fn)(void *), void *child_stack,
+			    unsigned long stack_size, int flags,
 			   struct pid_set *target_pids, void *arg)
 {
 	return clone(fn, child_stack, flags, arg);
@@ -1749,18 +1751,17 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 {
 	struct pid_set pid_set;
 	char *stack_region;
-	char *stack_start;
 	unsigned long flags = SIGCHLD;
+	unsigned long stack_size = PTHREAD_STACK_MIN;
 	pid_t pid = 0;
 
 	ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
 
-	stack_region = malloc(PTHREAD_STACK_MIN);
+	stack_region = malloc(stack_size);
 	if (!stack_region) {
 		perror("stack malloc");
 		return -1;
 	}
-	stack_start = stack_region + PTHREAD_STACK_MIN - 1;
 
 	pid_set.pids = &pid;
 	pid_set.num_pids = 1;
@@ -1788,7 +1789,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 	else
 		child->real_parent = _getpid();
 
-	pid = clone_with_pids(ckpt_fork_stub, stack_start, flags, &pid_set, child);
+	pid = clone_with_pids(ckpt_fork_stub, stack_region, stack_size - 16,
+				flags, &pid_set, child);
 	if (pid < 0) {
 		perror("clone");
 		free(stack_region);
-- 
1.6.1.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-10 16:59   ` Serge E. Hallyn
       [not found]     ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-10 16:59 UTC (permalink / raw)
  To: Linux Containers; +Cc: Nathan T Lynch

One of the concerns with clone-with-pids is whether the
stack handling is correct and robust enough to withstand
real usage.  Small testcases that play with pid values are
also necessary, but they can't replace actually using
clone-with-pids to start a shell and keep working from it.

This patch tweaks the old ns_exec.c namespace manipulation
program to add a -z option to specify a pid.  So you can:

	nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns
	mount -t proc proc /proc # mount private /proc
	echo $$
		1
	nsexeccwp -z 999 /bin/bash   #  start a shell with pid 999
	echo $$
		999

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 Makefile    |    5 +-
 clone.h     |   54 +++++++++
 nsexeccwp.c |  352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 410 insertions(+), 1 deletions(-)
 create mode 100644 clone.h
 create mode 100644 nsexeccwp.c

diff --git a/Makefile b/Makefile
index 181cc1c..32a6893 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG)
 # install dir
 INSTALL_DIR = /bin
 
-PROGS =	checkpoint restart ckptinfo
+PROGS =	checkpoint restart ckptinfo nsexeccwp
 
 # other cleanup
 OTHER = ckptinfo_types.c
@@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread
 ifneq ($(SUBARCH),)
 restart: clone_$(SUBARCH).o
 restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
+nsexeccwp: clone_$(SUBARCH).o
+nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
 endif
 
 # on powerpc, need also assembly file
 ifeq ($(SUBARCH),ppc)
 restart: clone_$(SUBARCH)_.o
+nsexeccwp: clone_$(SUBARCH)_.o
 endif
 
 # ckptinfo dependencies
diff --git a/clone.h b/clone.h
new file mode 100644
index 0000000..3569a45
--- /dev/null
+++ b/clone.h
@@ -0,0 +1,54 @@
+#ifndef CLONE_H
+#define CLONE_H
+/*
+ *  Copyright (C) 2007 IBM Corporation
+ *
+ *  Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ *
+ */
+#include <sys/syscall.h>
+
+#ifndef HAVE_UNSHARE
+
+#if __i386__
+#    define __NR_unshare 310
+#elif __x86_64__
+#    define __NR_unshare 272
+#elif __ia64__
+#    define __NR_unshare 1296
+#elif __s390x__
+#    define __NR_unshare 303
+#elif __powerpc__
+#    define __NR_unshare 282
+#else
+#    error "Architecture not supported"
+#endif
+
+#endif /* HAVE_UNSHARE */
+
+#ifndef CLONE_NEWUTS
+#define CLONE_NEWUTS		0x04000000
+#endif
+
+#ifndef CLONE_NEWIPC
+#define CLONE_NEWIPC		0x08000000
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER		0x10000000
+#endif
+
+#ifndef CLONE_NEWPID
+#define CLONE_NEWPID		0x20000000
+#endif
+
+#ifndef CLONE_NEWNET
+#define CLONE_NEWNET		0x40000000
+#endif
+
+#endif /* CLONE_H */
diff --git a/nsexeccwp.c b/nsexeccwp.c
new file mode 100644
index 0000000..f14b8b0
--- /dev/null
+++ b/nsexeccwp.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2008,2009 IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#include <libgen.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "clone.h"
+
+struct pid_set {
+	int num_pids;
+	pid_t *pids;
+};
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+struct clone_args {
+	u64 clone_flags_high;
+
+	u64 child_stack_base;
+	u64 child_stack_size;
+
+	u64 parent_tid_ptr;
+	u64 child_tid_ptr;
+
+	u32 nr_pids;
+
+	u32 reserved0;
+	u64 reserved1;
+};
+extern int clone_with_pids(int (*fn)(void *), void *child_stack,
+			unsigned long stack_size, unsigned long flags,
+			struct pid_set *target_pids, void *arg);
+
+extern pid_t getpgid(pid_t pid);
+extern pid_t getsid(pid_t pid);
+
+static const char* procname;
+
+static void usage(const char *name)
+{
+	printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]"
+			"[command [arg ..]]\n", name);
+	printf("\n");
+	printf("  -h		this message\n");
+	printf("\n");
+	printf("  -z <pid>	use clone_with_pids and specify chosen pid\n");
+	printf("  		Note that -z and -p are not compatible\n");
+	printf("  -c		use 'clone' rather than 'unshare' system call\n");
+	printf("  -g		launch in new cgroup\n");
+	printf("  -m		mount namespace\n");
+	printf("  -n		network namespace\n");
+	printf("  -u		utsname namespace\n");
+	printf("  -U		userid namespace\n");
+	printf("  -i		ipc namespace\n");
+	printf("  -P <pid-file>	File in which to write global pid of cinit\n");
+	printf("  -p		pid namespace\n");
+	printf("  -f <flag>	extra clone flags\n");
+	printf("\n");
+	printf("(C) Copyright IBM Corp. 2006\n");
+	printf("\n");
+	exit(1);
+}
+
+static int string_to_ul(const char *str, unsigned long int *res)
+{
+	char *tail;
+	long long int r;
+
+	if (!*str)
+		return -1;
+
+	errno = 0;
+
+	r = strtol(str, &tail, 16);
+
+	/*
+	 * according to strtol(3), if errno is set or tail does no point
+	 * to the ending '\0', the conversion failed.
+	 */
+	if (errno || *tail)
+		return -1;
+
+	*res = r;
+	return 0;
+}
+
+/*
+ * Copied following opentty() from Fedora's util-linux rpm
+ * I just changed the "FATAL" message below from syslog()
+ * to printf
+ */
+static void
+opentty(const char * tty) {
+        int i, fd, flags;
+
+        fd = open(tty, O_RDWR | O_NONBLOCK);
+        if (fd == -1) {
+		printf("FATAL: can't reopen tty: %s", strerror(errno));
+                sleep(1);
+                exit(1);
+        }
+
+        flags = fcntl(fd, F_GETFL);
+        flags &= ~O_NONBLOCK;
+        fcntl(fd, F_SETFL, flags);
+
+        for (i = 0; i < fd; i++)
+                close(i);
+        for (i = 0; i < 3; i++)
+                if (fd != i)
+                        dup2(fd, i);
+        if (fd >= 3)
+                close(fd);
+}
+// Code copy end
+
+int do_newcgrp = 0;
+
+int load_cgroup_dir(char *dest, int len)
+{
+	FILE *f = fopen("/proc/mounts", "r");
+	char buf[200];
+	char *name, *path, *fsname, *options, *p1, *p2, *s;
+	if (!f)
+		return 0;
+	while (fgets(buf, 200, f)) {
+		name = strtok_r(buf, " ", &p1);
+		path = strtok_r(NULL, " ", &p1);
+		fsname = strtok_r(NULL, " ", &p1);
+		options = strtok_r(NULL, " ", &p1);
+		if (strcmp(fsname, "cgroup") != 0)
+			continue;
+
+		/* make sure the freezer is composed */
+		s = strtok_r(options, ",", &p2);
+		while (s && strcmp(s, "freezer") != 0)
+			s = strtok_r(NULL, ",", &p2);
+		if (!s)
+			continue;
+		strncpy(dest, path, len);
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+	printf("Freezer not mounted\n");
+	return 0;
+}
+
+int move_to_new_cgroup(int newcgroup)
+{
+	char cgroupname[150], cgroupbase[100], tasksfname[200];
+	FILE *fout;
+	int ret;
+
+	if (!load_cgroup_dir(cgroupbase, 100))
+		return 0;
+
+	snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup);
+	ret = mkdir(cgroupname, 0755);
+	if (ret)
+		return 0;
+	snprintf(tasksfname, 200, "%s/tasks", cgroupname);
+	fout = fopen(tasksfname, "w");
+	if (!fout)
+		return 0;
+	fprintf(fout, "%d\n", getpid());
+	fclose(fout);
+	return 1;
+}
+
+int pipefd[2];
+
+/* gah. opentty will close the pipefd */
+int check_newcgrp(void)
+{
+	int ret, newgroup;
+	char buf[20];
+
+	if (!do_newcgrp)
+		return 0;
+
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, 20);
+	close(pipefd[0]);
+	if (ret == -1) {
+		perror("read");
+		return 1;
+	}
+	newgroup = atoi(buf);
+	if (!move_to_new_cgroup(newgroup))
+		return 1;
+	do_newcgrp = 0;
+	return 0;
+}
+
+int do_child(void *vargv)
+{
+	char **argv = (char **)vargv;
+
+	if (check_newcgrp())
+		return 1;
+
+	execve(argv[0], argv, __environ);
+	perror("execve");
+	return 1;
+}
+
+void write_pid(char *pid_file, int pid)
+{
+	FILE *fp;
+
+	if (!pid_file)
+		return;
+
+	fp = fopen(pid_file, "w");
+	if (!fp) {
+		perror("fopen, pid_file");
+		exit(1);
+	}
+	fprintf(fp, "%d", pid);
+	fflush(fp);
+	fclose(fp);
+}
+
+int main(int argc, char *argv[])
+{	
+	int c;
+	unsigned long flags = 0, eflags = 0;
+	char ttyname[256];
+	int status;
+	int ret, use_clone = 0;
+	int pid;
+	char *pid_file = NULL;
+	struct pid_set pid_set;
+	int chosen_pid = 0;
+
+	pid_set.num_pids = 1;
+	pid_set.pids = &chosen_pid;
+
+	procname = basename(argv[0]);
+
+	memset(ttyname, '\0', sizeof(ttyname));
+	readlink("/proc/self/fd/0", ttyname, sizeof(ttyname));
+
+	while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) {
+		switch (c) {
+		case 'g': do_newcgrp = getpid();		break;
+		case 'm': flags |= CLONE_NEWNS;			break;
+		case 'c': use_clone = 1;			break;
+		case 'P': pid_file = optarg; 			break;
+		case 'u': flags |= CLONE_NEWUTS;		break;
+		case 'i': flags |= CLONE_NEWIPC;		break;
+		case 'U': flags |= CLONE_NEWUSER;		break;
+		case 'n': flags |= CLONE_NEWNET;		break;
+		case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID;	break;
+		case 'z': chosen_pid = atoi(optarg);		break;
+		case 'f': if (!string_to_ul(optarg, &eflags)) {
+				flags |= eflags;
+				break;
+			}
+		case 'h':
+		default:
+			usage(procname);
+		}
+	};
+
+	if (chosen_pid) {
+		use_clone = 1;
+		if (flags & CLONE_NEWPID) {
+			printf("Error: can't use CLONE_NEWPID and pick a pid\n");
+			exit(1);
+		}
+	}
+	argv = &argv[optind];
+	argc = argc - optind;	
+
+	if (do_newcgrp) {
+		ret = pipe(pipefd);
+		if (ret) {
+			perror("pipe");
+			return -1;
+		}
+		do_newcgrp = pipefd[0];
+	}
+
+	if (use_clone) {
+		int stacksize = 4*getpagesize();
+		void *stack = malloc(stacksize);
+
+		if (!stack) {
+			perror("malloc");
+			return -1;
+		}
+
+		printf("about to clone with %lx\n", flags);
+		if (chosen_pid)
+			printf("Will choose pid %d\n", chosen_pid);
+		flags |= SIGCHLD;
+		pid = clone_with_pids(do_child, stack, stacksize, flags,
+			&pid_set, (void *)argv);
+		if (pid == -1) {
+			perror("clone");
+			return -1;
+		}
+	} else {
+		if ((pid = fork()) == 0) {
+			// Child.
+			//print_my_info(procname, ttyname);
+
+			if (check_newcgrp())
+				return 1;
+			opentty(ttyname);
+
+			printf("about to unshare with %lx\n", flags);
+			ret = unshare(flags);
+			if (ret < 0) {
+				perror("unshare");
+				return 1;
+			}		
+			
+			return do_child((void*)argv);
+		}
+
+	}
+	if (pid != -1 && do_newcgrp) {
+		char buf[20];
+		snprintf(buf, 20, "%d", pid);
+		close(pipefd[0]);
+		write(pipefd[1], buf, strlen(buf)+1);
+		close(pipefd[1]);
+	}
+
+	write_pid(pid_file, pid);
+
+	if ((ret = waitpid(pid, &status, __WALL)) < 0)
+		printf("waitpid() returns %d, errno %d\n", ret, errno);
+
+	exit(0);
+}
-- 
1.6.1.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-13  5:24   ` serue-r/Jw6+rmf7HQT0dZR+AlfA
       [not found]     ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2009-11-13  5:24 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

One of the concerns with clone-with-pids is whether the
stack handling is correct and robust enough to withstand
real usage.  Small testcases that play with pid values are
also necessary, but they can't replace actually using
clone-with-pids to start a shell and keep working from it.

This patch tweaks the old ns_exec.c namespace manipulation
program to add a -z option to specify a pid.  So you can:

	nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns
	mount -t proc proc /proc # mount private /proc
	echo $$
		1
	nsexeccwp -z 999 /bin/bash   #  start a shell with pid 999
	echo $$
		999

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 Makefile    |    5 +-
 clone.h     |   54 +++++++++
 nsexeccwp.c |  352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 410 insertions(+), 1 deletions(-)
 create mode 100644 clone.h
 create mode 100644 nsexeccwp.c

diff --git a/Makefile b/Makefile
index 181cc1c..32a6893 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG)
 # install dir
 INSTALL_DIR = /bin
 
-PROGS =	checkpoint restart ckptinfo
+PROGS =	checkpoint restart ckptinfo nsexeccwp
 
 # other cleanup
 OTHER = ckptinfo_types.c
@@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread
 ifneq ($(SUBARCH),)
 restart: clone_$(SUBARCH).o
 restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
+nsexeccwp: clone_$(SUBARCH).o
+nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
 endif
 
 # on powerpc, need also assembly file
 ifeq ($(SUBARCH),ppc)
 restart: clone_$(SUBARCH)_.o
+nsexeccwp: clone_$(SUBARCH)_.o
 endif
 
 # ckptinfo dependencies
diff --git a/clone.h b/clone.h
new file mode 100644
index 0000000..3569a45
--- /dev/null
+++ b/clone.h
@@ -0,0 +1,54 @@
+#ifndef CLONE_H
+#define CLONE_H
+/*
+ *  Copyright (C) 2007 IBM Corporation
+ *
+ *  Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ *
+ */
+#include <sys/syscall.h>
+
+#ifndef HAVE_UNSHARE
+
+#if __i386__
+#    define __NR_unshare 310
+#elif __x86_64__
+#    define __NR_unshare 272
+#elif __ia64__
+#    define __NR_unshare 1296
+#elif __s390x__
+#    define __NR_unshare 303
+#elif __powerpc__
+#    define __NR_unshare 282
+#else
+#    error "Architecture not supported"
+#endif
+
+#endif /* HAVE_UNSHARE */
+
+#ifndef CLONE_NEWUTS
+#define CLONE_NEWUTS		0x04000000
+#endif
+
+#ifndef CLONE_NEWIPC
+#define CLONE_NEWIPC		0x08000000
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER		0x10000000
+#endif
+
+#ifndef CLONE_NEWPID
+#define CLONE_NEWPID		0x20000000
+#endif
+
+#ifndef CLONE_NEWNET
+#define CLONE_NEWNET		0x40000000
+#endif
+
+#endif /* CLONE_H */
diff --git a/nsexeccwp.c b/nsexeccwp.c
new file mode 100644
index 0000000..453fb8c
--- /dev/null
+++ b/nsexeccwp.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2008,2009 IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#include <libgen.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "clone.h"
+
+struct pid_set {
+	int num_pids;
+	pid_t *pids;
+};
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef int pid_t;
+struct clone_args {
+	u64 clone_flags_high;
+
+	u64 child_stack_base;
+	u64 child_stack_size;
+
+	u64 parent_tid_ptr;
+	u64 child_tid_ptr;
+
+	u32 nr_pids;
+
+	u32 reserved0;
+	u64 reserved1;
+};
+/* (until it's supported by libc) from clone_ARCH.c */
+extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
+			   struct pid_set *target_pids, void *arg);
+
+extern pid_t getpgid(pid_t pid);
+extern pid_t getsid(pid_t pid);
+
+static const char* procname;
+
+static void usage(const char *name)
+{
+	printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]"
+			"[command [arg ..]]\n", name);
+	printf("\n");
+	printf("  -h		this message\n");
+	printf("\n");
+	printf("  -z <pid>	use clone_with_pids and specify chosen pid\n");
+	printf("  		Note that -z and -p are not compatible\n");
+	printf("  -c		use 'clone' rather than 'unshare' system call\n");
+	printf("  -g		launch in new cgroup\n");
+	printf("  -m		mount namespace\n");
+	printf("  -n		network namespace\n");
+	printf("  -u		utsname namespace\n");
+	printf("  -U		userid namespace\n");
+	printf("  -i		ipc namespace\n");
+	printf("  -P <pid-file>	File in which to write global pid of cinit\n");
+	printf("  -p		pid namespace\n");
+	printf("  -f <flag>	extra clone flags\n");
+	printf("\n");
+	printf("(C) Copyright IBM Corp. 2006\n");
+	printf("\n");
+	exit(1);
+}
+
+static int string_to_ul(const char *str, unsigned long int *res)
+{
+	char *tail;
+	long long int r;
+
+	if (!*str)
+		return -1;
+
+	errno = 0;
+
+	r = strtol(str, &tail, 16);
+
+	/*
+	 * according to strtol(3), if errno is set or tail does no point
+	 * to the ending '\0', the conversion failed.
+	 */
+	if (errno || *tail)
+		return -1;
+
+	*res = r;
+	return 0;
+}
+
+/*
+ * Copied following opentty() from Fedora's util-linux rpm
+ * I just changed the "FATAL" message below from syslog()
+ * to printf
+ */
+static void
+opentty(const char * tty) {
+        int i, fd, flags;
+
+        fd = open(tty, O_RDWR | O_NONBLOCK);
+        if (fd == -1) {
+		printf("FATAL: can't reopen tty: %s", strerror(errno));
+                sleep(1);
+                exit(1);
+        }
+
+        flags = fcntl(fd, F_GETFL);
+        flags &= ~O_NONBLOCK;
+        fcntl(fd, F_SETFL, flags);
+
+        for (i = 0; i < fd; i++)
+                close(i);
+        for (i = 0; i < 3; i++)
+                if (fd != i)
+                        dup2(fd, i);
+        if (fd >= 3)
+                close(fd);
+}
+// Code copy end
+
+int do_newcgrp = 0;
+
+int load_cgroup_dir(char *dest, int len)
+{
+	FILE *f = fopen("/proc/mounts", "r");
+	char buf[200];
+	char *name, *path, *fsname, *options, *p1, *p2, *s;
+	if (!f)
+		return 0;
+	while (fgets(buf, 200, f)) {
+		name = strtok_r(buf, " ", &p1);
+		path = strtok_r(NULL, " ", &p1);
+		fsname = strtok_r(NULL, " ", &p1);
+		options = strtok_r(NULL, " ", &p1);
+		if (strcmp(fsname, "cgroup") != 0)
+			continue;
+
+		/* make sure the freezer is composed */
+		s = strtok_r(options, ",", &p2);
+		while (s && strcmp(s, "freezer") != 0)
+			s = strtok_r(NULL, ",", &p2);
+		if (!s)
+			continue;
+		strncpy(dest, path, len);
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+	printf("Freezer not mounted\n");
+	return 0;
+}
+
+int move_to_new_cgroup(int newcgroup)
+{
+	char cgroupname[150], cgroupbase[100], tasksfname[200];
+	FILE *fout;
+	int ret;
+
+	if (!load_cgroup_dir(cgroupbase, 100))
+		return 0;
+
+	snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup);
+	ret = mkdir(cgroupname, 0755);
+	if (ret)
+		return 0;
+	snprintf(tasksfname, 200, "%s/tasks", cgroupname);
+	fout = fopen(tasksfname, "w");
+	if (!fout)
+		return 0;
+	fprintf(fout, "%d\n", getpid());
+	fclose(fout);
+	return 1;
+}
+
+int pipefd[2];
+
+/* gah. opentty will close the pipefd */
+int check_newcgrp(void)
+{
+	int ret, newgroup;
+	char buf[20];
+
+	if (!do_newcgrp)
+		return 0;
+
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, 20);
+	close(pipefd[0]);
+	if (ret == -1) {
+		perror("read");
+		return 1;
+	}
+	newgroup = atoi(buf);
+	if (!move_to_new_cgroup(newgroup))
+		return 1;
+	do_newcgrp = 0;
+	return 0;
+}
+
+int do_child(void *vargv)
+{
+	char **argv = (char **)vargv;
+
+	if (check_newcgrp())
+		return 1;
+
+	execve(argv[0], argv, __environ);
+	perror("execve");
+	return 1;
+}
+
+void write_pid(char *pid_file, int pid)
+{
+	FILE *fp;
+
+	if (!pid_file)
+		return;
+
+	fp = fopen(pid_file, "w");
+	if (!fp) {
+		perror("fopen, pid_file");
+		exit(1);
+	}
+	fprintf(fp, "%d", pid);
+	fflush(fp);
+	fclose(fp);
+}
+
+int main(int argc, char *argv[])
+{	
+	int c;
+	unsigned long flags = 0, eflags = 0;
+	char ttyname[256];
+	int status;
+	int ret, use_clone = 0;
+	int pid;
+	char *pid_file = NULL;
+	struct pid_set pid_set;
+	int chosen_pid = 0;
+
+	pid_set.num_pids = 1;
+	pid_set.pids = &chosen_pid;
+
+	procname = basename(argv[0]);
+
+	memset(ttyname, '\0', sizeof(ttyname));
+	readlink("/proc/self/fd/0", ttyname, sizeof(ttyname));
+
+	while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) {
+		switch (c) {
+		case 'g': do_newcgrp = getpid();		break;
+		case 'm': flags |= CLONE_NEWNS;			break;
+		case 'c': use_clone = 1;			break;
+		case 'P': pid_file = optarg; 			break;
+		case 'u': flags |= CLONE_NEWUTS;		break;
+		case 'i': flags |= CLONE_NEWIPC;		break;
+		case 'U': flags |= CLONE_NEWUSER;		break;
+		case 'n': flags |= CLONE_NEWNET;		break;
+		case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID;	break;
+		case 'z': chosen_pid = atoi(optarg);		break;
+		case 'f': if (!string_to_ul(optarg, &eflags)) {
+				flags |= eflags;
+				break;
+			}
+		case 'h':
+		default:
+			usage(procname);
+		}
+	};
+
+	if (chosen_pid) {
+		use_clone = 1;
+		if (flags & CLONE_NEWPID) {
+			printf("Error: can't use CLONE_NEWPID and pick a pid\n");
+			exit(1);
+		}
+	}
+	argv = &argv[optind];
+	argc = argc - optind;	
+
+	if (do_newcgrp) {
+		ret = pipe(pipefd);
+		if (ret) {
+			perror("pipe");
+			return -1;
+		}
+		do_newcgrp = pipefd[0];
+	}
+
+	if (use_clone) {
+		int stacksize = 4*getpagesize();
+		void *stack = malloc(stacksize);
+
+		if (!stack) {
+			perror("malloc");
+			return -1;
+		}
+
+		printf("about to clone with %lx\n", flags);
+		if (chosen_pid)
+			printf("Will choose pid %d\n", chosen_pid);
+		flags |= SIGCHLD;
+		pid = clone_with_pids(do_child, stack, flags, &pid_set,
+					(void *)argv);
+		if (pid == -1) {
+			perror("clone");
+			return -1;
+		}
+	} else {
+		if ((pid = fork()) == 0) {
+			// Child.
+			//print_my_info(procname, ttyname);
+
+			if (check_newcgrp())
+				return 1;
+			opentty(ttyname);
+
+			printf("about to unshare with %lx\n", flags);
+			ret = unshare(flags);
+			if (ret < 0) {
+				perror("unshare");
+				return 1;
+			}		
+			
+			return do_child((void*)argv);
+		}
+
+	}
+	if (pid != -1 && do_newcgrp) {
+		char buf[20];
+		snprintf(buf, 20, "%d", pid);
+		close(pipefd[0]);
+		write(pipefd[1], buf, strlen(buf)+1);
+		close(pipefd[1]);
+	}
+
+	write_pid(pid_file, pid);
+
+	if ((ret = waitpid(pid, &status, __WALL)) < 0)
+		printf("waitpid() returns %d, errno %d\n", ret, errno);
+
+	exit(0);
+}
-- 
1.6.1.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]     ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-13 21:08       ` Serge E. Hallyn
  2009-11-15 22:45       ` Nathan Lynch
  2009-11-16 14:45       ` Serge E. Hallyn
  2 siblings, 0 replies; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-13 21:08 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Quoting serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
...
> +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> +					(void *)argv);
> +		if (pid == -1) {
> +			perror("clone");
> +			return -1;
> +		}

Come on Serge, what crappy code!  The clone_with_pids() wrapper used
in user-cr doesn't set errno, so this is messed up on failure.  Shape
up!

-serge

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]     ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-13 21:08       ` Serge E. Hallyn
@ 2009-11-15 22:45       ` Nathan Lynch
       [not found]         ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  2009-11-16 14:45       ` Serge E. Hallyn
  2 siblings, 1 reply; 13+ messages in thread
From: Nathan Lynch @ 2009-11-15 22:45 UTC (permalink / raw)
  To: serue-r/Jw6+rmf7HQT0dZR+AlfA; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote:
> +	if (use_clone) {
> +		int stacksize = 4*getpagesize();
> +		void *stack = malloc(stacksize);
> +
> +		if (!stack) {
> +			perror("malloc");
> +			return -1;
> +		}
> +
> +		printf("about to clone with %lx\n", flags);
> +		if (chosen_pid)
> +			printf("Will choose pid %d\n", chosen_pid);
> +		flags |= SIGCHLD;
> +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> +					(void *)argv);

The stack argument should be adjusted with the usual stack += stacksize
- 1 or similar, right?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]             ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-15 23:49               ` Nathan Lynch
       [not found]                 ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Nathan Lynch @ 2009-11-15 23:49 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote:
> Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote:
> > > +	if (use_clone) {
> > > +		int stacksize = 4*getpagesize();
> > > +		void *stack = malloc(stacksize);
> > > +
> > > +		if (!stack) {
> > > +			perror("malloc");
> > > +			return -1;
> > > +		}
> > > +
> > > +		printf("about to clone with %lx\n", flags);
> > > +		if (chosen_pid)
> > > +			printf("Will choose pid %d\n", chosen_pid);
> > > +		flags |= SIGCHLD;
> > > +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > > +					(void *)argv);
> > 
> > The stack argument should be adjusted with the usual stack += stacksize
> > - 1 or similar, right?
> 
> the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
> x86 one by Suka also) does this implicitly, by doing:
> 
> 	s = child_stack;
> 	*--s = arg;
> 	*--s = fn;
> 	child_stack -= 16

That's setting up arguments for the function to run in the child, and
afaict that code assumes the value of child_stack is the _end_ of the
stack region.  The code I quoted above is passing the beginning of the
region (the return value from malloc).

On powerpc the segfaults went away when I made the following change.

diff --git a/nsexeccwp.c b/nsexeccwp.c
index a71d9a4..92eb092 100644
--- a/nsexeccwp.c
+++ b/nsexeccwp.c
@@ -309,8 +309,8 @@ int main(int argc, char *argv[])
                if (chosen_pid)
                        printf("Will choose pid %d\n", chosen_pid);
                flags |= SIGCHLD;
-               pid = clone_with_pids(do_child, stack, flags, &pid_set,
-                                       (void *)argv);
+               pid = clone_with_pids(do_child, stack + stacksize - 1,
+                                     flags, &pid_set, (void *)argv);

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]         ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2009-11-16 11:12           ` Serge E. Hallyn
       [not found]             ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-16 11:12 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote:
> > +	if (use_clone) {
> > +		int stacksize = 4*getpagesize();
> > +		void *stack = malloc(stacksize);
> > +
> > +		if (!stack) {
> > +			perror("malloc");
> > +			return -1;
> > +		}
> > +
> > +		printf("about to clone with %lx\n", flags);
> > +		if (chosen_pid)
> > +			printf("Will choose pid %d\n", chosen_pid);
> > +		flags |= SIGCHLD;
> > +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > +					(void *)argv);
> 
> The stack argument should be adjusted with the usual stack += stacksize
> - 1 or similar, right?

the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
x86 one by Suka also) does this implicitly, by doing:

	s = child_stack;
	*--s = arg;
	*--s = fn;
	child_stack -= 16

-serge

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]     ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-13 21:08       ` Serge E. Hallyn
  2009-11-15 22:45       ` Nathan Lynch
@ 2009-11-16 14:45       ` Serge E. Hallyn
  2 siblings, 0 replies; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-16 14:45 UTC (permalink / raw)
  To: Nathan T Lynch, containers-qjLDD68F18O7TbgM5vRIOg

Subject: [PATCH 1/1] nsexeccwp bugfixes

1. As Nathan pointed out, I was passing in stack bottom, not
stack top.  Our clone_with_pids() helper in user-cr/clone_${ARCH}.c
just accepts stack top.

2. The clone_with_pids() helper returns -errno on error; it
doesn't set errno.  Handle that right.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 nsexeccwp.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/nsexeccwp.c b/nsexeccwp.c
index 453fb8c..d4bf00c 100644
--- a/nsexeccwp.c
+++ b/nsexeccwp.c
@@ -304,6 +304,7 @@ int main(int argc, char *argv[])
 			perror("malloc");
 			return -1;
 		}
+		stack += stacksize - 1;
 
 		printf("about to clone with %lx\n", flags);
 		if (chosen_pid)
@@ -311,7 +312,8 @@ int main(int argc, char *argv[])
 		flags |= SIGCHLD;
 		pid = clone_with_pids(do_child, stack, flags, &pid_set,
 					(void *)argv);
-		if (pid == -1) {
+		if (pid < 0) {
+			errno = -pid;
 			perror("clone");
 			return -1;
 		}
-- 
1.6.1.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]                 ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2009-11-16 18:26                   ` Serge E. Hallyn
       [not found]                     ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-16 18:26 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote:
> > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> > > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote:
> > > > +	if (use_clone) {
> > > > +		int stacksize = 4*getpagesize();
> > > > +		void *stack = malloc(stacksize);
> > > > +
> > > > +		if (!stack) {
> > > > +			perror("malloc");
> > > > +			return -1;
> > > > +		}
> > > > +
> > > > +		printf("about to clone with %lx\n", flags);
> > > > +		if (chosen_pid)
> > > > +			printf("Will choose pid %d\n", chosen_pid);
> > > > +		flags |= SIGCHLD;
> > > > +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > > > +					(void *)argv);
> > > 
> > > The stack argument should be adjusted with the usual stack += stacksize
> > > - 1 or similar, right?
> > 
> > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
> > x86 one by Suka also) does this implicitly, by doing:
> > 
> > 	s = child_stack;
> > 	*--s = arg;
> > 	*--s = fn;
> > 	child_stack -= 16
> 
> That's setting up arguments for the function to run in the child, and
> afaict that code assumes the value of child_stack is the _end_ of the
> stack region.

Yes.

> The code I quoted above is passing the beginning of the
> region (the return value from malloc).

Holy cow, that was a snafu in my switching to sending (stack_base,stack_size)
for the previous version, and then back again.  It was meant to send
stack_base+stack_size now.

I say 'holy cow' because it doesn't segfault on s390x.  And it certainly
should!

> On powerpc the segfaults went away when I made the following change.
> 
> diff --git a/nsexeccwp.c b/nsexeccwp.c
> index a71d9a4..92eb092 100644
> --- a/nsexeccwp.c
> +++ b/nsexeccwp.c
> @@ -309,8 +309,8 @@ int main(int argc, char *argv[])
>                 if (chosen_pid)
>                         printf("Will choose pid %d\n", chosen_pid);
>                 flags |= SIGCHLD;
> -               pid = clone_with_pids(do_child, stack, flags, &pid_set,
> -                                       (void *)argv);
> +               pid = clone_with_pids(do_child, stack + stacksize - 1,
> +                                     flags, &pid_set, (void *)argv);

Yes I don't think the -1 should be needed, but certainly the
+stacksize is.

thanks,
-serge

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]                     ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-16 23:18                       ` Nathan Lynch
       [not found]                         ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Nathan Lynch @ 2009-11-16 23:18 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Mon, 2009-11-16 at 12:26 -0600, Serge E. Hallyn wrote:
> Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> > On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote:
> > > Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> > > > On Thu, 2009-11-12 at 23:24 -0600, serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org wrote:
> > > > > +	if (use_clone) {
> > > > > +		int stacksize = 4*getpagesize();
> > > > > +		void *stack = malloc(stacksize);
> > > > > +
> > > > > +		if (!stack) {
> > > > > +			perror("malloc");
> > > > > +			return -1;
> > > > > +		}
> > > > > +
> > > > > +		printf("about to clone with %lx\n", flags);
> > > > > +		if (chosen_pid)
> > > > > +			printf("Will choose pid %d\n", chosen_pid);
> > > > > +		flags |= SIGCHLD;
> > > > > +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > > > > +					(void *)argv);
> > > > 
> > > > The stack argument should be adjusted with the usual stack += stacksize
> > > > - 1 or similar, right?
> > > 
> > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
> > > x86 one by Suka also) does this implicitly, by doing:
> > > 
> > > 	s = child_stack;
> > > 	*--s = arg;
> > > 	*--s = fn;
> > > 	child_stack -= 16
> > 
> > That's setting up arguments for the function to run in the child, and
> > afaict that code assumes the value of child_stack is the _end_ of the
> > stack region.
> 
> Yes.
> 
> > The code I quoted above is passing the beginning of the
> > region (the return value from malloc).
> 
> Holy cow, that was a snafu in my switching to sending (stack_base,stack_size)
> for the previous version, and then back again.  It was meant to send
> stack_base+stack_size now.

Okay, here's the violence I've committed against your code to get eclone
working on powerpc (tested 32-bit userspace against 64-bit kernel).

./nsexeccwp -z 300 /bin/bash -c 'echo $$'
[debugging cruft elided]
300

This is meant not for inclusion but for discussion at this point.  I
made some changes that will certainly break the builds for other
architectures.

Note that I have generic code initializing clone_args with the true
stack base and size and passing that to the architecture code.  The
architecture code (e.g. clone_ppc.c) is responsible for calculating the
stack pointer to pass to the kernel.  The architecture code is also
responsible for clearing clone_args.child_stack_size and updating
clone_args.child_stack, adjusting for alignment and arguments if
appropriate.  In this way, we can accommodate ia64 and parisc and keep
platform details in platform-specific code.


 clone_ppc.c  |   54 +++++++++++++++++++++++++++++++++++
 clone_ppc_.S |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 eclone.h     |   25 ++++++++++++++++
 nsexeccwp.c  |   42 ++++++++++++----------------
 4 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/clone_ppc.c b/clone_ppc.c
index 49797fd..9e19fae 100644
--- a/clone_ppc.c
+++ b/clone_ppc.c
@@ -10,14 +10,25 @@
 
 #define _GNU_SOURCE
 
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <asm/unistd.h>
 
+#include "eclone.h"
+
 struct target_pid_set;
 
+struct pid_set {
+	size_t nr_pids;
+	pid_t *pids;
+};
+
+
 extern int __clone_with_pids(int (*fn)(void *arg),
 			     void *child_stack ,
 			     int flags,
@@ -56,3 +67,46 @@ int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
 }
 
 #endif
+
+extern int __eclone(int (*fn)(void *arg),
+		    void *child_sp,
+		    int flags,
+		    void *fn_arg,
+		    struct clone_args *args,
+		    size_t args_size,
+		    pid_t *pids);
+
+int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low,
+	   struct clone_args *clone_args, pid_t *pids)
+{
+	struct clone_args my_args;
+	unsigned long child_sp;
+	int newpid;
+
+	if (clone_args->child_stack)
+		child_sp = clone_args->child_stack +
+			clone_args->child_stack_size - 1;
+	else
+		child_sp = 0;
+
+	my_args = *clone_args;
+	my_args.child_stack = child_sp;
+	my_args.child_stack_size = 0;
+
+	printf("%s: child_sp = %p\n", __func__, (void *)child_sp);
+
+	newpid = __eclone(fn,
+			  (void *)child_sp,
+			  clone_flags_low,
+			  fn_arg,
+			  &my_args,
+			  sizeof(my_args),
+			  pids);
+
+	if (newpid < 0) {
+		errno = -newpid;
+		newpid = -1;
+	}
+
+	return newpid;
+}
diff --git a/clone_ppc_.S b/clone_ppc_.S
index cb3e053..b777b2d 100644
--- a/clone_ppc_.S
+++ b/clone_ppc_.S
@@ -11,6 +11,14 @@
 #include <asm/unistd.h>
 #include "powerpc_asm.h"
 
+#ifndef __NR_clone_with_pids
+#define __NR_clone_with_pids	325
+#endif
+
+#ifndef __NR_eclone
+#define __NR_eclone	323
+#endif
+
 /* int [r3] clone_with_pids(int (*fn)(void *arg) [r3],
  *                          void *child_stack [r4],
  *                          int flags [r5],
@@ -29,10 +37,10 @@
 .globl __clone_with_pids
 __clone_with_pids:
 
-/* No argument validation. */
+	/* No argument validation. */
 
-/* Set up parent's stack frame. */
-stwu	r1,-32(r1)
+	/* Set up parent's stack frame. */
+	stwu	r1,-32(r1)
 
 	/* Save non-volatiles (r28-r31) which we plan to use. */
 	stmw	r28,16(r1)
@@ -88,3 +96,77 @@ parent:
 	neg	r3,r3
 	blr
 
+/* int [r3] eclone(int (*fn)(void *arg) [r3],
+ *                          void *child_sp [r4],
+ *                          int flags [r5],
+ *                          void *fn_arg [r6],
+ *                          struct clone_args *args [r7],
+ *                          size_t args_size [r8],
+ *                          pid_t *pids [r9]);
+ * Creates a child task with the pids specified by pids.
+ * Returns to parent only, child execution and exit is handled here.
+ * On error, returns negated errno.  On success, returns the pid of the child
+ * created.
+ */
+
+.globl __eclone
+__eclone:
+
+	/* No argument validation. */
+
+	/* Set up parent's stack frame. */
+	stwu	r1,-32(r1)
+
+	/* Save non-volatiles (r28-r31) which we plan to use. */
+	stmw	r28,16(r1)
+
+	/* Set up child's stack frame. */
+	clrrwi	r4,r4,4
+	li	r0,0
+	stw	r0,-16(r4)
+
+	/* Save fn, stack pointer, flags, and fn_arg across system call. */
+	mr	r28,r3
+	mr	r29,r4
+	mr	r30,r5
+	mr	r31,r6
+
+	/* Set up arguments for system call. */
+	mr	r3,r5	/* flags */
+	mr	r4,r7	/* clone_args */
+	mr	r5,r8	/* clone_args' size */
+	mr	r6,r9	/* pids */
+
+	/* Do the system call */
+	li	r0,__NR_eclone
+	sc
+
+	/* Parent or child? */
+	cmpwi	cr1,r3,0
+	crandc	4*cr1+eq,4*cr1+eq,4*cr0+so
+	bne	cr1,eclone_parent
+
+	/* Child. Call fn. */
+	mtctr	r28
+	mr 	r3,r31
+	bctrl
+
+	/* Assume result of fn in r3 and exit. */
+	li	r0,__NR_exit
+	sc
+
+eclone_parent:
+	/* Restore non-volatiles. */
+	lmw	r28,16(r1)
+
+	addi	r1,r1,32
+
+	/* Return to caller on success. */
+	bnslr
+
+	/* Handle error.  Negate the return value to signal an error
+	 * to the caller, which must set errno.
+	 */
+	neg	r3,r3
+	blr
+
diff --git a/eclone.h b/eclone.h
new file mode 100644
index 0000000..601a621
--- /dev/null
+++ b/eclone.h
@@ -0,0 +1,25 @@
+#ifndef _ECLONE_H_
+#define _ECLONE_H_
+
+#include <stdint.h>
+
+struct clone_args {
+	uint64_t clone_flags_high;
+	uint64_t child_stack;
+	uint64_t child_stack_size;
+	uint64_t parent_tid_ptr;
+	uint64_t child_tid_ptr;
+
+	uint32_t nr_pids;
+
+	uint32_t reserved0;
+	uint64_t reserved1;
+};
+
+/* arch-dependent code implements this interface */
+extern int eclone(int (*fn)(void *), void *fn_arg,
+		  int clone_flags_low,
+		  struct clone_args *clone_args,
+		  pid_t *pids);
+
+#endif
diff --git a/nsexeccwp.c b/nsexeccwp.c
index a71d9a4..b80b78e 100644
--- a/nsexeccwp.c
+++ b/nsexeccwp.c
@@ -17,29 +17,13 @@
 #include <sys/wait.h>
 
 #include "clone.h"
+#include "eclone.h"
 
 struct pid_set {
 	int num_pids;
 	pid_t *pids;
 };
 
-typedef unsigned long long u64;
-typedef unsigned int u32;
-typedef int pid_t;
-struct clone_args {
-	u64 clone_flags_high;
-
-	u64 child_stack_base;
-	u64 child_stack_size;
-
-	u64 parent_tid_ptr;
-	u64 child_tid_ptr;
-
-	u32 nr_pids;
-
-	u32 reserved0;
-	u64 reserved1;
-};
 /* (until it's supported by libc) from clone_ARCH.c */
 extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
 			   struct pid_set *target_pids, void *arg);
@@ -210,6 +194,9 @@ int do_child(void *vargv)
 {
 	char **argv = (char **)vargv;
 
+	printf("%s(%p)/%lu\n", __func__, vargv, (unsigned long)getpid());
+	fflush(NULL);
+
 	if (check_newcgrp())
 		return 1;
 
@@ -237,6 +224,7 @@ void write_pid(char *pid_file, int pid)
 
 int main(int argc, char *argv[])
 {
+	int i;
 	int c;
 	unsigned long flags = 0, eflags = 0;
 	char ttyname[256];
@@ -244,11 +232,8 @@ int main(int argc, char *argv[])
 	int ret, use_clone = 0;
 	int pid;
 	char *pid_file = NULL;
-	struct pid_set pid_set;
-	int chosen_pid = 0;
-
-	pid_set.num_pids = 1;
-	pid_set.pids = &chosen_pid;
+	size_t nr_pids = 1;
+	pid_t chosen_pid = 0;
 
 	procname = basename(argv[0]);
 
@@ -287,6 +272,9 @@ int main(int argc, char *argv[])
 	argv = &argv[optind];
 	argc = argc - optind;
 
+	for (i = 0; i < argc; i++)
+		printf("argv[%d] = '%s'\n", i, argv[i]);
+
 	if (do_newcgrp) {
 		ret = pipe(pipefd);
 		if (ret) {
@@ -297,6 +285,7 @@ int main(int argc, char *argv[])
 	}
 
 	if (use_clone) {
+		struct clone_args clone_args;
 		int stacksize = 4*getpagesize();
 		void *stack = malloc(stacksize);
 
@@ -305,12 +294,17 @@ int main(int argc, char *argv[])
 			return -1;
 		}
 
+		memset(&clone_args, 0, sizeof(clone_args));
+		clone_args.child_stack = (unsigned long)stack;
+		clone_args.child_stack_size = stacksize;
+		clone_args.nr_pids = nr_pids;
+
 		printf("about to clone with %lx\n", flags);
 		if (chosen_pid)
 			printf("Will choose pid %d\n", chosen_pid);
+		printf("argv = %p\n", argv);
 		flags |= SIGCHLD;
-		pid = clone_with_pids(do_child, stack, flags, &pid_set,
-					(void *)argv);
+		pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid);
 		if (pid == -1) {
 			perror("clone");
 			return -1;

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]                         ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2009-11-17  4:05                           ` Serge E. Hallyn
  0 siblings, 0 replies; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-17  4:05 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> Okay, here's the violence I've committed against your code to get eclone
> working on powerpc (tested 32-bit userspace against 64-bit kernel).
> 
> ./nsexeccwp -z 300 /bin/bash -c 'echo $$'
> [debugging cruft elided]
> 300
> 
> This is meant not for inclusion but for discussion at this point.  I
> made some changes that will certainly break the builds for other
> architectures.
> 
> Note that I have generic code initializing clone_args with the true
> stack base and size and passing that to the architecture code.  The
> architecture code (e.g. clone_ppc.c) is responsible for calculating the
> stack pointer to pass to the kernel.  The architecture code is also
> responsible for clearing clone_args.child_stack_size and updating
> clone_args.child_stack, adjusting for alignment and arguments if
> appropriate.  In this way, we can accommodate ia64 and parisc and keep
> platform details in platform-specific code.

...

> diff --git a/clone_ppc.c b/clone_ppc.c
> index 49797fd..9e19fae 100644
> --- a/clone_ppc.c
> +++ b/clone_ppc.c
> @@ -10,14 +10,25 @@
>  
>  #define _GNU_SOURCE
>  
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
>  #include <unistd.h>
>  #include <errno.h>
>  #include <sys/types.h>
>  #include <sys/syscall.h>
>  #include <asm/unistd.h>
>  
> +#include "eclone.h"
> +
>  struct target_pid_set;
>  
> +struct pid_set {
> +	size_t nr_pids;
> +	pid_t *pids;
> +};

You shouldn't need the pid_set any more right?

...

> @@ -305,12 +294,17 @@ int main(int argc, char *argv[])
>  			return -1;
>  		}
>  
> +		memset(&clone_args, 0, sizeof(clone_args));
> +		clone_args.child_stack = (unsigned long)stack;
> +		clone_args.child_stack_size = stacksize;
> +		clone_args.nr_pids = nr_pids;
> +
>  		printf("about to clone with %lx\n", flags);
>  		if (chosen_pid)
>  			printf("Will choose pid %d\n", chosen_pid);
> +		printf("argv = %p\n", argv);
>  		flags |= SIGCHLD;
> -		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> -					(void *)argv);
> +		pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid);
>  		if (pid == -1) {
>  			perror("clone");
>  			return -1;

Yup, of course I agree with switching to a clean eclone passing
the clone_args and no struct pid_set; I was just trying to
minimize (to 0 :) the changes required for now in restart.c.

If you don't mind sending the patch to update restart.c as
well as this (minus some debugging) when you're ready, I'll
port clone_s390x.c to your precise api.

thanks,
-serge

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]     ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-25 18:46       ` Oren Laadan
       [not found]         ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Oren Laadan @ 2009-11-25 18:46 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, Nathan T Lynch


Ok, will add this to user-cr (v19-rc2).

BTW, where is the original nsexec source maintained ?

Oren.


Serge E. Hallyn wrote:
> One of the concerns with clone-with-pids is whether the
> stack handling is all correct and robust enough to withstand
> real usage.  Little testcases playing with pid values are
> also necessary, but can't replace really using clone-with-pids
> to start a shell from which to keep working.
> 
> This patch tweaks the old ns_exec.c namespace manipulation
> program to add a -z option to specify a pid.  So you can:
> 
> 	nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns
> 	mount -t proc proc /proc # mount private /proc
> 	echo $$
> 		1
> 	nsexeccwp -z 999 /bin/bash   #  start a shell with pid 999
> 	echo $$
> 		999
> 
> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  Makefile    |    5 +-
>  clone.h     |   54 +++++++++
>  nsexeccwp.c |  352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 410 insertions(+), 1 deletions(-)
>  create mode 100644 clone.h
>  create mode 100644 nsexeccwp.c
> 
> diff --git a/Makefile b/Makefile
> index 181cc1c..32a6893 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG)
>  # install dir
>  INSTALL_DIR = /bin
>  
> -PROGS =	checkpoint restart ckptinfo
> +PROGS =	checkpoint restart ckptinfo nsexeccwp
>  
>  # other cleanup
>  OTHER = ckptinfo_types.c
> @@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread
>  ifneq ($(SUBARCH),)
>  restart: clone_$(SUBARCH).o
>  restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
> +nsexeccwp: clone_$(SUBARCH).o
> +nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
>  endif
>  
>  # on powerpc, need also assembly file
>  ifeq ($(SUBARCH),ppc)
>  restart: clone_$(SUBARCH)_.o
> +nsexeccwp: clone_$(SUBARCH)_.o
>  endif
>  
>  # ckptinfo dependencies
> diff --git a/clone.h b/clone.h
> new file mode 100644
> index 0000000..3569a45
> --- /dev/null
> +++ b/clone.h
> @@ -0,0 +1,54 @@
> +#ifndef CLONE_H
> +#define CLONE_H
> +/*
> + *  Copyright (C) 2007 IBM Corporation
> + *
> + *  Author: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + *
> + */
> +#include <sys/syscall.h>
> +
> +#ifndef HAVE_UNSHARE
> +
> +#if __i386__
> +#    define __NR_unshare 310
> +#elif __x86_64__
> +#    define __NR_unshare 272
> +#elif __ia64__
> +#    define __NR_unshare 1296
> +#elif __s390x__
> +#    define __NR_unshare 303
> +#elif __powerpc__
> +#    define __NR_unshare 282
> +#else
> +#    error "Architecture not supported"
> +#endif
> +
> +#endif /* HAVE_UNSHARE */
> +
> +#ifndef CLONE_NEWUTS
> +#define CLONE_NEWUTS		0x04000000
> +#endif
> +
> +#ifndef CLONE_NEWIPC
> +#define CLONE_NEWIPC		0x08000000
> +#endif
> +
> +#ifndef CLONE_NEWUSER
> +#define CLONE_NEWUSER		0x10000000
> +#endif
> +
> +#ifndef CLONE_NEWPID
> +#define CLONE_NEWPID		0x20000000
> +#endif
> +
> +#ifndef CLONE_NEWNET
> +#define CLONE_NEWNET		0x40000000
> +#endif
> +
> +#endif /* CLONE_H */
> diff --git a/nsexeccwp.c b/nsexeccwp.c
> new file mode 100644
> index 0000000..f14b8b0
> --- /dev/null
> +++ b/nsexeccwp.c
> @@ -0,0 +1,352 @@
> +/*
> + * Copyright 2008,2009 IBM Corp.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sched.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <libgen.h>
> +#include <fcntl.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +
> +#include "clone.h"
> +
> +struct pid_set {
> +	int num_pids;
> +	pid_t *pids;
> +};
> +
> +typedef unsigned long long u64;
> +typedef unsigned int u32;
> +typedef int pid_t;
> +struct clone_args {
> +	u64 clone_flags_high;
> +
> +	u64 child_stack_base;
> +	u64 child_stack_size;
> +
> +	u64 parent_tid_ptr;
> +	u64 child_tid_ptr;
> +
> +	u32 nr_pids;
> +
> +	u32 reserved0;
> +	u64 reserved1;
> +};
> +extern int clone_with_pids(int (*fn)(void *), void *child_stack,
> +			unsigned long stack_size, unsigned long flags,
> +			struct pid_set *target_pids, void *arg);
> +
> +extern pid_t getpgid(pid_t pid);
> +extern pid_t getsid(pid_t pid);
> +
> +static const char* procname;
> +
> +static void usage(const char *name)
> +{
> +	printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]"
> +			"[command [arg ..]]\n", name);
> +	printf("\n");
> +	printf("  -h		this message\n");
> +	printf("\n");
> +	printf("  -z <pid>	use clone_with_pids and specify chosen pid\n");
> +	printf("  		Note that -z and -p are not compatible\n");
> +	printf("  -c		use 'clone' rather than 'unshare' system call\n");
> +	printf("  -g		launch in new cgroup\n");
> +	printf("  -m		mount namespace\n");
> +	printf("  -n		network namespace\n");
> +	printf("  -u		utsname namespace\n");
> +	printf("  -U		userid namespace\n");
> +	printf("  -i		ipc namespace\n");
> +	printf("  -P <pid-file>	File in which to write global pid of cinit\n");
> +	printf("  -p		pid namespace\n");
> +	printf("  -f <flag>	extra clone flags\n");
> +	printf("\n");
> +	printf("(C) Copyright IBM Corp. 2006\n");
> +	printf("\n");
> +	exit(1);
> +}
> +
> +static int string_to_ul(const char *str, unsigned long int *res)
> +{
> +	char *tail;
> +	long long int r;
> +
> +	if (!*str)
> +		return -1;
> +
> +	errno = 0;
> +
> +	r = strtol(str, &tail, 16);
> +
> +	/*
> +	 * according to strtol(3), if errno is set or tail does no point
> +	 * to the ending '\0', the conversion failed.
> +	 */
> +	if (errno || *tail)
> +		return -1;
> +
> +	*res = r;
> +	return 0;
> +}
> +
> +/*
> + * Copied following opentty() from Fedora's util-linux rpm
> + * I just changed the "FATAL" message below from syslog()
> + * to printf
> + */
> +static void
> +opentty(const char * tty) {
> +        int i, fd, flags;
> +
> +        fd = open(tty, O_RDWR | O_NONBLOCK);
> +        if (fd == -1) {
> +		printf("FATAL: can't reopen tty: %s", strerror(errno));
> +                sleep(1);
> +                exit(1);
> +        }
> +
> +        flags = fcntl(fd, F_GETFL);
> +        flags &= ~O_NONBLOCK;
> +        fcntl(fd, F_SETFL, flags);
> +
> +        for (i = 0; i < fd; i++)
> +                close(i);
> +        for (i = 0; i < 3; i++)
> +                if (fd != i)
> +                        dup2(fd, i);
> +        if (fd >= 3)
> +                close(fd);
> +}
> +// Code copy end
> +
> +int do_newcgrp = 0;
> +
> +int load_cgroup_dir(char *dest, int len)
> +{
> +	FILE *f = fopen("/proc/mounts", "r");
> +	char buf[200];
> +	char *name, *path, *fsname, *options, *p1, *p2, *s;
> +	if (!f)
> +		return 0;
> +	while (fgets(buf, 200, f)) {
> +		name = strtok_r(buf, " ", &p1);
> +		path = strtok_r(NULL, " ", &p1);
> +		fsname = strtok_r(NULL, " ", &p1);
> +		options = strtok_r(NULL, " ", &p1);
> +		if (strcmp(fsname, "cgroup") != 0)
> +			continue;
> +
> +		/* make sure the freezer is composed */
> +		s = strtok_r(options, ",", &p2);
> +		while (s && strcmp(s, "freezer") != 0)
> +			s = strtok_r(NULL, ",", &p2);
> +		if (!s)
> +			continue;
> +		strncpy(dest, path, len);
> +		fclose(f);
> +		return 1;
> +	}
> +	fclose(f);
> +	printf("Freezer not mounted\n");
> +	return 0;
> +}
> +
> +int move_to_new_cgroup(int newcgroup)
> +{
> +	char cgroupname[150], cgroupbase[100], tasksfname[200];
> +	FILE *fout;
> +	int ret;
> +
> +	if (!load_cgroup_dir(cgroupbase, 100))
> +		return 0;
> +
> +	snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup);
> +	ret = mkdir(cgroupname, 0755);
> +	if (ret)
> +		return 0;
> +	snprintf(tasksfname, 200, "%s/tasks", cgroupname);
> +	fout = fopen(tasksfname, "w");
> +	if (!fout)
> +		return 0;
> +	fprintf(fout, "%d\n", getpid());
> +	fclose(fout);
> +	return 1;
> +}
> +
> +int pipefd[2];
> +
> +/* gah. opentty will close the pipefd */
> +int check_newcgrp(void)
> +{
> +	int ret, newgroup;
> +	char buf[20];
> +
> +	if (!do_newcgrp)
> +		return 0;
> +
> +	close(pipefd[1]);
> +	ret = read(pipefd[0], buf, 20);
> +	close(pipefd[0]);
> +	if (ret == -1) {
> +		perror("read");
> +		return 1;
> +	}
> +	newgroup = atoi(buf);
> +	if (!move_to_new_cgroup(newgroup))
> +		return 1;
> +	do_newcgrp = 0;
> +	return 0;
> +}
> +
> +int do_child(void *vargv)
> +{
> +	char **argv = (char **)vargv;
> +
> +	if (check_newcgrp())
> +		return 1;
> +
> +	execve(argv[0], argv, __environ);
> +	perror("execve");
> +	return 1;
> +}
> +
> +void write_pid(char *pid_file, int pid)
> +{
> +	FILE *fp;
> +
> +	if (!pid_file)
> +		return;
> +
> +	fp = fopen(pid_file, "w");
> +	if (!fp) {
> +		perror("fopen, pid_file");
> +		exit(1);
> +	}
> +	fprintf(fp, "%d", pid);
> +	fflush(fp);
> +	fclose(fp);
> +}
> +
> +int main(int argc, char *argv[])
> +{	
> +	int c;
> +	unsigned long flags = 0, eflags = 0;
> +	char ttyname[256];
> +	int status;
> +	int ret, use_clone = 0;
> +	int pid;
> +	char *pid_file = NULL;
> +	struct pid_set pid_set;
> +	int chosen_pid = 0;
> +
> +	pid_set.num_pids = 1;
> +	pid_set.pids = &chosen_pid;
> +
> +	procname = basename(argv[0]);
> +
> +	memset(ttyname, '\0', sizeof(ttyname));
> +	readlink("/proc/self/fd/0", ttyname, sizeof(ttyname));
> +
> +	while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) {
> +		switch (c) {
> +		case 'g': do_newcgrp = getpid();		break;
> +		case 'm': flags |= CLONE_NEWNS;			break;
> +		case 'c': use_clone = 1;			break;
> +		case 'P': pid_file = optarg; 			break;
> +		case 'u': flags |= CLONE_NEWUTS;		break;
> +		case 'i': flags |= CLONE_NEWIPC;		break;
> +		case 'U': flags |= CLONE_NEWUSER;		break;
> +		case 'n': flags |= CLONE_NEWNET;		break;
> +		case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID;	break;
> +		case 'z': chosen_pid = atoi(optarg);		break;
> +		case 'f': if (!string_to_ul(optarg, &eflags)) {
> +				flags |= eflags;
> +				break;
> +			}
> +		case 'h':
> +		default:
> +			usage(procname);
> +		}
> +	};
> +
> +	if (chosen_pid) {
> +		use_clone = 1;
> +		if (flags & CLONE_NEWPID) {
> +			printf("Error: can't use CLONE_NEWPID and pick a pid\n");
> +			exit(1);
> +		}
> +	}
> +	argv = &argv[optind];
> +	argc = argc - optind;	
> +
> +	if (do_newcgrp) {
> +		ret = pipe(pipefd);
> +		if (ret) {
> +			perror("pipe");
> +			return -1;
> +		}
> +		do_newcgrp = pipefd[0];
> +	}
> +
> +	if (use_clone) {
> +		int stacksize = 4*getpagesize();
> +		void *stack = malloc(stacksize);
> +
> +		if (!stack) {
> +			perror("malloc");
> +			return -1;
> +		}
> +
> +		printf("about to clone with %lx\n", flags);
> +		if (chosen_pid)
> +			printf("Will choose pid %d\n", chosen_pid);
> +		flags |= SIGCHLD;
> +		pid = clone_with_pids(do_child, stack, stacksize, flags,
> +			&pid_set, (void *)argv);
> +		if (pid == -1) {
> +			perror("clone");
> +			return -1;
> +		}
> +	} else {
> +		if ((pid = fork()) == 0) {
> +			// Child.
> +			//print_my_info(procname, ttyname);
> +
> +			if (check_newcgrp())
> +				return 1;
> +			opentty(ttyname);
> +
> +			printf("about to unshare with %lx\n", flags);
> +			ret = unshare(flags);
> +			if (ret < 0) {
> +				perror("unshare");
> +				return 1;
> +			}		
> +			
> +			return do_child((void*)argv);
> +		}
> +
> +	}
> +	if (pid != -1 && do_newcgrp) {
> +		char buf[20];
> +		snprintf(buf, 20, "%d", pid);
> +		close(pipefd[0]);
> +		write(pipefd[1], buf, strlen(buf)+1);
> +		close(pipefd[1]);
> +	}
> +
> +	write_pid(pid_file, pid);
> +
> +	if ((ret = waitpid(pid, &status, __WALL)) < 0)
> +		printf("waitpid() returns %d, errno %d\n", ret, errno);
> +
> +	exit(0);
> +}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
       [not found]         ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-11-25 19:24           ` Serge E. Hallyn
  0 siblings, 0 replies; 13+ messages in thread
From: Serge E. Hallyn @ 2009-11-25 19:24 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, Nathan T Lynch

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> Ok, will add this to user-cr (v19-rc2).
> 
> BTW, where is the original nsexec source maintained ?

It isn't really 'maintained'.  It used to be kept at lxc.sf.net, and
right now a copy is in the cr_tests git tree.

-serge

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-11-25 19:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz; follow: Atom feed)
-- links below jump to the message on this page --
2009-11-10 16:58 [PATCH user-cr 1/2] use Suka's v11 api Serge E. Hallyn
     [not found] ` <20091110165839.GA19222-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-10 16:59   ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids Serge E. Hallyn
     [not found]     ` <20091110165922.GA19263-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-25 18:46       ` Oren Laadan
     [not found]         ` <4B0D7B87.5020504-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-11-25 19:24           ` Serge E. Hallyn
  -- strict thread matches above, loose matches on Subject: below --
2009-11-13  5:24 [PATCH linux-cr] implement s390 eclone syscall serue-r/Jw6+rmf7HQT0dZR+AlfA
     [not found] ` <1258089886-10034-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-13  5:24   ` [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids serue-r/Jw6+rmf7HQT0dZR+AlfA
     [not found]     ` <1258089886-10034-3-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-13 21:08       ` Serge E. Hallyn
2009-11-15 22:45       ` Nathan Lynch
     [not found]         ` <1258325156.4031.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-16 11:12           ` Serge E. Hallyn
     [not found]             ` <20091116111249.GA32340-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-15 23:49               ` Nathan Lynch
     [not found]                 ` <1258328984.4031.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-16 18:26                   ` Serge E. Hallyn
     [not found]                     ` <20091116182655.GA3777-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-16 23:18                       ` Nathan Lynch
     [not found]                         ` <1258413522.4031.1036.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-11-17  4:05                           ` Serge E. Hallyn
2009-11-16 14:45       ` Serge E. Hallyn

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.