Linux Container Development
 help / color / mirror / Atom feed
From: sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Cc: Sukadev Bhattiprolu
	<sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [RFC][PATCH 7/7] Define clone_with_pids syscall
Date: Mon,  4 May 2009 01:17:45 -0700	[thread overview]
Message-ID: <12414250653426-git-send-email-sukadev@linux.vnet.ibm.com> (raw)
In-Reply-To: <12414250653025-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

clone_with_pids() is the same as clone(), except that it takes a 'target_pid_set'
parameter which lets the caller choose a specific pid number for the child
process in each of the child process's pid namespaces. This system call would be
needed to implement Checkpoint/Restart (i.e., after a checkpoint, restart a
process with its original pids).

Call clone_with_pids as follows:

	pid_t pids[] = { 0, 77, 99 };
	struct target_pid_set pid_set;

	pid_set.num_pids = sizeof(pids) / sizeof(pids[0]);
	pid_set.target_pids = pids;

	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);

If a target-pid is 0, the kernel continues to assign a pid for the process in
that namespace. In the above example, pids[0] is 0, meaning the kernel will
assign the next available pid to the process in init_pid_ns. But the kernel will
assign pid 77 in the child pid namespace 1 and pid 99 in pid namespace 2. If
either 77 or 99 is taken, the system call fails with -EBUSY.

If 'pid_set.num_pids' exceeds the current nesting level of pid namespaces,
the system call fails with -EINVAL.

It's mostly an exploratory patch seeking feedback on the interface.

NOTE:
	Compared to clone(), clone_with_pids() needs to pass in two more
	pieces of information:

		- number of pids in the set
		- user buffer containing the list of pids.

	But since clone() already takes 5 parameters, use a 'struct
	target_pid_set'.

TODO:
	- Gently tested.
	- May need additional sanity checks in check_target_pids()
	- Allow CLONE_NEWPID with clone_with_pids() (ensure target-pid in
	  the namespace is either 1 or 0).

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 arch/x86/include/asm/syscalls.h    |    1 +
 arch/x86/include/asm/unistd_32.h   |    1 +
 arch/x86/kernel/entry_32.S         |    1 +
 arch/x86/kernel/process_32.c       |   91 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/syscall_table_32.S |    1 +
 include/linux/types.h              |    5 ++
 6 files changed, 100 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408..1fdc149 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -31,6 +31,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
 /* kernel/process_32.c */
 int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
+int sys_clone_with_pids(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74..90f906f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_clone_with_pids	335

 #ifdef __KERNEL__

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add..ee92b0d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -707,6 +707,7 @@ ptregs_##name: \
 PTREGSCALL(iopl)
 PTREGSCALL(fork)
 PTREGSCALL(clone)
+PTREGSCALL(clone_with_pids)
 PTREGSCALL(vfork)
 PTREGSCALL(execve)
 PTREGSCALL(sigaltstack)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84..66ac6f7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -445,6 +445,97 @@ int sys_clone(struct pt_regs *regs)
 	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
 }

+static int check_target_pids(unsigned long clone_flags,
+		struct target_pid_set *pid_setp)
+{
+	/*
+	 * Sanity-check a target_pid_set that is already in kernel memory.
+	 * Returns 0 if it is acceptable, -EINVAL otherwise.
+	 *
+	 * CLONE_NEWPID implies pid == 1, so it is refused for now.
+	 * TODO: Maybe this should be more fine-grained (i.e would we want to
+	 *	 have a container-init have a specific pid in ancestor ns ?)
+	 */
+	if (clone_flags & CLONE_NEWPID)
+		return -EINVAL;
+
+	/* need 1..(pid-ns nesting depth) pids. TODO: More sanity checks ? */
+	if (pid_setp->num_pids < 1 ||
+	    pid_setp->num_pids > task_pid(current)->level + 1)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Copy the user's target_pid_set and its pid array into a kzalloc'd array.
+ * Returns the array (caller kfree()s it) or ERR_PTR(-EFAULT/-EINVAL/-ENOMEM).
+ */
+static pid_t *copy_target_pids(unsigned long clone_flags, void __user *upid_setp)
+{
+	int rc;
+	size_t size;
+	pid_t __user *utarget_pids;
+	pid_t *target_pids;
+	struct target_pid_set pid_set;
+
+	if (copy_from_user(&pid_set, upid_setp, sizeof(pid_set)))
+		return ERR_PTR(-EFAULT);
+
+	/* Validate num_pids before sizing the allocation; reject empty sets */
+	rc = check_target_pids(clone_flags, &pid_set);
+	if (!rc && pid_set.num_pids < 1)
+		rc = -EINVAL;
+	if (rc)
+		return ERR_PTR(rc);
+
+	size = pid_set.num_pids * sizeof(pid_t);
+	utarget_pids = pid_set.target_pids;
+
+	target_pids = kzalloc(size, GFP_KERNEL);
+	if (!target_pids)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(target_pids, utarget_pids, size)) {
+		kfree(target_pids);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return target_pids;
+}
+
+/*
+ * clone() with caller-chosen pids: bx=flags, cx=newsp, dx=parent_tidptr,
+ * di=child_tidptr, bp=user struct target_pid_set. Requires CAP_SYS_ADMIN.
+ */
+int sys_clone_with_pids(struct pt_regs *regs)
+{
+	unsigned long clone_flags, newsp;
+	int __user *parent_tidptr;
+	int __user *child_tidptr;
+	void __user *upid_setp;
+	pid_t *target_pids;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	clone_flags = regs->bx;
+	newsp = regs->cx ? regs->cx : regs->sp;	/* no stack given: use regs->sp */
+	parent_tidptr = (int __user *)regs->dx;
+	child_tidptr = (int __user *)regs->di;
+	upid_setp = (void __user *)regs->bp;
+
+	target_pids = copy_target_pids(clone_flags, upid_setp);
+	if (IS_ERR(target_pids))
+		return PTR_ERR(target_pids);
+
+	rc = do_fork_with_pids(clone_flags, newsp, regs, 0, parent_tidptr,
+			child_tidptr, target_pids);
+	kfree(target_pids);
+	return rc;
+}
+
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c873..94c1a58 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long ptregs_clone_with_pids	/* 335 */
diff --git a/include/linux/types.h b/include/linux/types.h
index 5abe354..17ec186 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -204,6 +204,11 @@ struct ustat {
 	char			f_fpack[6];
 };

+struct target_pid_set {
+	int num_pids;		/* number of entries in target_pids[] */
+	pid_t *target_pids;	/* user array, one pid per pid-ns level; 0 = any */
+};
+
 #endif	/* __KERNEL__ */
 #endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
-- 
1.5.2.5

  parent reply	other threads:[~2009-05-04  8:17 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-04  8:17 [RFC][PATCH 1/7] Factor out code to allocate pidmap page sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found] ` <12414250653025-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04  8:17   ` [RFC][PATCH 2/7] Have alloc_pidmap() return actual error code sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <1241425065670-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 18:11       ` Matt Helsley
     [not found]         ` <20090504181111.GM11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:29           ` Sukadev Bhattiprolu
2009-05-04  8:17   ` [RFC][PATCH 3/7] Add target_pid parameter to alloc_pidmap() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <12414250651744-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 20:03       ` Serge E. Hallyn
     [not found]         ` <20090504200318.GA29491-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-05 22:23           ` Sukadev Bhattiprolu
     [not found]             ` <20090505222342.GB20515-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-05 23:01               ` Serge E. Hallyn
2009-05-04  8:17   ` [RFC][PATCH 4/7] Add target_pids parameter to alloc_pid() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
2009-05-04  8:17   ` [RFC][PATCH 5/7] Add target_pids parameter to copy_process() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
2009-05-04  8:17   ` [RFC][PATCH 6/7] Define do_fork_with_pids() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <12414250652549-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 18:26       ` Matt Helsley
     [not found]         ` <20090504182621.GO11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:32           ` Sukadev Bhattiprolu
2009-05-04  8:17   ` sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 [this message]
2009-05-04 18:20   ` [RFC][PATCH 1/7] Factor out code to allocate pidmap page Matt Helsley
     [not found]     ` <20090504182059.GN11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:27       ` Sukadev Bhattiprolu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=12414250653426-git-send-email-sukadev@linux.vnet.ibm.com \
    --to=sukadev-23vcf4htsmix0ybbhkvfkdbpr1lh4cv8@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox