All of lore.kernel.org
 help / color / mirror / Atom feed
From: sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Cc: Sukadev Bhattiprolu
	<sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [RFC][PATCH 7/7] Define clone_with_pids syscall
Date: Mon,  4 May 2009 01:17:45 -0700	[thread overview]
Message-ID: <12414250653426-git-send-email-sukadev@linux.vnet.ibm.com> (raw)
In-Reply-To: <12414250653025-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

clone_with_pids() is the same as clone(), except that it takes a 'target_pid_set'
parameter which lets the caller choose a specific pid number for the child process
in each of the child process's pid namespaces. This system call would be needed
to implement Checkpoint/Restart (i.e., after a checkpoint, restart a process with
its original pids).

Call clone_with_pids as follows:

	pid_t pids[] = { 0, 77, 99 };
	struct target_pid_set pid_set;

	pid_set.num_pids = sizeof(pids) / sizeof(int);
	pid_set.target_pids = &pids;

	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);

If a target-pid is 0, the kernel continues to assign a pid for the process in
that namespace. In the above example, pids[0] is 0, meaning the kernel will
assign the next available pid to the process in init_pid_ns. But the kernel will
assign pid 77 in the child pid namespace 1 and pid 99 in pid namespace 2. If
either 77 or 99 is taken, the system call fails with -EBUSY.

If 'pid_set.num_pids' exceeds the current nesting level of pid namespaces,
the system call fails with -EINVAL.

It's mostly an exploratory patch seeking feedback on the interface.

NOTE:
	Compared to clone(), clone_with_pids() needs to pass in two more
	pieces of information:

		- number of pids in the set
		- user buffer containing the list of pids.

	But since clone() already takes 5 parameters, use a 'struct
	target_pid_set'.

TODO:
	- Gently tested.
	- May need additional sanity checks in check_target_pids()
	- Allow CLONE_NEWPID() with clone_with_pids() (ensure target-pid in
	  the namespace is either 1 or 0).

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 arch/x86/include/asm/syscalls.h    |    1 +
 arch/x86/include/asm/unistd_32.h   |    1 +
 arch/x86/kernel/entry_32.S         |    1 +
 arch/x86/kernel/process_32.c       |   91 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/syscall_table_32.S |    1 +
 include/linux/types.h              |    5 ++
 6 files changed, 100 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408..1fdc149 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -31,6 +31,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
 /* kernel/process_32.c */
 int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
+int sys_clone_with_pids(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74..90f906f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_clone_with_pids	335

 #ifdef __KERNEL__

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add..ee92b0d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -707,6 +707,7 @@ ptregs_##name: \
 PTREGSCALL(iopl)
 PTREGSCALL(fork)
 PTREGSCALL(clone)
+PTREGSCALL(clone_with_pids)
 PTREGSCALL(vfork)
 PTREGSCALL(execve)
 PTREGSCALL(sigaltstack)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84..66ac6f7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -445,6 +445,97 @@ int sys_clone(struct pt_regs *regs)
 	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
 }

+static int check_target_pids(unsigned long clone_flags,
+		struct target_pid_set *pid_setp)
+{
+	/*
+	 * CLONE_NEWPID implies pid == 1, so reject the combination for now.
+	 *
+	 * TODO: Maybe this should be more fine-grained (i.e would we want
+	 *  	 to have a container-init have a specific pid in ancestor
+	 *  	 namespaces ?)
+	 */
+	if (clone_flags & CLONE_NEWPID)
+		return -EINVAL;
+
+	/* number of pids must not exceed current nesting level of pid ns */
+	if (pid_setp->num_pids > task_pid(current)->level + 1)
+		return -EINVAL;
+
+	/* TODO: More sanity checks ?  */
+
+	return 0;
+}
+
+/*
+ * Copy the user's 'struct target_pid_set' and the pid array it points to
+ * into a kernel buffer.  Returns the kzalloc'ed array (caller must kfree()
+ * it) or an ERR_PTR(): -EFAULT, -EINVAL or -ENOMEM.
+ */
+static pid_t *copy_target_pids(unsigned long clone_flags, void __user *upid_setp)
+{
+	int rc;
+	int size;
+	pid_t *target_pids;
+	struct target_pid_set pid_set;
+
+	if (copy_from_user(&pid_set, upid_setp, sizeof(pid_set)))
+		return ERR_PTR(-EFAULT);
+
+	/* validate num_pids before it is used to size the allocation below */
+	if (pid_set.num_pids <= 0)
+		return ERR_PTR(-EINVAL);
+	rc = check_target_pids(clone_flags, &pid_set);
+	if (rc)
+		return ERR_PTR(rc);
+
+	size = pid_set.num_pids * sizeof(pid_t);
+	target_pids = kzalloc(size, GFP_KERNEL);
+	if (!target_pids)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(target_pids, pid_set.target_pids, size)) {
+		kfree(target_pids);
+		return ERR_PTR(-EFAULT);
+	}
+
+	printk(KERN_ERR "clone_with_pids() num_pids %d\n", pid_set.num_pids);
+
+	return target_pids;
+}
+
+/*
+ * Like sys_clone(), but regs->bp carries a user pointer to a
+ * 'struct target_pid_set' listing the pids requested for the child.
+ * Requires CAP_SYS_ADMIN; returns do_fork_with_pids()'s result or -errno.
+ */
+int sys_clone_with_pids(struct pt_regs *regs)
+{
+	unsigned long clone_flags = regs->bx;
+	unsigned long newsp = regs->cx;
+	int __user *parent_tidptr = (int __user *)regs->dx;
+	int __user *child_tidptr = (int __user *)regs->di;
+	void __user *upid_setp = (void __user *)regs->bp;
+	pid_t *target_pids;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!newsp)
+		newsp = regs->sp;
+
+	target_pids = copy_target_pids(clone_flags, upid_setp);
+	if (IS_ERR(target_pids))
+		return PTR_ERR(target_pids);
+
+	rc = do_fork_with_pids(clone_flags, newsp, regs, 0, parent_tidptr,
+			child_tidptr, target_pids);
+	/* free our kernel copy of the pid list on both success and failure */
+	kfree(target_pids);
+	return rc;
+}
+
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c873..94c1a58 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long ptregs_clone_with_pids	/* 335 */
diff --git a/include/linux/types.h b/include/linux/types.h
index 5abe354..17ec186 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -204,6 +204,11 @@ struct ustat {
 	char			f_fpack[6];
 };

+struct target_pid_set {
+	int num_pids;		/* number of pids in the set */
+	pid_t *target_pids;	/* user buffer containing the list of pids */
+};
+
 #endif	/* __KERNEL__ */
 #endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
-- 
1.5.2.5

  parent reply	other threads:[~2009-05-04  8:17 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-04  8:17 [RFC][PATCH 1/7] Factor out code to allocate pidmap page sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found] ` <12414250653025-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04  8:17   ` [RFC][PATCH 2/7] Have alloc_pidmap() return actual error code sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <1241425065670-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 18:11       ` Matt Helsley
     [not found]         ` <20090504181111.GM11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:29           ` Sukadev Bhattiprolu
2009-05-04  8:17   ` [RFC][PATCH 3/7] Add target_pid parameter to alloc_pidmap() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <12414250651744-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 20:03       ` Serge E. Hallyn
     [not found]         ` <20090504200318.GA29491-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-05 22:23           ` Sukadev Bhattiprolu
     [not found]             ` <20090505222342.GB20515-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-05 23:01               ` Serge E. Hallyn
2009-05-04  8:17   ` [RFC][PATCH 4/7] Add target_pids parameter to alloc_pid() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
2009-05-04  8:17   ` [RFC][PATCH 5/7] Add target_pids parameter to copy_process() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
2009-05-04  8:17   ` [RFC][PATCH 6/7] Define do_fork_with_pids() sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
     [not found]     ` <12414250652549-git-send-email-sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2009-05-04 18:26       ` Matt Helsley
     [not found]         ` <20090504182621.GO11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:32           ` Sukadev Bhattiprolu
2009-05-04  8:17   ` sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 [this message]
2009-05-04 18:20   ` [RFC][PATCH 1/7] Factor out code to allocate pidmap page Matt Helsley
     [not found]     ` <20090504182059.GN11734-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-04 18:27       ` Sukadev Bhattiprolu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=12414250653426-git-send-email-sukadev@linux.vnet.ibm.com \
    --to=sukadev-23vcf4htsmix0ybbhkvfkdbpr1lh4cv8@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.