All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] sys_restore prototype
@ 2008-07-25 22:56 Serge E. Hallyn
       [not found] ` <20080725225655.GA28276-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Serge E. Hallyn @ 2008-07-25 22:56 UTC (permalink / raw)
  To: Linux Containers

We were talking this morning about what trivial patchset to begin
with to get a start on checkpoint and restart.  We thought that
rather than start with checkpoint, maybe we should start with
something that reads a "checkpoint file" and "restarts" a single
task.  In this case, restart means it sets the process id and
executes the file which are found in the checkpoint file.

So here's what we whipped up for a half hour this morning,
and during some of Mark's talk this afternoon.

It refuses to run if it isn't the container init, so you must
unshare your pidns before calling sys_restore().

To test, I did:

[root@kvm-f9 ~]# cat mycheckpoint 
99 /root/whoami
[root@kvm-f9 ~]# cat restore.c
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>


/*
 * Test driver for the prototype restore system call (number 327 here).
 *
 * Opens /root/mycheckpoint and hands the descriptor, plus an argv/envp
 * pair, to the syscall.  Returns 1 if the checkpoint file cannot be
 * opened, otherwise whatever the syscall returned.
 */
int main()
{
        int ret;
        int fd;
        char *argv[3];
        char *envp[1];

        /* To exec a shell instead: argv[0] = argv[1] = "/bin/bash"; argv[2] = NULL; */
        argv[0] = "/root/whoami";
        argv[1] = envp[0] = NULL;

        fd = open("/root/mycheckpoint", O_RDONLY);
        if (fd < 0) {
                /* Nothing to restore from; do not pass a bad fd to the kernel. */
                perror("open checkpoint file");
                return 1;
        }

        ret = syscall(327, fd, argv, envp);

        /* Report errno first: printf() is allowed to overwrite it. */
        perror("syscall for checkpoint");
        printf("syscall returned %d\n", ret);

        close(fd);
        return ret;
}
[root@kvm-f9 ~]# cat whoami.c
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Print the pid this process currently has, then exit successfully. */
int main()
{
        pid_t self = getpid();

        printf("I am %d\n", self);
        return 0;
}

Next, I create a new pid namespace, remount /proc, and
execute 'restore' using 'exec' so that pid 1 is doing it:

[root@kvm-f9 ~]# /home/hallyn/cryo/utils/ns_exec -cp /bin/bash
about to clone with 20020000
[root@kvm-f9 ~]# mount -t proc none /proc
[root@kvm-f9 ~]# ps -ef
UID        PID  PPID  C STIME TTY          TIME CMD
root         1     0  1 18:46 pts/0    00:00:00 /bin/bash
root        26     1  0 18:46 pts/0    00:00:00 ps -ef
[root@kvm-f9 ~]# exec ./restore
I am 99

Seems to work.

-serge

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/2] introduce sys_restore
       [not found] ` <20080725225655.GA28276-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2008-07-25 22:58   ` Serge E. Hallyn
  2008-07-25 22:59   ` [PATCH 2/2] sys_restore: set the pid number Serge E. Hallyn
  2008-07-28 22:00   ` [PATCH 0/2] sys_restore prototype Eric W. Biederman
  2 siblings, 0 replies; 7+ messages in thread
From: Serge E. Hallyn @ 2008-07-25 22:58 UTC (permalink / raw)
  To: Linux Containers

Create a useless (?) sys_restore system call.  All it does
is read a "checkpoint file" :) for a pid number and a file
to execute.

Since we don't take things like argv and envp and registers
from the checkpoint file, in order to make this easily
testable, we take those things as arguments.

Signed-off-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Dave Hansen <dave-gkUM19QKKo4@public.gmane.org>
---
 arch/x86/kernel/process_32.c       |   10 ++++++++
 arch/x86/kernel/syscall_table_32.S |    2 +
 kernel/fork.c                      |   43 ++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927a..e11627d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -688,6 +688,16 @@ out:
 	return error;
 }
 
+long do_restore(unsigned int fd,
+		char __user * __user *argv,
+                char __user * __user *envp,
+		struct pt_regs *regs);
+
+asmlinkage long sys_restore(struct pt_regs regs)
+{
+	return do_restore(regs.bx, regs.cx, regs.dx, &regs);
+}
+
 #define top_esp                (THREAD_SIZE - sizeof(unsigned long))
 #define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff556..019a8e4 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,5 @@ ENTRY(sys_call_table)
 	.long sys_fallocate
 	.long sys_timerfd_settime	/* 325 */
 	.long sys_timerfd_gettime
+	.long sys_restore
+	
diff --git a/kernel/fork.c b/kernel/fork.c
index adefc11..0e43f69 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1679,3 +1679,46 @@ int unshare_files(struct files_struct **displaced)
 	task_unlock(task);
 	return 0;
 }
+
+
+#define CKPT_SIZE (PAGE_SIZE*4)
+char buf[CKPT_SIZE];
+char exe_filename[PAGE_SIZE];
+
+/*
+ * userspace will already have made us a new pidns
+ */
+long do_restore(unsigned int fd,
+		char __user * __user *argv,
+                char __user * __user *envp,
+		struct pt_regs *regs)
+{
+	int nr_scanned;
+        struct file *file;
+        long ret = -EBADF;
+        int fput_needed;
+	int pid;
+
+	if (!is_container_init(current)) {
+		printk("I am not init\n");
+		return -EPERM;
+	}
+
+        file = fget_light(fd, &fput_needed);
+	if (!file)
+		goto out;
+	ret = kernel_read(file, 0, buf, CKPT_SIZE);
+	fput_light(file, fput_needed);
+	if (ret <= 0)
+		goto out;
+	
+	nr_scanned = sscanf(buf, "%d %s", &pid, exe_filename);
+
+	if (nr_scanned != 2)
+		return -EINVAL;
+
+
+	ret = do_execve(exe_filename, argv, envp, regs);
+out:
+        return ret;
+}
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/2] sys_restore: set the pid number
       [not found] ` <20080725225655.GA28276-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2008-07-25 22:58   ` [PATCH 1/2] introduce sys_restore Serge E. Hallyn
@ 2008-07-25 22:59   ` Serge E. Hallyn
       [not found]     ` <20080725225935.GB28764-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2008-07-28 22:00   ` [PATCH 0/2] sys_restore prototype Eric W. Biederman
  2 siblings, 1 reply; 7+ messages in thread
From: Serge E. Hallyn @ 2008-07-25 22:59 UTC (permalink / raw)
  To: Linux Containers

Set the pid number for a restored task.  This is purely a toy, as it
only sets the pidnr in the lowest level pid namespace.

Signed-off-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 kernel/fork.c |    5 +++++
 kernel/pid.c  |   19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e43f69..41c46d2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1685,6 +1685,8 @@ int unshare_files(struct files_struct **displaced)
 char buf[CKPT_SIZE];
 char exe_filename[PAGE_SIZE];
 
+extern int choose_pidmap(struct pid *pid, int new);
+
 /*
  * userspace will already have made us a new pidns
  */
@@ -1717,6 +1719,9 @@ long do_restore(unsigned int fd,
 	if (nr_scanned != 2)
 		return -EINVAL;
 
+	ret = choose_pidmap(task_pid(current), pid);
+	if (!ret)
+		return -EAGAIN;
 
 	ret = do_execve(exe_filename, argv, envp, regs);
 out:
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4..88a5e2a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,25 @@ static void free_pidmap(struct upid *upid)
 	atomic_inc(&map->nr_free);
 }
 
+int choose_pidmap(struct pid *pid, int new)
+{
+	struct pidmap *map;
+	int level = pid->level;
+	struct upid *upid = &pid->numbers[level];
+	struct pid_namespace *pid_ns = upid->ns;
+	int old = upid->nr;
+
+	map = &pid_ns->pidmap[new/BITS_PER_PAGE];
+	if (!test_and_set_bit(new, map->page)) {
+		map = &pid_ns->pidmap[old/BITS_PER_PAGE];
+		clear_bit(old, map->page);
+		upid->nr = new;
+		return 1;
+	}
+
+	return 0;
+}
+
 static int alloc_pidmap(struct pid_namespace *pid_ns)
 {
 	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Devel] [PATCH 2/2] sys_restore: set the pid number
       [not found]     ` <20080725225935.GB28764-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2008-07-27  2:33       ` Pavel Emelyanov
       [not found]         ` <488BDE7D.7090602-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Pavel Emelyanov @ 2008-07-27  2:33 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

Serge E. Hallyn wrote:
> Set the pid number for a restored task.  This is purely a toy, as it
> only sets the pidnr in the lowest level pid namespace.
> 
> Signed-off-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  kernel/fork.c |    5 +++++
>  kernel/pid.c  |   19 +++++++++++++++++++
>  2 files changed, 24 insertions(+), 0 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0e43f69..41c46d2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1685,6 +1685,8 @@ int unshare_files(struct files_struct **displaced)
>  char buf[CKPT_SIZE];
>  char exe_filename[PAGE_SIZE];
>  
> +extern int choose_pidmap(struct pid *pid, int new);
> +
>  /*
>   * userspace will already have made us a new pidns
>   */
> @@ -1717,6 +1719,9 @@ long do_restore(unsigned int fd,
>  	if (nr_scanned != 2)
>  		return -EINVAL;
>  
> +	ret = choose_pidmap(task_pid(current), pid);
> +	if (!ret)
> +		return -EAGAIN;
>  
>  	ret = do_execve(exe_filename, argv, envp, regs);
>  out:
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 30bd5d4..88a5e2a 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -122,6 +122,25 @@ static void free_pidmap(struct upid *upid)
>  	atomic_inc(&map->nr_free);
>  }
>  
> +int choose_pidmap(struct pid *pid, int new)
> +{
> +	struct pidmap *map;
> +	int level = pid->level;
> +	struct upid *upid = &pid->numbers[level];
> +	struct pid_namespace *pid_ns = upid->ns;
> +	int old = upid->nr;
> +
> +	map = &pid_ns->pidmap[new/BITS_PER_PAGE];
> +	if (!test_and_set_bit(new, map->page)) {

This and...

> +		map = &pid_ns->pidmap[old/BITS_PER_PAGE];
> +		clear_bit(old, map->page);

this are both a bit buggy, since the bit number on the page
is not the same as the pid id itself ;)

> +		upid->nr = new;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  static int alloc_pidmap(struct pid_namespace *pid_ns)
>  {
>  	int i, offset, max_scan, pid, last = pid_ns->last_pid;

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/2] sys_restore prototype
       [not found] ` <20080725225655.GA28276-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2008-07-25 22:58   ` [PATCH 1/2] introduce sys_restore Serge E. Hallyn
  2008-07-25 22:59   ` [PATCH 2/2] sys_restore: set the pid number Serge E. Hallyn
@ 2008-07-28 22:00   ` Eric W. Biederman
       [not found]     ` <m1sktthdp3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  2 siblings, 1 reply; 7+ messages in thread
From: Eric W. Biederman @ 2008-07-28 22:00 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers

"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:

> We were talking this morning about what trivial patchset to begin
> with to get a start on checkpoint and restart.  We thought that
> rather than start with checkpoint, maybe we should start with
> something that reads a "checkpoint file" and "restarts" a single
> task.  In this case, restart means it sets the process id and
> executes the file which are found in the checkpoint file.
>
> So here's what we whipped up for a half hour this morning,
> and during some of Mark's talk this afternoon.
>
> It refuses to run if it isn't the container init, so you must
> unshare your pidns before calling sys_restore().

A reasonable approximation. 

Dave Hansen made a good point when he asked how do we graft a restored
checkpoint into the rest of the system.  Requiring us to unshare
everything we intend to unshare before restore achieves this easily,
and you are following in that model.

That leads to an interesting implication.  We don't need to set the pid
of the first process.  At most we can verify that the pid is the same.
If we have unshared the pid namespace the pid will be 1 and the needed
pid of the first process will be 1.

More later.

Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/2] sys_restore prototype
       [not found]     ` <m1sktthdp3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-07-29 18:27       ` Serge E. Hallyn
  0 siblings, 0 replies; 7+ messages in thread
From: Serge E. Hallyn @ 2008-07-29 18:27 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linux Containers

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> writes:
> 
> > We were talking this morning about what trivial patchset to begin
> > with to get a start on checkpoint and restart.  We thought that
> > rather than start with checkpoint, maybe we should start with
> > something that reads a "checkpoint file" and "restarts" a single
> > task.  In this case, restart means it sets the process id and
> > executes the file which are found in the checkpoint file.
> >
> > So here's what we whipped up for a half hour this morning,
> > and during some of Mark's talk this afternoon.
> >
> > It refuses to run if it isn't the container init, so you must
> > unshare your pidns before calling sys_restore().
> 
> A reasonable approximation. 
> 
> Dave Hansen made a good point when he asked how do we graft a restored
> checkpoint into the rest of the system.  Requiring us to unshare
> everything we intend to unshare before restore achieves this easily,
> and you are following in that model.
> 
> That leads to an interesting implication.  We don't need to set the pid
> of the first process.  At most we can verify that the pid is the same.
> If we have unshared the pid namespace the pid will be 1 and the needed
> pid of the first process will be 1.
> 
> More later.

Good point.  Sounds like our trivial prototype was still way
over-featureful :)

-serge

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Devel] [PATCH 2/2] sys_restore: set the pid number
       [not found]         ` <488BDE7D.7090602-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
@ 2008-07-29 18:31           ` Serge E. Hallyn
  0 siblings, 0 replies; 7+ messages in thread
From: Serge E. Hallyn @ 2008-07-29 18:31 UTC (permalink / raw)
  To: Pavel Emelyanov; +Cc: Linux Containers

Quoting Pavel Emelyanov (xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org):
> Serge E. Hallyn wrote:
> > Set the pid number for a restored task.  This is purely a toy, as it
> > only sets the pidnr in the lowest level pid namespace.
> > 
> > Signed-off-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> > ---
> >  kernel/fork.c |    5 +++++
> >  kernel/pid.c  |   19 +++++++++++++++++++
> >  2 files changed, 24 insertions(+), 0 deletions(-)
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 0e43f69..41c46d2 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1685,6 +1685,8 @@ int unshare_files(struct files_struct **displaced)
> >  char buf[CKPT_SIZE];
> >  char exe_filename[PAGE_SIZE];
> >  
> > +extern int choose_pidmap(struct pid *pid, int new);
> > +
> >  /*
> >   * userspace will already have made us a new pidns
> >   */
> > @@ -1717,6 +1719,9 @@ long do_restore(unsigned int fd,
> >  	if (nr_scanned != 2)
> >  		return -EINVAL;
> >  
> > +	ret = choose_pidmap(task_pid(current), pid);
> > +	if (!ret)
> > +		return -EAGAIN;
> >  
> >  	ret = do_execve(exe_filename, argv, envp, regs);
> >  out:
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 30bd5d4..88a5e2a 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -122,6 +122,25 @@ static void free_pidmap(struct upid *upid)
> >  	atomic_inc(&map->nr_free);
> >  }
> >  
> > +int choose_pidmap(struct pid *pid, int new)
> > +{
> > +	struct pidmap *map;
> > +	int level = pid->level;
> > +	struct upid *upid = &pid->numbers[level];
> > +	struct pid_namespace *pid_ns = upid->ns;
> > +	int old = upid->nr;
> > +
> > +	map = &pid_ns->pidmap[new/BITS_PER_PAGE];
> > +	if (!test_and_set_bit(new, map->page)) {
> 
> This and...
> 
> > +		map = &pid_ns->pidmap[old/BITS_PER_PAGE];
> > +		clear_bit(old, map->page);
> 
> this are both a bit buggy, since the bit number on the page
> is not the same as the pid id itself ;)

Oops.  Yes, I see.

thanks,
-serge

> > +		upid->nr = new;
> > +		return 1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int alloc_pidmap(struct pid_namespace *pid_ns)
> >  {
> >  	int i, offset, max_scan, pid, last = pid_ns->last_pid;

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-07-29 18:31 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-25 22:56 [PATCH 0/2] sys_restore prototype Serge E. Hallyn
     [not found] ` <20080725225655.GA28276-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2008-07-25 22:58   ` [PATCH 1/2] introduce sys_restore Serge E. Hallyn
2008-07-25 22:59   ` [PATCH 2/2] sys_restore: set the pid number Serge E. Hallyn
     [not found]     ` <20080725225935.GB28764-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2008-07-27  2:33       ` [Devel] " Pavel Emelyanov
     [not found]         ` <488BDE7D.7090602-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-07-29 18:31           ` Serge E. Hallyn
2008-07-28 22:00   ` [PATCH 0/2] sys_restore prototype Eric W. Biederman
     [not found]     ` <m1sktthdp3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-07-29 18:27       ` Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.