All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] checkpoint/restart: Add support for epoll
@ 2009-07-14 23:54 Matt Helsley
       [not found] ` <20090714235405.GH5213-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Matt Helsley @ 2009-07-14 23:54 UTC (permalink / raw)
  To: Containers

 Add checkpoint/restart support for epoll files. This is the minimum
 support necessary to recreate the epoll item sets without any pending
 events.

 This is an RFC to show where I'm going with the patch and give an idea
 of how much code I expect it will take. Compiles and boots on x86 but
 I haven't tested it.

 Caveats: Does not correctly support restoring epoll fds to epoll sets
	  as this requires some recursion detection/avoidance. See the
	  "TODO" that mentions deferqueues.

	  Does not attempt to save pending event information for
	  delivery during restart.

	  TODO: Look into if we need a restart block for epoll_wait().

Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c             |   13 +++
 fs/eventpoll.c                 |  181 +++++++++++++++++++++++++++++++++++++++-
 include/linux/checkpoint_hdr.h |   15 ++++
 include/linux/eventpoll.h      |   14 +++-
 4 files changed, 221 insertions(+), 2 deletions(-)

diff --git a/checkpoint/files.c b/checkpoint/files.c
index 5ca2e6c..7233a9b 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -484,6 +484,11 @@ struct restore_file_ops {
 				  struct ckpt_hdr_file *ptr);
 };
 
+#ifdef CONFIG_EPOLL
+extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
+				    struct ckpt_hdr_file *ptr);
+#endif
+
 static struct restore_file_ops restore_file_ops[] = {
 	/* ignored file */
 	{
@@ -503,6 +508,14 @@ static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_PIPE,
 		.restore = pipe_file_restore,
 	},
+#ifdef CONFIG_EPOLL
+	/* epoll */
+	{
+		.file_name = "EPOLL",
+		.file_type = CKPT_FILE_EPOLL,
+		.restore = ep_file_restore,
+	},
+#endif
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80..9b2414b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -668,10 +668,17 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	return pollflags != -1 ? pollflags : 0;
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+#else
+#define ep_eventpoll_checkpoint NULL
+#endif
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
-	.poll		= ep_eventpoll_poll
+	.poll		= ep_eventpoll_poll,
+	.checkpoint 	= ep_eventpoll_checkpoint,
 };
 
 /* Fast test to see if the file is an evenpoll file */
@@ -1410,6 +1417,178 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_eventpoll *h;
+	struct ckpt_eventpoll_item *cepi;
+	struct rb_node *rbp;
+	struct eventpoll *ep;
+	int nitems = 0, rc = -ENOMEM;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_EPOLL);
+	if (!h)
+		return rc;
+
+	ep = file->private_data;
+	/* TODO see if we need mutex_lock(&ep->mtx);*/
+	for (rbp = rb_first(&ep->rbr); rbp && nitems++; rbp = rb_next(rbp)) {}
+	h->num_items = nitems;
+	h->common.f_type = CKPT_FILE_EPOLL;
+	rc = checkpoint_file_common(ctx, file, &h->common);
+	if (!rc) {
+		/*
+		 * We write this in such a weird way! The problem is we want
+		 * to use the common file checkpoint code above but we also
+		 * want a variable number of saved epitems to follow the
+		 * num_items field. So we write out the header type and length,
+		 * then we write the remaining, fixed-size part of the struct.
+		 * Finally we'll write each epitem by walking the rbtree.
+		 */
+		h->common.h.len += nitems*sizeof(*cepi);
+		rc = ckpt_write_obj_type(ctx, NULL, h->common.h.len,
+					 h->common.h.type);
+		ckpt_kwrite(ctx, ((void*)&h->common.h) + sizeof(h->common.h),
+			    sizeof(*h) - sizeof(h->common.h));
+	}
+	ckpt_hdr_put(ctx, h);
+	if (rc)
+		goto out;
+
+	/* 
+	 * FIXME for now we do it one at a time. Later we might do it like
+	 * checkpoint_pids() for better performance since there can be many
+	 * more epoll items than pids.
+	 */
+	cepi = ckpt_hdr_get(ctx, sizeof(*cepi));
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
+		cepi->fd = epi->ffd.fd;
+		cepi->events = epi->event.events;
+		cepi->data = epi->event.data;
+		if (ckpt_kwrite(ctx, cepi, sizeof(*cepi)) < 0)
+			break;
+	}
+	_ckpt_hdr_put(ctx, cepi, sizeof(*cepi));
+out:
+	/*mutex_unlock(&ep->mtx);*/
+	return rc;
+}
+
+struct file* ep_file_restore(struct ckpt_ctx *ctx,
+			     struct ckpt_hdr_file *ptr)
+{
+	struct ckpt_hdr_file_eventpoll *h;
+	struct eventpoll *ep;
+	struct file *epfile;
+	int epfd, error, i;
+
+	h = container_of(ptr, struct ckpt_hdr_file_eventpoll, common);
+	if (h->common.h.type != CKPT_HDR_FILE_EPOLL ||
+	    h->common.h.len < sizeof(*h) ||
+	    h->common.f_type != CKPT_FILE_EPOLL)
+		return ERR_PTR(-EINVAL);
+
+	/* limit max ep watches and the use of flags */
+	if (h->num_items >= max_user_watches)
+		return ERR_PTR(-ENOSPC);
+	if (h->common.f_flags & ~EPOLL_CLOEXEC)
+		return ERR_PTR(-EINVAL);
+
+	/* similar to epoll_create() */
+	error = ep_alloc(&ep);
+	if (error < 0)
+		return ERR_PTR(error);
+	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+				 ptr->f_flags & O_CLOEXEC);
+	if (error < 0) {
+		ep_free(ep);
+		return ERR_PTR(error);
+	}
+
+	/* Everything's allocated, we just need a file* */
+	epfd = error;
+	epfile = fget(epfd);
+	BUG_ON(!epfile);
+
+	/* Restore the common file bits */
+	i = 0;
+	error = restore_file_common(ctx, epfile, ptr);
+	if (error < 0)
+		goto error_return;
+
+	/* Restore the epoll items/watches */
+	for (; i < h->num_items; i++) {
+		/* 
+		 * Loop body like multiple epoll_ctl(ep, ADD, event)
+		 * calls except we've already done much of the checking.
+		 */
+		struct ckpt_eventpoll_item cepi;
+		struct epoll_event epev;
+		struct epitem *epi;
+		struct file *tfile;
+
+		error = ckpt_kread(ctx, &cepi, sizeof(cepi));
+		if (error < 0) {
+			i++;
+			break;
+		}
+		epev.events = cepi.events;
+		epev.data = cepi.data;
+
+		/* Get the file* for the target file */
+		error = -EFAULT;
+		tfile = fget(cepi.fd);
+		if (!tfile) {
+			/*
+			 * TODO delayed addition of epitem because 
+			 * tfile hasn't been restored yet.
+			 */
+			continue;
+		}
+
+		/* The target file descriptor must support poll */
+		error = -EPERM;
+		if (!tfile->f_op || !tfile->f_op->poll)
+			goto error_tgt_fput;
+
+		/* Cannot add an epoll file descriptor inside itself. */
+		error = -EINVAL;
+		if (epfile == tfile)
+			goto error_tgt_fput;
+
+		/* mutex_lock(&ep->mtx); TODO check if we need */
+		epi = ep_find(ep, tfile, cepi.fd);
+		if (!epi) {
+			epev.events |= POLLERR | POLLHUP;
+			error = ep_insert(ep, &epev, tfile, cepi.fd);
+		} else
+			error = -EEXIST;
+		/* mutex_unlock(&ep->mtx); */
+error_tgt_fput:
+		fput(tfile);
+		if (error < 0)
+			break;
+	}
+error_return:
+	/* Read the remaining number of items. */
+	for (; i < h->num_items; i++) {
+		struct ckpt_eventpoll_item cepi;
+		ckpt_kread(ctx, &cepi, sizeof(cepi));
+	}
+	if (error < 0) {
+		/* TODO closeup epfile and epfd */
+		fput(epfile);
+		sys_close(epfd);
+		epfile = ERR_PTR(error);
+	}
+	return epfile;
+}
+#endif /* CONFIG_CHECKPOINT */
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 874518c..58f7334 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -64,6 +64,7 @@ enum {
 	CKPT_HDR_FILE_NAME,
 	CKPT_HDR_FILE,
 	CKPT_HDR_FILE_PIPE,
+	CKPT_HDR_FILE_EPOLL,
 
 	CKPT_HDR_MM = 401,
 	CKPT_HDR_VMA,
@@ -226,6 +227,7 @@ enum file_type {
 	CKPT_FILE_IGNORE = 0,
 	CKPT_FILE_GENERIC,
 	CKPT_FILE_PIPE,
+	CKPT_FILE_EPOLL,
 	CKPT_FILE_MAX
 };
 
@@ -254,6 +256,19 @@ struct ckpt_hdr_file_pipe_state {
 	__s32 pipe_len;
 } __attribute__((aligned(8)));
 
+struct ckpt_eventpoll_item {
+	__u32 fd;
+	__u32 events;
+	__u64 data;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_file_eventpoll {
+	struct ckpt_hdr_file common;
+	__u32  num_items;
+	__u32  padding;
+	struct ckpt_eventpoll_item items[0];
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_utsns {
 	struct ckpt_hdr h;
 	__u32 nodename_len;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..802053a 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -95,11 +95,23 @@ static inline void eventpoll_release(struct file *file)
 	eventpoll_release_file(file);
 }
 
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
+				    struct ckpt_hdr_file *ptr);
+#endif
 #else
 
 static inline void eventpoll_init_file(struct file *file) {}
 static inline void eventpoll_release(struct file *file) {}
-
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+static inline struct file* ep_file_restore(struct ckpt_ctx *ctx,
+					   struct ckpt_hdr_file *ptr)
+{
+	return NULL;
+}
+#endif
 #endif
 
 #endif /* #ifdef __KERNEL__ */
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC][PATCH] checkpoint/restart: Add support for epoll
       [not found] ` <20090714235405.GH5213-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-07-15  9:22   ` Cedric Le Goater
  2009-07-24  8:35   ` Oren Laadan
  1 sibling, 0 replies; 5+ messages in thread
From: Cedric Le Goater @ 2009-07-15  9:22 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Containers

On 07/15/2009 01:54 AM, Matt Helsley wrote:
>   Add checkpoint/restart support for epoll files. This is the minimum
>   support necessary to recreate the epoll item sets without any pending
>   events.
>
>   This is an RFC to show where I'm going with the patch and give an idea
>   of how much code I expect it will take. Compiles and boots on x86 but
>   I haven't tested it.
>
>   Caveats: Does not correctly support restoring epoll fds to epoll sets
> 	  as this requires some recursion detection/avoidance. See the
> 	  "TODO" that mentions deferqueues.

it all depends on the algorithm of C/R for files.

the 'contents', the watched fds in your case, of the 'struct file' should be
restored after all the struct files have been restored. you'll have the same
issue with other fd types.

Also,I think it it possible to add 2 epoll services providing get and set without
mixing the epoll code with C/R code. ugly, IHMO. something like :

	diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
	index f6856a5..b19b7a1 100644
	--- a/include/linux/eventpoll.h
	+++ b/include/linux/eventpoll.h
	@@ -49,6 +49,12 @@ struct epoll_event {
	        __u64 data;
	 } EPOLL_PACKED;
  
	+struct epoll_fd_event {
	+       int fd;
	+       struct file* file;
	+       struct epoll_event event;
	+} EPOLL_PACKED;
	+
	 #ifdef __KERNEL__
	
	 /* Forward declarations to avoid compiler errors */
	@@ -95,6 +101,9 @@ static inline void eventpoll_release(struct file *file)
	        eventpoll_release_file(file);
	 }
  
	+extern int eventpoll_get_events(int, struct epoll_fd_event *, int);
	+extern int eventpoll_insert(int, int, struct file*, struct epoll_event *);
	+
	 #else
	
	 static inline void eventpoll_init_file(struct file *file) {}

finally,


> +	/*
> +	 * FIXME for now we do it one at a time. Later we might do it like
> +	 * checkpoint_pids() for better performance since there can be many
> +	 * more epoll items than pids.
> +	 */
> +	cepi = ckpt_hdr_get(ctx, sizeof(*cepi));
> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> +		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
> +		cepi->fd = epi->ffd.fd;

epi->ffd.fd and epi->ffd.file are not necessarily related. fd could have been
recycled after the EPOLL_CTL_ADD. nop ? and it will break the restart.

C.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC][PATCH] checkpoint/restart: Add support for epoll
       [not found] ` <20090714235405.GH5213-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
  2009-07-15  9:22   ` Cedric Le Goater
@ 2009-07-24  8:35   ` Oren Laadan
       [not found]     ` <4A697242.60606-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 5+ messages in thread
From: Oren Laadan @ 2009-07-24  8:35 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Containers



Matt Helsley wrote:
>  Add checkpoint/restart support for epoll files. This is the minimum
>  support necessary to recreate the epoll item sets without any pending
>  events.
> 
>  This is an RFC to show where I'm going with the patch and give an idea
>  of how much code I expect it will take. Compiles and boots on x86 but
>  I haven't tested it.
> 
>  Caveats: Does not correctly support restoring epoll fds to epoll sets
> 	  as this requires some recursion detection/avoidance. See the
> 	  "TODO" that mentions deferqueues.
> 
> 	  Does not attempt to save pending event information for
> 	  delivery during restart.
> 
> 	  TODO: Look into if we need a restart block for epoll_wait().

Since epoll restore can only complete after all fd's of a task have
been restores, there is little advantage to even attempt restore
earlier, on the fly.

Instead, I suggest that first epoll fd's be created (like all other
files), and after all fd's are in place, the epoll state be restored.

To avoid keeping potentially large data describing this state in the
kernel, modify checkpoint to only dump the internal state of epoll
fd's after all other fd's of the task have been dumped.

In both cases deferqueue is our friend:

At checkpoint, have do_checkpoint_file_table() setup a deferqueue
to which checkpoint_file_desc() may add work. The deferqueue will
be executed at the end of do_checkpoint_file_table(), and dump the
state of each epoll fd.

At restart, do_restore_file_table() will do the same: setup a
deferqueue and use it to restore the state of all epoll fd's.

> 
> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/files.c             |   13 +++
>  fs/eventpoll.c                 |  181 +++++++++++++++++++++++++++++++++++++++-
>  include/linux/checkpoint_hdr.h |   15 ++++
>  include/linux/eventpoll.h      |   14 +++-
>  4 files changed, 221 insertions(+), 2 deletions(-)
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 5ca2e6c..7233a9b 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -484,6 +484,11 @@ struct restore_file_ops {
>  				  struct ckpt_hdr_file *ptr);
>  };
>  
> +#ifdef CONFIG_EPOLL
> +extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
> +				    struct ckpt_hdr_file *ptr);
> +#endif
> +

Already in eventpoll.h ?

>  static struct restore_file_ops restore_file_ops[] = {
>  	/* ignored file */
>  	{
> @@ -503,6 +508,14 @@ static struct restore_file_ops restore_file_ops[] = {
>  		.file_type = CKPT_FILE_PIPE,
>  		.restore = pipe_file_restore,
>  	},
> +#ifdef CONFIG_EPOLL
> +	/* epoll */
> +	{
> +		.file_name = "EPOLL",
> +		.file_type = CKPT_FILE_EPOLL,
> +		.restore = ep_file_restore,
> +	},
> +#endif
>  };
>  
>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 5458e80..9b2414b 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -668,10 +668,17 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
>  	return pollflags != -1 ? pollflags : 0;
>  }
>  
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +#else
> +#define ep_eventpoll_checkpoint NULL
> +#endif
> +
>  /* File callbacks that implement the eventpoll file behaviour */
>  static const struct file_operations eventpoll_fops = {
>  	.release	= ep_eventpoll_release,
> -	.poll		= ep_eventpoll_poll
> +	.poll		= ep_eventpoll_poll,
> +	.checkpoint 	= ep_eventpoll_checkpoint,
>  };
>  
>  /* Fast test to see if the file is an evenpoll file */
> @@ -1410,6 +1417,178 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
>  
>  #endif /* HAVE_SET_RESTORE_SIGMASK */
>  
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +

Each file/fd registered in an epoll fd produces a reference count
to the target file, this needs to be taken into account for leak
detection when collecting references.

I'm thinking of adding a new fops method for files for this purpose:
e.g. collect(), such that collect_file_desc() will invoke this method
on the file if that method is non-NULL.

(Turns out that it's useful also for ptys/ttys, since the underlying
tty also needs to be counted for leaks).

For epoll, the collect() method will add all those files that are
being tracked.

> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct ckpt_hdr_file_eventpoll *h;
> +	struct ckpt_eventpoll_item *cepi;
> +	struct rb_node *rbp;
> +	struct eventpoll *ep;
> +	int nitems = 0, rc = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_EPOLL);
> +	if (!h)
> +		return rc;
> +
> +	ep = file->private_data;
> +	/* TODO see if we need mutex_lock(&ep->mtx);*/

Yes we do, especially for subtree (non-full-container) checkpoints
where another, not-checkpointed process, may modify it concurrently.

> +	for (rbp = rb_first(&ep->rbr); rbp && nitems++; rbp = rb_next(rbp)) {}
> +	h->num_items = nitems;
> +	h->common.f_type = CKPT_FILE_EPOLL;
> +	rc = checkpoint_file_common(ctx, file, &h->common);
> +	if (!rc) {
> +		/*
> +		 * We write this in such a weird way! The problem is we want
> +		 * to use the common file checkpoint code above but we also
> +		 * want a variable number of saved epitems to follow the
> +		 * num_items field. So we write out the header type and length,
> +		 * then we write the remaining, fixed-size part of the struct.
> +		 * Finally we'll write each epitem by walking the rbtree.
> +		 */

If we write the epoll-specific state later (as suggested above),
then this weirdness disappears.

And if not, I suggest that you separate this header from the actual
epoll-specific state, namely the array of epoll elements. The latter
should go into its own object.

Besides, during restart, the entire object will be read into memory,
and the array can be (or be made) very large.

> +		h->common.h.len += nitems*sizeof(*cepi);
> +		rc = ckpt_write_obj_type(ctx, NULL, h->common.h.len,
> +					 h->common.h.type);
> +		ckpt_kwrite(ctx, ((void*)&h->common.h) + sizeof(h->common.h),
> +			    sizeof(*h) - sizeof(h->common.h));
> +	}
> +	ckpt_hdr_put(ctx, h);
> +	if (rc)
> +		goto out;
> +
> +	/* 
> +	 * FIXME for now we do it one at a time. Later we might do it like
> +	 * checkpoint_pids() for better performance since there can be many
> +	 * more epoll items than pids.
> +	 */

Defer the writing below ...

> +	cepi = ckpt_hdr_get(ctx, sizeof(*cepi));
> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> +		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
> +		cepi->fd = epi->ffd.fd;
> +		cepi->events = epi->event.events;
> +		cepi->data = epi->event.data;
> +		if (ckpt_kwrite(ctx, cepi, sizeof(*cepi)) < 0)
> +			break;
> +	}
> +	_ckpt_hdr_put(ctx, cepi, sizeof(*cepi));
> +out:
> +	/*mutex_unlock(&ep->mtx);*/
> +	return rc;
> +}
> +
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> +			     struct ckpt_hdr_file *ptr)
> +{
> +	struct ckpt_hdr_file_eventpoll *h;
> +	struct eventpoll *ep;
> +	struct file *epfile;
> +	int epfd, error, i;
> +
> +	h = container_of(ptr, struct ckpt_hdr_file_eventpoll, common);
> +	if (h->common.h.type != CKPT_HDR_FILE_EPOLL ||
> +	    h->common.h.len < sizeof(*h) ||
> +	    h->common.f_type != CKPT_FILE_EPOLL)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* limit max ep watches and the use of flags */
> +	if (h->num_items >= max_user_watches)
> +		return ERR_PTR(-ENOSPC);

This check should be against the sum of all epoll fd's per user.
It will take place elsewhere, and here it's incomplete and doesn't
add much.

> +	if (h->common.f_flags & ~EPOLL_CLOEXEC)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* similar to epoll_create() */
> +	error = ep_alloc(&ep);
> +	if (error < 0)
> +		return ERR_PTR(error);
> +	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
> +				 ptr->f_flags & O_CLOEXEC);
> +	if (error < 0) {
> +		ep_free(ep);
> +		return ERR_PTR(error);
> +	}
> +
> +	/* Everything's allocated, we just need a file* */
> +	epfd = error;
> +	epfile = fget(epfd);
> +	BUG_ON(!epfile);

Is there a reason not to use sys_epoll_create(), or refactor it
and use the common code ?

> +
> +	/* Restore the common file bits */
> +	i = 0;
> +	error = restore_file_common(ctx, epfile, ptr);
> +	if (error < 0)
> +		goto error_return;
> +
> +	/* Restore the epoll items/watches */
> +	for (; i < h->num_items; i++) {

Defer these ...

> +		/* 
> +		 * Loop body like multiple epoll_ctl(ep, ADD, event)
> +		 * calls except we've already done much of the checking.
> +		 */
> +		struct ckpt_eventpoll_item cepi;
> +		struct epoll_event epev;
> +		struct epitem *epi;
> +		struct file *tfile;
> +
> +		error = ckpt_kread(ctx, &cepi, sizeof(cepi));
> +		if (error < 0) {
> +			i++;
> +			break;
> +		}
> +		epev.events = cepi.events;
> +		epev.data = cepi.data;

The code below is a duplicate of sys_epoll_ctl(). Is there a reason
not to use that code, or refactor it and share the common code ?

> +
> +		/* Get the file* for the target file */
> +		error = -EFAULT;
> +		tfile = fget(cepi.fd);
> +		if (!tfile) {
> +			/*
> +			 * TODO delayed addition of epitem because 
> +			 * tfile hasn't been restored yet.
> +			 */
> +			continue;
> +		}
> +
> +		/* The target file descriptor must support poll */
> +		error = -EPERM;
> +		if (!tfile->f_op || !tfile->f_op->poll)
> +			goto error_tgt_fput;
> +
> +		/* Cannot add an epoll file descriptor inside itself. */
> +		error = -EINVAL;
> +		if (epfile == tfile)
> +			goto error_tgt_fput;
> +
> +		/* mutex_lock(&ep->mtx); TODO check if we need */

Yes, please.

> +		epi = ep_find(ep, tfile, cepi.fd);
> +		if (!epi) {
> +			epev.events |= POLLERR | POLLHUP;
> +			error = ep_insert(ep, &epev, tfile, cepi.fd);
> +		} else
> +			error = -EEXIST;
> +		/* mutex_unlock(&ep->mtx); */
> +error_tgt_fput:
> +		fput(tfile);
> +		if (error < 0)
> +			break;
> +	}
> +error_return:
> +	/* Read the remaining number of items. */
> +	for (; i < h->num_items; i++) {
> +		struct ckpt_eventpoll_item cepi;
> +		ckpt_kread(ctx, &cepi, sizeof(cepi));
> +	}

What's the purpose of this ?

> +	if (error < 0) {
> +		/* TODO closeup epfile and epfd */
> +		fput(epfile);
> +		sys_close(epfd);

sys_close() should happen regardless of whether we succeeded or
failed.

> +		epfile = ERR_PTR(error);
> +	}
> +	return epfile;
> +}
> +#endif /* CONFIG_CHECKPOINT */
> +

[...]

Oren.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC][PATCH] checkpoint/restart: Add support for epoll
       [not found]     ` <4A697242.60606-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-07-24 17:04       ` Matt Helsley
       [not found]         ` <20090724170401.GG5878-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Matt Helsley @ 2009-07-24 17:04 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers

On Fri, Jul 24, 2009 at 04:35:14AM -0400, Oren Laadan wrote:
> 
> 
> Matt Helsley wrote:
> >  Add checkpoint/restart support for epoll files. This is the minimum
> >  support necessary to recreate the epoll item sets without any pending
> >  events.
> > 
> >  This is an RFC to show where I'm going with the patch and give an idea
> >  of how much code I expect it will take. Compiles and boots on x86 but
> >  I haven't tested it.
> > 
> >  Caveats: Does not correctly support restoring epoll fds to epoll sets
> > 	  as this requires some recursion detection/avoidance. See the
> > 	  "TODO" that mentions deferqueues.
> > 
> > 	  Does not attempt to save pending event information for
> > 	  delivery during restart.
> > 
> > 	  TODO: Look into if we need a restart block for epoll_wait().
> 
> Since epoll restore can only complete after all fd's of a task have
> been restores, there is little advantage to even attempt restore
> earlier, on the fly.
> 
> Instead, I suggest that first epoll fd's be created (like all other
> files), and after all fd's are in place, the epoll state be restored.
> 
> To avoid keeping potentially large data describing this state in the
> kernel, modify checkpoint to only dump the internal state of epoll
> fd's after all other fd's of the task have been dumped.
> 
> In both cases deferqueue is our friend:
> 
> At checkpoint, have do_checkpoint_file_table() setup a deferqueue
> to which checkpoint_file_desc() may add work. The deferqueue will
> be executed at the end of do_checkpoint_file_table(), and dump the
> state of each epoll fd.
> 
> At restart, do_restore_file_table() will do the same: setup a
> deferqueue and use it to restore the state of all epoll fd's.
> 
> > 
> > Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> > ---
> >  checkpoint/files.c             |   13 +++
> >  fs/eventpoll.c                 |  181 +++++++++++++++++++++++++++++++++++++++-
> >  include/linux/checkpoint_hdr.h |   15 ++++
> >  include/linux/eventpoll.h      |   14 +++-
> >  4 files changed, 221 insertions(+), 2 deletions(-)
> > 
> > diff --git a/checkpoint/files.c b/checkpoint/files.c
> > index 5ca2e6c..7233a9b 100644
> > --- a/checkpoint/files.c
> > +++ b/checkpoint/files.c
> > @@ -484,6 +484,11 @@ struct restore_file_ops {
> >  				  struct ckpt_hdr_file *ptr);
> >  };
> >  
> > +#ifdef CONFIG_EPOLL
> > +extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > +				    struct ckpt_hdr_file *ptr);
> > +#endif
> > +
> 
> Already in eventpoll.h ?

Yup, good point. Fixed.

> 
> >  static struct restore_file_ops restore_file_ops[] = {
> >  	/* ignored file */
> >  	{
> > @@ -503,6 +508,14 @@ static struct restore_file_ops restore_file_ops[] = {
> >  		.file_type = CKPT_FILE_PIPE,
> >  		.restore = pipe_file_restore,
> >  	},
> > +#ifdef CONFIG_EPOLL
> > +	/* epoll */
> > +	{
> > +		.file_name = "EPOLL",
> > +		.file_type = CKPT_FILE_EPOLL,
> > +		.restore = ep_file_restore,
> > +	},
> > +#endif
> >  };
> >  
> >  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> > diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> > index 5458e80..9b2414b 100644
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -668,10 +668,17 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
> >  	return pollflags != -1 ? pollflags : 0;
> >  }
> >  
> > +#ifdef CONFIG_CHECKPOINT
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> > +#else
> > +#define ep_eventpoll_checkpoint NULL
> > +#endif
> > +
> >  /* File callbacks that implement the eventpoll file behaviour */
> >  static const struct file_operations eventpoll_fops = {
> >  	.release	= ep_eventpoll_release,
> > -	.poll		= ep_eventpoll_poll
> > +	.poll		= ep_eventpoll_poll,
> > +	.checkpoint 	= ep_eventpoll_checkpoint,
> >  };
> >  
> >  /* Fast test to see if the file is an evenpoll file */
> > @@ -1410,6 +1417,178 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> >  
> >  #endif /* HAVE_SET_RESTORE_SIGMASK */
> >  
> > +#ifdef CONFIG_CHECKPOINT
> > +#include <linux/checkpoint.h>
> > +#include <linux/checkpoint_hdr.h>
> > +
> 
> Each file/fd registered in an epoll fd produces a reference count
> to the target file, this needs to be taken into account for leak
> detection when collecting references.
> 
> I'm thinking of adding a new fops method for files for this purpose:
> e.g. collect(), such that collect_file_desc() will invoke this method
> on the file if that method is non-NULL.
> 
> (Turns out that it's useful also for ptys/ttys, since the underlying
> tty also needs to be counted for leaks).
> 
> For epoll, the collect() method will add all those files that are
> being tracked.

Sounds good to me -- it was on my TODO list. Here's a rough draft of the
collect routine that I've got:

static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
{
        struct rb_node *rbp;
        struct eventpoll *ep;
        int rc = 0;

#if 0
	/* Not needed if we have a "collect" function ptr, otherwise
	   can be called unconditionally from ckpt collect files path
	   and this will exit early.. */
        if (!is_file_epoll(file))
                return rc;
#endif

        ep = file->private_data;
        mutex_lock(&ep->mtx);
        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), nitems++) {
                struct epitem *epi;

                epi = rb_entry(rbp, struct epitem, rbn);
                rc = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
                if (rc < 0)
			break;
        }
        mutex_unlock(&ep->mtx);
	return rc;
}


> 
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > +	struct ckpt_hdr_file_eventpoll *h;
> > +	struct ckpt_eventpoll_item *cepi;
> > +	struct rb_node *rbp;
> > +	struct eventpoll *ep;
> > +	int nitems = 0, rc = -ENOMEM;
> > +
> > +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_EPOLL);
> > +	if (!h)
> > +		return rc;
> > +
> > +	ep = file->private_data;
> > +	/* TODO see if we need mutex_lock(&ep->mtx);*/
> 
> Yes we do, especially for subtree (non-full-container) checkpoints
> where another, not-checkpointed process, may modify it concurrently.

OK.

> 
> > +	for (rbp = rb_first(&ep->rbr); rbp && nitems++; rbp = rb_next(rbp)) {}
> > +	h->num_items = nitems;
> > +	h->common.f_type = CKPT_FILE_EPOLL;
> > +	rc = checkpoint_file_common(ctx, file, &h->common);
> > +	if (!rc) {
> > +		/*
> > +		 * We write this in such a weird way! The problem is we want
> > +		 * to use the common file checkpoint code above but we also
> > +		 * want a variable number of saved epitems to follow the
> > +		 * num_items field. So we write out the header type and length,
> > +		 * then we write the remaining, fixed-size part of the struct.
> > +		 * Finally we'll write each epitem by walking the rbtree.
> > +		 */
> 
> If we write the epoll-specific state later (as suggested above),
> then this weirdness disappears.
> 
> And if not, I suggest that you separate this header from the actual
> epoll-specific state, namely the array of epoll elements. The latter
> should go into its own object.
> 
> Besides, during restart, the entire object will be read into memory,
> and the array can be (or be made) very large.

Sure.

> 
> > +		h->common.h.len += nitems*sizeof(*cepi);
> > +		rc = ckpt_write_obj_type(ctx, NULL, h->common.h.len,
> > +					 h->common.h.type);
> > +		ckpt_kwrite(ctx, ((void*)&h->common.h) + sizeof(h->common.h),
> > +			    sizeof(*h) - sizeof(h->common.h));
> > +	}
> > +	ckpt_hdr_put(ctx, h);
> > +	if (rc)
> > +		goto out;
> > +
> > +	/* 
> > +	 * FIXME for now we do it one at a time. Later we might do it like
> > +	 * checkpoint_pids() for better performance since there can be many
> > +	 * more epoll items than pids.
> > +	 */
> 
> Defer the writing below ...

OK

> 
> > +	cepi = ckpt_hdr_get(ctx, sizeof(*cepi));
> > +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> > +		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
> > +		cepi->fd = epi->ffd.fd;
> > +		cepi->events = epi->event.events;
> > +		cepi->data = epi->event.data;
> > +		if (ckpt_kwrite(ctx, cepi, sizeof(*cepi)) < 0)
> > +			break;
> > +	}
> > +	_ckpt_hdr_put(ctx, cepi, sizeof(*cepi));
> > +out:
> > +	/*mutex_unlock(&ep->mtx);*/
> > +	return rc;
> > +}
> > +
> > +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > +			     struct ckpt_hdr_file *ptr)
> > +{
> > +	struct ckpt_hdr_file_eventpoll *h;
> > +	struct eventpoll *ep;
> > +	struct file *epfile;
> > +	int epfd, error, i;
> > +
> > +	h = container_of(ptr, struct ckpt_hdr_file_eventpoll, common);
> > +	if (h->common.h.type != CKPT_HDR_FILE_EPOLL ||
> > +	    h->common.h.len < sizeof(*h) ||
> > +	    h->common.f_type != CKPT_FILE_EPOLL)
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	/* limit max ep watches and the use of flags */
> > +	if (h->num_items >= max_user_watches)
> > +		return ERR_PTR(-ENOSPC);
> 
> This check should be against the sum of all epoll fd's per user.
> It will take place elsewhere, and here it's incomplete and doesn't
> add much.

Yeah, I noticed that too. Currently, I've got:

       /* Limit max ep watches. */
        user = get_current_user();
        remaining_watches = (max_user_watches -
                             atomic_read(&user->epoll_watches));
        free_uid(user);
        if (h->num_items > remaining_watches)
                return ERR_PTR(-ENOSPC);

ep_insert() checks user->epoll_watches too. So even the original version
would have failed eventually if the number of watches would have been
exceeded.

The check in ep_insert() also means we'll properly enforce the limit even
if we're running in parallel with other epoll file restores.

So really this check is redundant. I added it originally with the idea
that I might not be able to use ep_insert() directly. Now I'm thinking
it might be better to remove it entirely.

> > +	if (h->common.f_flags & ~EPOLL_CLOEXEC)
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	/* similar to epoll_create() */
> > +	error = ep_alloc(&ep);
> > +	if (error < 0)
> > +		return ERR_PTR(error);
> > +	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
> > +				 ptr->f_flags & O_CLOEXEC);
> > +	if (error < 0) {
> > +		ep_free(ep);
> > +		return ERR_PTR(error);
> > +	}
> > +
> > +	/* Everything's allocated, we just need a file* */
> > +	epfd = error;
> > +	epfile = fget(epfd);
> > +	BUG_ON(!epfile);
> 
> Is there a reason not to use sys_epoll_create(), or refactor it
> and use the common code ?

Yeah, I'll reuse it.

> 
> > +
> > +	/* Restore the common file bits */
> > +	i = 0;
> > +	error = restore_file_common(ctx, epfile, ptr);
> > +	if (error < 0)
> > +		goto error_return;
> > +
> > +	/* Restore the epoll items/watches */
> > +	for (; i < h->num_items; i++) {
> 
> Defer these ...

OK.

> 
> > +		/* 
> > +		 * Loop body like multiple epoll_ctl(ep, ADD, event)
> > +		 * calls except we've already done much of the checking.
> > +		 */
> > +		struct ckpt_eventpoll_item cepi;
> > +		struct epoll_event epev;
> > +		struct epitem *epi;
> > +		struct file *tfile;
> > +
> > +		error = ckpt_kread(ctx, &cepi, sizeof(cepi));
> > +		if (error < 0) {
> > +			i++;
> > +			break;
> > +		}
> > +		epev.events = cepi.events;
> > +		epev.data = cepi.data;
> 
> The code below is a duplicate of sys_epoll_ctl(). Is there a reason
> not to use that code, or refactor it and share the common code ?

I certainly can't reuse it directly since it does a copy_from_user().
Also I've already got the epoll file* and know the op. I'll look for a
good way to factor common pieces from both sys_epoll_ctl() and
ep_file_restore().

> 
> > +
> > +		/* Get the file* for the target file */
> > +		error = -EFAULT;
> > +		tfile = fget(cepi.fd);
> > +		if (!tfile) {
> > +			/*
> > +			 * TODO delayed addition of epitem because 
> > +			 * tfile hasn't been restored yet.
> > +			 */
> > +			continue;
> > +		}
> > +
> > +		/* The target file descriptor must support poll */
> > +		error = -EPERM;
> > +		if (!tfile->f_op || !tfile->f_op->poll)
> > +			goto error_tgt_fput;
> > +
> > +		/* Cannot add an epoll file descriptor inside itself. */
> > +		error = -EINVAL;
> > +		if (epfile == tfile)
> > +			goto error_tgt_fput;
> > +
> > +		/* mutex_lock(&ep->mtx); TODO check if we need */
> 
> Yes, please.

OK.

> 
> > +		epi = ep_find(ep, tfile, cepi.fd);
> > +		if (!epi) {
> > +			epev.events |= POLLERR | POLLHUP;
> > +			error = ep_insert(ep, &epev, tfile, cepi.fd);
> > +		} else
> > +			error = -EEXIST;
> > +		/* mutex_unlock(&ep->mtx); */
> > +error_tgt_fput:
> > +		fput(tfile);
> > +		if (error < 0)
> > +			break;
> > +	}
> > +error_return:
> > +	/* Read the remaining number of items. */
> > +	for (; i < h->num_items; i++) {
> > +		struct ckpt_eventpoll_item cepi;
> > +		ckpt_kread(ctx, &cepi, sizeof(cepi));
> > +	}
> 
> What's the purpose of this ?

If we encounter an error we're left in the middle of the epoll item
array. This effectively seeks to the end of the array. Not needed
when deferring as you suggested since it changes the way we read the
ckpt image..

> 
> > +	if (error < 0) {
> > +		/* TODO closeup epfile and epfd */
> > +		fput(epfile);
> > +		sys_close(epfd);
> 
> sys_close() should happen regardless of whether we succeeded or
> failed.

OK, for some reason I thought restore_file_desc() tried fget() before
resorting to attach_file()...

Thanks for the review.

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC][PATCH] checkpoint/restart: Add support for epoll
       [not found]         ` <20090724170401.GG5878-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-07-24 17:22           ` Oren Laadan
  0 siblings, 0 replies; 5+ messages in thread
From: Oren Laadan @ 2009-07-24 17:22 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Containers



Matt Helsley wrote:
> On Fri, Jul 24, 2009 at 04:35:14AM -0400, Oren Laadan wrote:
>>
>> Matt Helsley wrote:
>>>  Add checkpoint/restart support for epoll files. This is the minimum
>>>  support necessary to recreate the epoll item sets without any pending
>>>  events.
>>>
>>>  This is an RFC to show where I'm going with the patch and give an idea
>>>  of how much code I expect it will take. Compiles and boots on x86 but
>>>  I haven't tested it.
>>>
>>>  Caveats: Does not correctly support restoring epoll fds to epoll sets
>>> 	  as this requires some recursion detection/avoidance. See the
>>> 	  "TODO" that mentions deferqueues.
>>>
>>> 	  Does not attempt to save pending event information for
>>> 	  delivery during restart.
>>>
>>> 	  TODO: Look into if we need a restart block for epoll_wait().
>> Since epoll restore can only complete after all fd's of a task have
>> been restores, there is little advantage to even attempt restore
>> earlier, on the fly.
>>
>> Instead, I suggest that first epoll fd's be created (like all other
>> files), and after all fd's are in place, the epoll state be restored.
>>
>> To avoid keeping potentially large data describing this state in the
>> kernel, modify checkpoint to only dump the internal state of epoll
>> fd's after all other fd's of the task have been dumped.
>>
>> In both cases deferqueue is our friend:
>>
>> At checkpoint, have do_checkpoint_file_table() setup a deferqueue
>> to which checkpoint_file_desc() may add work. The deferqueue will
>> be executed at the end of do_checkpoint_file_table(), and dump the
>> state of each epoll fd.
>>
>> At restart, do_restore_file_table() will do the same: setup a
>> deferqueue and use it to restore the state of all epoll fd's.
>>
>>> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>>> ---
>>>  checkpoint/files.c             |   13 +++
>>>  fs/eventpoll.c                 |  181 +++++++++++++++++++++++++++++++++++++++-
>>>  include/linux/checkpoint_hdr.h |   15 ++++
>>>  include/linux/eventpoll.h      |   14 +++-
>>>  4 files changed, 221 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/checkpoint/files.c b/checkpoint/files.c
>>> index 5ca2e6c..7233a9b 100644
>>> --- a/checkpoint/files.c
>>> +++ b/checkpoint/files.c
>>> @@ -484,6 +484,11 @@ struct restore_file_ops {
>>>  				  struct ckpt_hdr_file *ptr);
>>>  };
>>>  
>>> +#ifdef CONFIG_EPOLL
>>> +extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
>>> +				    struct ckpt_hdr_file *ptr);
>>> +#endif
>>> +
>> Already in eventpoll.h ?
> 
> Yup, good point. Fixed.
> 
>>>  static struct restore_file_ops restore_file_ops[] = {
>>>  	/* ignored file */
>>>  	{
>>> @@ -503,6 +508,14 @@ static struct restore_file_ops restore_file_ops[] = {
>>>  		.file_type = CKPT_FILE_PIPE,
>>>  		.restore = pipe_file_restore,
>>>  	},
>>> +#ifdef CONFIG_EPOLL
>>> +	/* epoll */
>>> +	{
>>> +		.file_name = "EPOLL",
>>> +		.file_type = CKPT_FILE_EPOLL,
>>> +		.restore = ep_file_restore,
>>> +	},
>>> +#endif
>>>  };
>>>  
>>>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
>>> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
>>> index 5458e80..9b2414b 100644
>>> --- a/fs/eventpoll.c
>>> +++ b/fs/eventpoll.c
>>> @@ -668,10 +668,17 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
>>>  	return pollflags != -1 ? pollflags : 0;
>>>  }
>>>  
>>> +#ifdef CONFIG_CHECKPOINT
>>> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
>>> +#else
>>> +#define ep_eventpoll_checkpoint NULL
>>> +#endif
>>> +
>>>  /* File callbacks that implement the eventpoll file behaviour */
>>>  static const struct file_operations eventpoll_fops = {
>>>  	.release	= ep_eventpoll_release,
>>> -	.poll		= ep_eventpoll_poll
>>> +	.poll		= ep_eventpoll_poll,
>>> +	.checkpoint 	= ep_eventpoll_checkpoint,
>>>  };
>>>  
>>>  /* Fast test to see if the file is an evenpoll file */
>>> @@ -1410,6 +1417,178 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
>>>  
>>>  #endif /* HAVE_SET_RESTORE_SIGMASK */
>>>  
>>> +#ifdef CONFIG_CHECKPOINT
>>> +#include <linux/checkpoint.h>
>>> +#include <linux/checkpoint_hdr.h>
>>> +
>> Each file/fd registered in an epoll fd produces a reference count
>> to the target file, this needs to be taken into account for leak
>> detection when collecting references.
>>
>> I'm thinking of adding a new fops method for files for this purpose:
>> e.g. collect(), such that collect_file_desc() will invoke this method
>> on the file if that method is non-NULL.
>>
>> (Turns out that it's useful also for ptys/ttys, since the underlying
>> tty also needs to be counted for leaks).
>>
>> For epoll, the collect() method will add all those files that are
>> being tracked.
> 
> Sounds good to me -- it was on my TODO list. Here's a rough draft of the
> collect routine that I've got:
> 
> static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> {
>         struct rb_node *rbp;
>         struct eventpoll *ep;
>         int rc = 0;
> 
> #if 0
> 	/* Not needed if we have a "collect" function ptr, otherwise
> 	   can be called unconditionally from ckpt collect files path
> 	   and this will exit early.. */
>         if (!is_file_epoll(file))
>                 return rc;
> #endif
> 
>         ep = file->private_data;
>         mutex_lock(&ep->mtx);
>         for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), nitems++) {
>                 struct epitem *epi;
> 
>                 epi = rb_entry(rbp, struct epitem, rbn);
>                 rc = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
>                 if (rc < 0)
> 			break;
>         }
>         mutex_unlock(&ep->mtx);
> 	return rc;
> }

Looks ok to me.

> 
> 
>>> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
>>> +{
>>> +	struct ckpt_hdr_file_eventpoll *h;
>>> +	struct ckpt_eventpoll_item *cepi;
>>> +	struct rb_node *rbp;
>>> +	struct eventpoll *ep;
>>> +	int nitems = 0, rc = -ENOMEM;
>>> +
>>> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_EPOLL);
>>> +	if (!h)
>>> +		return rc;
>>> +
>>> +	ep = file->private_data;
>>> +	/* TODO see if we need mutex_lock(&ep->mtx);*/
>> Yes we do, especially for subtree (non-full-container) checkpoints
>> where another, not-checkpointed process, may modify it concurrently.
> 
> OK.
> 
>>> +	for (rbp = rb_first(&ep->rbr); rbp && nitems++; rbp = rb_next(rbp)) {}
>>> +	h->num_items = nitems;
>>> +	h->common.f_type = CKPT_FILE_EPOLL;
>>> +	rc = checkpoint_file_common(ctx, file, &h->common);
>>> +	if (!rc) {
>>> +		/*
>>> +		 * We write this in such a weird way! The problem is we want
>>> +		 * to use the common file checkpoint code above but we also
>>> +		 * want a variable number of saved epitems to follow the
>>> +		 * num_items field. So we write out the header type and length,
>>> +		 * then we write the remaining, fixed-size part of the struct.
>>> +		 * Finally we'll write each epitem by walking the rbtree.
>>> +		 */
>> If we write the epoll-specific state later (as suggested above),
>> then this weirdness disappears.
>>
>> And if not, I suggest that you separate this header from the actual
>> epoll-specific state, namely the array of epoll elements. The latter
>> should go into its own object.
>>
>> Besides, during restart, the entire object will be read into memory,
>> and the array can be (or be made) very large.
> 
> Sure.
> 
>>> +		h->common.h.len += nitems*sizeof(*cepi);
>>> +		rc = ckpt_write_obj_type(ctx, NULL, h->common.h.len,
>>> +					 h->common.h.type);
>>> +		ckpt_kwrite(ctx, ((void*)&h->common.h) + sizeof(h->common.h),
>>> +			    sizeof(*h) - sizeof(h->common.h));
>>> +	}
>>> +	ckpt_hdr_put(ctx, h);
>>> +	if (rc)
>>> +		goto out;
>>> +
>>> +	/* 
>>> +	 * FIXME for now we do it one at a time. Later we might do it like
>>> +	 * checkpoint_pids() for better performance since there can be many
>>> +	 * more epoll items than pids.
>>> +	 */
>> Defer the writing below ...
> 
> OK
> 
>>> +	cepi = ckpt_hdr_get(ctx, sizeof(*cepi));
>>> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
>>> +		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
>>> +		cepi->fd = epi->ffd.fd;
>>> +		cepi->events = epi->event.events;
>>> +		cepi->data = epi->event.data;
>>> +		if (ckpt_kwrite(ctx, cepi, sizeof(*cepi)) < 0)
>>> +			break;
>>> +	}
>>> +	_ckpt_hdr_put(ctx, cepi, sizeof(*cepi));
>>> +out:
>>> +	/*mutex_unlock(&ep->mtx);*/
>>> +	return rc;
>>> +}
>>> +
>>> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
>>> +			     struct ckpt_hdr_file *ptr)
>>> +{
>>> +	struct ckpt_hdr_file_eventpoll *h;
>>> +	struct eventpoll *ep;
>>> +	struct file *epfile;
>>> +	int epfd, error, i;
>>> +
>>> +	h = container_of(ptr, struct ckpt_hdr_file_eventpoll, common);
>>> +	if (h->common.h.type != CKPT_HDR_FILE_EPOLL ||
>>> +	    h->common.h.len < sizeof(*h) ||
>>> +	    h->common.f_type != CKPT_FILE_EPOLL)
>>> +		return ERR_PTR(-EINVAL);
>>> +
>>> +	/* limit max ep watches and the use of flags */
>>> +	if (h->num_items >= max_user_watches)
>>> +		return ERR_PTR(-ENOSPC);
>> This check should be against the sum of all epoll fd's per user.
>> It will take place elsewhere, and here it's incomplete and doesn't
>> add much.
> 
> Yeah, I noticed that too. Currently, I've got:
> 
>        /* Limit max ep watches. */
>         user = get_current_user();
>         remaining_watches = (max_user_watches -
>                              atomic_read(&user->epoll_watches));
>         free_uid(user);
>         if (h->num_items > remaining_watches)
>                 return ERR_PTR(-ENOSPC);
> 
> ep_insert() checks user->epoll_watches too. So even the original version
> would have failed eventually if the number of watches would have been
> exceeded.
> 
> The check in ep_insert() also means we'll properly enforce the limit even
> if we're running in parallel with other epoll file restores.
> 
> So really this check is redundant. I added it originally with the idea
> that I might not be able to use ep_insert() directly. Now I'm thinking
> it might be better to remove it entirely.

Note that by reusing code from epoll_ctl() (after refactoring),
you'll get all the standard checks, including this one, for free.

> 
>>> +	if (h->common.f_flags & ~EPOLL_CLOEXEC)
>>> +		return ERR_PTR(-EINVAL);
>>> +
>>> +	/* similar to epoll_create() */
>>> +	error = ep_alloc(&ep);
>>> +	if (error < 0)
>>> +		return ERR_PTR(error);
>>> +	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
>>> +				 ptr->f_flags & O_CLOEXEC);
>>> +	if (error < 0) {
>>> +		ep_free(ep);
>>> +		return ERR_PTR(error);
>>> +	}
>>> +
>>> +	/* Everything's allocated, we just need a file* */
>>> +	epfd = error;
>>> +	epfile = fget(epfd);
>>> +	BUG_ON(!epfile);
>> Is there a reason not to use sys_epoll_create(), or refactor it
>> and use the common code ?
> 
> Yeah, I'll reuse it.
> 
>>> +
>>> +	/* Restore the common file bits */
>>> +	i = 0;
>>> +	error = restore_file_common(ctx, epfile, ptr);
>>> +	if (error < 0)
>>> +		goto error_return;
>>> +
>>> +	/* Restore the epoll items/watches */
>>> +	for (; i < h->num_items; i++) {
>> Defer these ...
> 
> OK.
> 
>>> +		/* 
>>> +		 * Loop body like multiple epoll_ctl(ep, ADD, event)
>>> +		 * calls except we've already done much of the checking.
>>> +		 */
>>> +		struct ckpt_eventpoll_item cepi;
>>> +		struct epoll_event epev;
>>> +		struct epitem *epi;
>>> +		struct file *tfile;
>>> +
>>> +		error = ckpt_kread(ctx, &cepi, sizeof(cepi));
>>> +		if (error < 0) {
>>> +			i++;
>>> +			break;
>>> +		}
>>> +		epev.events = cepi.events;
>>> +		epev.data = cepi.data;
>> The code below is a duplicate of sys_epoll_ctl(). Is there a reason
>> not to use that code, or refactor it and share the common code ?
> 
> I certainly can't reuse it directly since it does a copy_from_user().
> Also I've already got the epoll file* and know the op. I'll look for a
> good way to factor common pieces from both sys_epoll_ctl() and
> ep_file_restore().
> 
>>> +
>>> +		/* Get the file* for the target file */
>>> +		error = -EFAULT;
>>> +		tfile = fget(cepi.fd);
>>> +		if (!tfile) {
>>> +			/*
>>> +			 * TODO delayed addition of epitem because 
>>> +			 * tfile hasn't been restored yet.
>>> +			 */
>>> +			continue;
>>> +		}
>>> +
>>> +		/* The target file descriptor must support poll */
>>> +		error = -EPERM;
>>> +		if (!tfile->f_op || !tfile->f_op->poll)
>>> +			goto error_tgt_fput;
>>> +
>>> +		/* Cannot add an epoll file descriptor inside itself. */
>>> +		error = -EINVAL;
>>> +		if (epfile == tfile)
>>> +			goto error_tgt_fput;
>>> +
>>> +		/* mutex_lock(&ep->mtx); TODO check if we need */
>> Yes, please.
> 
> OK.
> 
>>> +		epi = ep_find(ep, tfile, cepi.fd);
>>> +		if (!epi) {
>>> +			epev.events |= POLLERR | POLLHUP;
>>> +			error = ep_insert(ep, &epev, tfile, cepi.fd);
>>> +		} else
>>> +			error = -EEXIST;
>>> +		/* mutex_unlock(&ep->mtx); */
>>> +error_tgt_fput:
>>> +		fput(tfile);
>>> +		if (error < 0)
>>> +			break;
>>> +	}
>>> +error_return:
>>> +	/* Read the remaining number of items. */
>>> +	for (; i < h->num_items; i++) {
>>> +		struct ckpt_eventpoll_item cepi;
>>> +		ckpt_kread(ctx, &cepi, sizeof(cepi));
>>> +	}
>> What's the purpose of this ?
> 
> If we encounter an error we're left in the middle of the epoll item
> array. This effectively seeks to the end of the array. Not needed
> when deferring as you suggested since it changes the way we read the
> ckpt image..

Ok. I actually wondered what is the purpose of seeking to the end
of the array after you detect an error that would clearly abort the
restart ...

Thanks,

Oren.

> 
>>> +	if (error < 0) {
>>> +		/* TODO closeup epfile and epfd */
>>> +		fput(epfile);
>>> +		sys_close(epfd);
>> sys_close() should happen regardless of whether we succeeded or
>> failed.
> 
> OK, for some reason I thought restore_file_desc() tried fget() before
> resorting to attach_file()...
> 
> Thanks for the review.
> 
> Cheers,
> 	-Matt Helsley
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2009-07-24 17:22 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-07-14 23:54 [RFC][PATCH] checkpoint/restart: Add support for epoll Matt Helsley
     [not found] ` <20090714235405.GH5213-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-07-15  9:22   ` Cedric Le Goater
2009-07-24  8:35   ` Oren Laadan
     [not found]     ` <4A697242.60606-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-24 17:04       ` Matt Helsley
     [not found]         ` <20090724170401.GG5878-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-07-24 17:22           ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.