* [PATCH 2/2] Add checkpoint/restart support for epoll files.
@ 2009-09-28 19:01 Matt Helsley
[not found] ` <1254164482-2193-2-git-send-email-matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Matt Helsley @ 2009-09-28 19:01 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Save/restore epoll items during checkpoint/restart respectively.
kmalloc failures should be dealt with more kindly than just error-out
because epoll is made to poll many thousands of file descriptors.
Subsequent patches will change epoll c/r to "chunk" its output/input
respectively.
Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Changelog:
v4: ckpt-v18
Use files_deferq as submitted by Dan Smith
Cleanup to only report >= 1 items when debugging.
v3: [unposted]
Removed most of the TODOs -- the remainder will be removed by
subsequent patches.
Fixed missing ep_file_collect() [Serge]
Rather than include checkpoint_hdr.h declare (but do not define)
the two structs needed in eventpoll.h [Oren]
Complain with ckpt_write_err() when we detect checkpoint obj
leaks. [Oren]
Remove redundant is_epoll_file() check in collect. [Oren]
Move epfile_objref lookup to simplify error handling. [Oren]
Simplify error handling with early return in
ep_eventpoll_checkpoint(). [Oren]
Cleaned up a comment. [Oren]
Shorten CKPT_HDR_FILE_EPOLL_ITEMS (-FILE) [Oren]
Renumbered to indicate that it follows the file table.
Renamed the epoll struct in checkpoint_hdr.h [Oren]
Also renamed substruct.
Fixup return of empty ep_file_restore(). [Oren]
Changed some error returns. [Oren]
Changed some tests to BUG_ON(). [Oren]
Factored out watch insert with epoll_ctl() into do_epoll_ctl().
[Cedric, Oren]
---
checkpoint/files.c | 21 +++-
checkpoint/restart.c | 2 +-
checkpoint/sys.c | 1 -
fs/eventpoll.c | 310 ++++++++++++++++++++++++++++++++++++----
include/linux/checkpoint.h | 1 +
include/linux/checkpoint_hdr.h | 14 ++
include/linux/eventpoll.h | 17 ++-
7 files changed, 331 insertions(+), 35 deletions(-)
diff --git a/checkpoint/files.c b/checkpoint/files.c
index eac5f3b..0c9bba2 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -22,6 +22,8 @@
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+#include <linux/eventpoll.h>
#include <net/sock.h>
@@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
}
ret = deferqueue_run(ctx->files_deferq);
- ckpt_debug("files_deferq ran %d entries\n", ret);
- if (ret > 0)
+ if (ret > 0) {
+ ckpt_debug("file checkpoint deferred %d work items\n", ret);
ret = 0;
+ }
+
out:
kfree(fdtable);
return ret;
@@ -604,6 +608,13 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_TTY,
.restore = tty_file_restore,
},
+#ifdef CONFIG_EPOLL
+ {
+ .file_name = "EPOLL",
+ .file_type = CKPT_FILE_EPOLL,
+ .restore = ep_file_restore,
+ },
+#endif
};
static struct file *do_restore_file(struct ckpt_ctx *ctx)
@@ -731,9 +742,11 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
}
ret = deferqueue_run(ctx->files_deferq);
- ckpt_debug("files_deferq ran %d entries\n", ret);
- if (ret > 0)
+ if (ret > 0) {
+ ckpt_debug("file restore deferred %d work items\n", ret);
ret = 0;
+ }
+
out:
ckpt_hdr_put(ctx, h);
if (!ret) {
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 543b380..61b4921 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -193,7 +193,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
*
* Return: new buffer allocated on success, error pointer otherwise
*/
-static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
{
struct ckpt_hdr hh;
struct ckpt_hdr *h;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 76a3fa9..b8be421 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -251,7 +251,6 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
ctx->deferqueue = deferqueue_create();
if (!ctx->deferqueue)
goto err;
-
ctx->files_deferq = deferqueue_create();
if (!ctx->files_deferq)
goto err;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c0..cf3f309 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,12 @@
#include <asm/mman.h>
#include <asm/atomic.h>
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+#endif
+
/*
* LOCKING:
* There are three level of locking required by epoll :
@@ -671,10 +677,20 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
return pollflags != -1 ? pollflags : 0;
}
+#ifdef CONFIG_CHECKPOINT
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
+#else
+#define ep_eventpoll_checkpoint NULL
+#define ep_file_collect NULL
+#endif
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
- .poll = ep_eventpoll_poll
+ .poll = ep_eventpoll_poll,
+ .checkpoint = ep_eventpoll_checkpoint,
+ .collect = ep_file_collect,
};
/* Fast test to see if the file is an evenpoll file */
@@ -1226,35 +1242,18 @@ SYSCALL_DEFINE1(epoll_create, int, size)
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
- struct epoll_event __user *, event)
+int do_epoll_ctl(int op, int fd,
+ struct file *file, struct file *tfile,
+ struct epoll_event *epds)
{
int error;
- struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
- struct epoll_event epds;
-
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
- /* Get the "struct file *" for the eventpoll file */
- error = -EBADF;
- file = fget(epfd);
- if (!file)
- goto error_return;
-
- /* Get the "struct file *" for the target file */
- tfile = fget(fd);
- if (!tfile)
- goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
- goto error_tgt_fput;
+ return error;
/*
* We have to check that the file structure underneath the file descriptor
@@ -1263,7 +1262,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
*/
error = -EINVAL;
if (file == tfile || !is_file_epoll(file))
- goto error_tgt_fput;
+ return error;
/*
* At this point it is safe to assume that the "private_data" contains
@@ -1284,8 +1283,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
+ epds->events |= POLLERR | POLLHUP;
+ error = ep_insert(ep, epds, tfile, fd);
} else
error = -EEXIST;
break;
@@ -1297,15 +1296,46 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ epds->events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, epds);
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
-error_tgt_fput:
+ return error;
+}
+
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+ struct epoll_event __user *, event)
+{
+ int error;
+ struct file *file, *tfile;
+ struct epoll_event epds;
+
+ error = -EFAULT;
+ if (ep_op_has_event(op) &&
+ copy_from_user(&epds, event, sizeof(struct epoll_event)))
+ goto error_return;
+
+ /* Get the "struct file *" for the eventpoll file */
+ error = -EBADF;
+ file = fget(epfd);
+ if (!file)
+ goto error_return;
+
+ /* Get the "struct file *" for the target file */
+ tfile = fget(fd);
+ if (!tfile)
+ goto error_fput;
+
+ error = do_epoll_ctl(op, fd, file, tfile, &epds);
fput(tfile);
error_fput:
fput(file);
@@ -1413,6 +1443,230 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
#endif /* HAVE_SET_RESTORE_SIGMASK */
+#ifdef CONFIG_CHECKPOINT
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct rb_node *rbp;
+ struct eventpoll *ep;
+ int ret = 0;
+
+ ep = file->private_data;
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ struct epitem *epi;
+
+ epi = rb_entry(rbp, struct epitem, rbn);
+ ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ break;
+ }
+ mutex_unlock(&ep->mtx);
+ return ret;
+}
+
+struct epoll_deferq_entry {
+ struct ckpt_ctx *ctx;
+ struct file *epfile;
+};
+
+static int ep_items_checkpoint(void *data)
+{
+ struct epoll_deferq_entry *ep_dq_entry = data;
+ struct ckpt_ctx *ctx;
+ struct file *file;
+ struct ckpt_hdr_eventpoll_items *h;
+ struct rb_node *rbp;
+ struct eventpoll *ep;
+ __s32 epfile_objref;
+ int i, ret;
+
+ file = ep_dq_entry->epfile;
+ ctx = ep_dq_entry->ctx;
+
+ epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
+ BUG_ON(epfile_objref <= 0);
+
+
+ ep = file->private_data;
+ mutex_lock(&ep->mtx);
+ for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
+ mutex_unlock(&ep->mtx);
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
+ CKPT_HDR_EPOLL_ITEMS);
+ if (!h)
+ return -ENOMEM;
+
+ h->num_items = i;
+ h->epfile_objref = epfile_objref;
+
+ ret = 0;
+ mutex_lock(&ep->mtx);
+ for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
+ struct epitem *epi;
+ int objref;
+
+ epi = rb_entry(rbp, struct epitem, rbn);
+ objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+ if (objref <= 0) {
+ ret = -EBUSY; /* checkpoint obj leak */
+ break;
+ }
+ h->items[i].fd = epi->ffd.fd;
+ h->items[i].file_objref = objref;
+ h->items[i].events = epi->event.events;
+ h->items[i].data = epi->event.data;
+ }
+ mutex_unlock(&ep->mtx);
+ if (!ret && (i != h->num_items))
+ /*
+ * We raced with another thread between our first and second
+ * walks of the rbtree such that there weren't the same number
+ * of items. This means there is a checkpoint "leak".
+ */
+ ret = -EBUSY;
+ if (ret == -EBUSY)
+ ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
+ else if (!ret)
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, &h->h);
+ return ret;
+}
+
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct ckpt_hdr_file *h;
+ struct epoll_deferq_entry ep_dq_entry;
+ int ret = -ENOMEM;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ return -ENOMEM;
+ h->f_type = CKPT_FILE_EPOLL;
+ ret = checkpoint_file_common(ctx, file, h);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Defer saving the epoll items until all of the ffd.file pointers
+ * have an objref; after the file table has been checkpointed.
+ */
+ ep_dq_entry.ctx = ctx;
+ ep_dq_entry.epfile = file;
+ ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
+ sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
+out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int ep_items_restore(void *data)
+{
+ struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
+ struct ckpt_hdr_eventpoll_items *h;
+ struct eventpoll *ep;
+ struct file *epfile = NULL;
+ int ret, i = 0, remaining_watches;
+
+ h = ckpt_read_obj(ctx, 0,
+ sizeof(*h) + max_user_watches*sizeof(h->items[0]));
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ret = -EINVAL;
+ if ((h->h.type != CKPT_HDR_EPOLL_ITEMS) ||
+ (h->h.len < sizeof(*h)))
+ goto out;
+
+ /* Make sure the items match the size we expect */
+ if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
+ goto out;
+
+ epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
+ BUG_ON(IS_ERR(epfile));
+ BUG_ON(!is_file_epoll(epfile));
+
+ /* Make sure there are enough watches left. */
+ ret = -ENOSPC;
+ ep = epfile->private_data;
+ remaining_watches = (max_user_watches -
+ atomic_read(&ep->user->epoll_watches));
+ if (h->num_items > remaining_watches)
+ goto out;
+
+ ret = 0;
+ /* Restore the epoll items/watches */
+ for (i = 0; !ret && i < h->num_items; i++) {
+ struct epoll_event epev;
+ struct file *tfile;
+
+ /* Get the file* for the target file */
+ if (h->items[i].file_objref <= 0) {
+ ret = -EINVAL;
+ break;
+ }
+ tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
+ CKPT_OBJ_FILE);
+ if (IS_ERR(tfile)) {
+ ret = PTR_ERR(tfile);
+ break;
+ }
+
+ epev.events = h->items[i].events;
+ epev.data = h->items[i].data;
+
+ ret = do_epoll_ctl(EPOLL_CTL_ADD, h->items[i].fd,
+ epfile, tfile, &epev);
+ }
+out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *h)
+{
+ struct file *epfile;
+ int epfd, ret;
+
+ if (h->h.type != CKPT_HDR_FILE ||
+ h->h.len != sizeof(*h) ||
+ h->f_type != CKPT_FILE_EPOLL)
+ return ERR_PTR(-EINVAL);
+
+ epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
+ if (epfd < 0)
+ return ERR_PTR(epfd);
+ epfile = fget(epfd);
+ BUG_ON(!epfile);
+
+ /*
+ * Needed before we can properly restore the watches and enforce the
+ * limit on watch numbers.
+ */
+ ret = restore_file_common(ctx, epfile, h);
+ if (ret < 0)
+ goto fput_out;
+
+ /*
+ * Defer restoring the epoll items until the file table is
+ * fully restored. Ensures that valid file objrefs will resolve.
+ */
+ ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
+ if (ret < 0) {
+fput_out:
+ fput(epfile);
+ epfile = ERR_PTR(ret);
+ }
+ sys_close(epfd); /* harmless even if an error occured */
+ return epfile;
+}
+
+#endif /* CONFIG_CHECKPOINT */
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e00dd70..a8594cc 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -72,6 +72,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
void *ptr, int len, int type);
extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
extern int ckpt_read_payload(struct ckpt_ctx *ctx,
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 2ed523f..48736bd 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -85,6 +85,7 @@ enum {
CKPT_HDR_PIPE_BUF,
CKPT_HDR_TTY,
CKPT_HDR_TTY_LDISC,
+ CKPT_HDR_EPOLL_ITEMS = 391, /* Follows file-table */
CKPT_HDR_MM = 401,
CKPT_HDR_VMA,
@@ -380,6 +381,7 @@ enum file_type {
CKPT_FILE_FIFO,
CKPT_FILE_SOCKET,
CKPT_FILE_TTY,
+ CKPT_FILE_EPOLL,
CKPT_FILE_MAX
};
@@ -475,6 +477,18 @@ struct ckpt_hdr_file_socket {
__s32 sock_objref;
} __attribute__((aligned(8)));
+struct ckpt_hdr_eventpoll_items {
+ struct ckpt_hdr h;
+ __s32 epfile_objref;
+ __u32 num_items;
+ struct ckpt_eventpoll_item {
+ __u64 data;
+ __u32 fd;
+ __s32 file_objref;
+ __u32 events;
+ } items[0];
+} __attribute__((aligned(8)));
+
/* memory layout */
struct ckpt_hdr_mm {
struct ckpt_hdr h;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..34538be 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -56,6 +56,9 @@ struct file;
#ifdef CONFIG_EPOLL
+struct ckpt_ctx;
+struct ckpt_hdr_file;
+
/* Used to initialize the epoll bits inside the "struct file" */
static inline void eventpoll_init_file(struct file *file)
@@ -95,11 +98,23 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
-#else
+#ifdef CONFIG_CHECKPOINT
+extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *h);
+#endif
+#else
+/* !defined(CONFIG_EPOLL) */
static inline void eventpoll_init_file(struct file *file) {}
static inline void eventpoll_release(struct file *file) {}
+#ifdef CONFIG_CHECKPOINT
+static inline struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif
#endif
#endif /* #ifdef __KERNEL__ */
--
1.5.6.3
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <1254164482-2193-2-git-send-email-matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-29 20:33 ` Serge E. Hallyn
2009-09-29 22:56 ` Oren Laadan
1 sibling, 0 replies; 12+ messages in thread
From: Serge E. Hallyn @ 2009-09-29 20:33 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Quoting Matt Helsley (matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> Save/restore epoll items during checkpoint/restart respectively.
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file descriptors.
> Subsequent patches will change epoll c/r to "chunk" its output/input
> respectively.
>
> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Haven't looked much at epoll, but it looks good...
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h)
> +{
> + struct file *epfile;
> + int epfd, ret;
> +
> + if (h->h.type != CKPT_HDR_FILE ||
> + h->h.len != sizeof(*h) ||
> + h->f_type != CKPT_FILE_EPOLL)
> + return ERR_PTR(-EINVAL);
> +
> + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> + if (epfd < 0)
> + return ERR_PTR(epfd);
> + epfile = fget(epfd);
> + BUG_ON(!epfile);
> +
> + /*
> + * Needed before we can properly restore the watches and enforce the
> + * limit on watch numbers.
> + */
> + ret = restore_file_common(ctx, epfile, h);
> + if (ret < 0)
> + goto fput_out;
> +
> + /*
> + * Defer restoring the epoll items until the file table is
> + * fully restored. Ensures that valid file objrefs will resolve.
> + */
> + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
> + if (ret < 0) {
> +fput_out:
I've heard complaints before about labels in the middle of an
if block. Since we know ret < 0 at the goto, the label could
just as well go up one line...
> + fput(epfile);
> + epfile = ERR_PTR(ret);
> + }
> + sys_close(epfd); /* harmless even if an error occured */
> + return epfile;
> +}
> +
> +#endif /* CONFIG_CHECKPOINT */
> +
> static int __init eventpoll_init(void)
> {
> struct sysinfo si;
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index e00dd70..a8594cc 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -72,6 +72,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
> void *ptr, int len, int type);
> extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
> extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
> +extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
> extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
> extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
> extern int ckpt_read_payload(struct ckpt_ctx *ctx,
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 2ed523f..48736bd 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -85,6 +85,7 @@ enum {
> CKPT_HDR_PIPE_BUF,
> CKPT_HDR_TTY,
> CKPT_HDR_TTY_LDISC,
> + CKPT_HDR_EPOLL_ITEMS = 391, /* Follows file-table */
Hmm, why are you specifying the number here? That's just
begging for conflicts with other people's patches...
>
> CKPT_HDR_MM = 401,
> CKPT_HDR_VMA,
> @@ -380,6 +381,7 @@ enum file_type {
> CKPT_FILE_FIFO,
> CKPT_FILE_SOCKET,
> CKPT_FILE_TTY,
> + CKPT_FILE_EPOLL,
> CKPT_FILE_MAX
> };
>
> @@ -475,6 +477,18 @@ struct ckpt_hdr_file_socket {
> __s32 sock_objref;
> } __attribute__((aligned(8)));
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <1254164482-2193-2-git-send-email-matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 20:33 ` Serge E. Hallyn
@ 2009-09-29 22:56 ` Oren Laadan
[not found] ` <4AC29086.8080407-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: Oren Laadan @ 2009-09-29 22:56 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Matt Helsley wrote:
> Save/restore epoll items during checkpoint/restart respectively.
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file descriptors.
> Subsequent patches will change epoll c/r to "chunk" its output/input
> respectively.
[...]
>
> @@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> }
>
> ret = deferqueue_run(ctx->files_deferq);
> - ckpt_debug("files_deferq ran %d entries\n", ret);
> - if (ret > 0)
> + if (ret > 0) {
> + ckpt_debug("file checkpoint deferred %d work items\n", ret);
> ret = 0;
> + }
> +
With your permission, I'll do this hunk as a separate patch (and
the restore counterpart too). So you can remove from your patch.
> out:
> kfree(fdtable);
> return ret;
> @@ -604,6 +608,13 @@ static struct restore_file_ops restore_file_ops[] = {
> .file_type = CKPT_FILE_TTY,
> .restore = tty_file_restore,
> },
> +#ifdef CONFIG_EPOLL
> + {
> + .file_name = "EPOLL",
> + .file_type = CKPT_FILE_EPOLL,
> + .restore = ep_file_restore,
> + },
> +#endif
> };
>
> static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -731,9 +742,11 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
> }
>
> ret = deferqueue_run(ctx->files_deferq);
> - ckpt_debug("files_deferq ran %d entries\n", ret);
> - if (ret > 0)
> + if (ret > 0) {
> + ckpt_debug("file restore deferred %d work items\n", ret);
> ret = 0;
> + }
> +
(This part too).
> out:
> ckpt_hdr_put(ctx, h);
> if (!ret) {
[...]
>
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int ret = 0;
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + struct epitem *epi;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
s/ckpt_obj_collect/ckpt_collect_file/
> + if (ret < 0)
> + break;
> + }
> + mutex_unlock(&ep->mtx);
> + return ret;
> +}
> +
> +struct epoll_deferq_entry {
> + struct ckpt_ctx *ctx;
> + struct file *epfile;
> +};
> +
> +static int ep_items_checkpoint(void *data)
> +{
> + struct epoll_deferq_entry *ep_dq_entry = data;
> + struct ckpt_ctx *ctx;
> + struct file *file;
> + struct ckpt_hdr_eventpoll_items *h;
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + __s32 epfile_objref;
> + int i, ret;
> +
> + file = ep_dq_entry->epfile;
> + ctx = ep_dq_entry->ctx;
> +
> + epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> + BUG_ON(epfile_objref <= 0);
> +
> +
Nit: extraneous newline :p
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> + mutex_unlock(&ep->mtx);
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> + CKPT_HDR_EPOLL_ITEMS);
> + if (!h)
> + return -ENOMEM;
> +
> + h->num_items = i;
> + h->epfile_objref = epfile_objref;
> +
> + ret = 0;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> + struct epitem *epi;
> + int objref;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (objref <= 0) {
(Should never return 0)
> + ret = -EBUSY; /* checkpoint obj leak */
> + break;
> + }
> + h->items[i].fd = epi->ffd.fd;
> + h->items[i].file_objref = objref;
> + h->items[i].events = epi->event.events;
> + h->items[i].data = epi->event.data;
> + }
> + mutex_unlock(&ep->mtx);
> + if (!ret && (i != h->num_items))
> + /*
> + * We raced with another thread between our first and second
Not a thread (cannot be a thread, because we checkpoint whole thread
groups). Any process who shared access to this file pointer.
> + * walks of the rbtree such that there weren't the same number
> + * of items. This means there is a checkpoint "leak".
> + */
> + ret = -EBUSY;
> + if (ret == -EBUSY)
> + ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
It would be useful (for the user) to know which fd/item caused the
leak, or whether the leak was because i != h->num_items (You could
use two ckpt_write_err(), one inside the mutex and on here).
Also, the prototype of ckpt_write_err() has changed, see the code.
> + else if (!ret)
> + ret = ckpt_write_obj(ctx, &h->h);
What other value could @ret have ?
> + ckpt_hdr_put(ctx, &h->h);
> + return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct ckpt_hdr_file *h;
> + struct epoll_deferq_entry ep_dq_entry;
> + int ret = -ENOMEM;
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> + if (!h)
> + return -ENOMEM;
> + h->f_type = CKPT_FILE_EPOLL;
> + ret = checkpoint_file_common(ctx, file, h);
> + if (ret < 0)
> + goto out;
> + ret = ckpt_write_obj(ctx, &h->h);
> + if (ret < 0)
> + goto out;
> +
> + /*
> + * Defer saving the epoll items until all of the ffd.file pointers
> + * have an objref; after the file table has been checkpointed.
> + */
> + ep_dq_entry.ctx = ctx;
> + ep_dq_entry.epfile = file;
> + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> + struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
> + struct ckpt_hdr_eventpoll_items *h;
> + struct eventpoll *ep;
> + struct file *epfile = NULL;
> + int ret, i = 0, remaining_watches;
> +
> + h = ckpt_read_obj(ctx, 0,
> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
s/ckpt_read_obj/ckpt_read_buf_type/
Saves you the @type and @len test below.
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> +
> + ret = -EINVAL;
> + if ((h->h.type != CKPT_HDR_EPOLL_ITEMS) ||
> + (h->h.len < sizeof(*h)))
> + goto out;
> +
> + /* Make sure the items match the size we expect */
> + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> + goto out;
> +
> + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> + BUG_ON(IS_ERR(epfile));
> + BUG_ON(!is_file_epoll(epfile));
> +
> + /* Make sure there are enough watches left. */
> + ret = -ENOSPC;
> + ep = epfile->private_data;
> + remaining_watches = (max_user_watches -
> + atomic_read(&ep->user->epoll_watches));
> + if (h->num_items > remaining_watches)
> + goto out;
I unsure if this is necessary: running out of watches will be
detected anyway later on. I wouldn't worry about early detection.
> +
> + ret = 0;
> + /* Restore the epoll items/watches */
> + for (i = 0; !ret && i < h->num_items; i++) {
> + struct epoll_event epev;
> + struct file *tfile;
> +
> + /* Get the file* for the target file */
> + if (h->items[i].file_objref <= 0) {
> + ret = -EINVAL;
> + break;
> + }
I should probably change the code elsewhere too, but this test
is unnecessary because ckpt_obj_fetch() will complain anyway.
> + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> + CKPT_OBJ_FILE);
> + if (IS_ERR(tfile)) {
> + ret = PTR_ERR(tfile);
> + break;
> + }
> +
> + epev.events = h->items[i].events;
> + epev.data = h->items[i].data;
> +
> + ret = do_epoll_ctl(EPOLL_CTL_ADD, h->items[i].fd,
> + epfile, tfile, &epev);
> + }
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h)
> +{
> + struct file *epfile;
> + int epfd, ret;
> +
> + if (h->h.type != CKPT_HDR_FILE ||
> + h->h.len != sizeof(*h) ||
> + h->f_type != CKPT_FILE_EPOLL)
> + return ERR_PTR(-EINVAL);
> +
> + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> + if (epfd < 0)
> + return ERR_PTR(epfd);
> + epfile = fget(epfd);
> + BUG_ON(!epfile);
> +
> + /*
> + * Needed before we can properly restore the watches and enforce the
> + * limit on watch numbers.
> + */
> + ret = restore_file_common(ctx, epfile, h);
> + if (ret < 0)
> + goto fput_out;
> +
> + /*
> + * Defer restoring the epoll items until the file table is
> + * fully restored. Ensures that valid file objrefs will resolve.
> + */
> + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
> + if (ret < 0) {
> +fput_out:
> + fput(epfile);
> + epfile = ERR_PTR(ret);
> + }
> + sys_close(epfd); /* harmless even if an error occured */
Nit: you can move this up to right after fget().
[...]
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC29086.8080407-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-29 23:31 ` Oren Laadan
2009-10-02 9:30 ` Matt Helsley
1 sibling, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2009-09-29 23:31 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Oren Laadan wrote:
>
> Matt Helsley wrote:
>> Save/restore epoll items during checkpoint/restart respectively.
>> kmalloc failures should be dealt with more kindly than just error-out
>> because epoll is made to poll many thousands of file descriptors.
>> Subsequent patches will change epoll c/r to "chunk" its output/input
>> respectively.
>
> [...]
>
>>
>> @@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
>> }
>>
>> ret = deferqueue_run(ctx->files_deferq);
>> - ckpt_debug("files_deferq ran %d entries\n", ret);
>> - if (ret > 0)
>> + if (ret > 0) {
>> + ckpt_debug("file checkpoint deferred %d work items\n", ret);
>> ret = 0;
>> + }
>> +
>
> With your permission, I'll do this hunk as a separate patch (and
> the restore counterpart too). So you can remove from your patch.
I take that back. I prefer the previous debug, that give info
whether the call succeeded or not...
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC29086.8080407-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-29 23:31 ` Oren Laadan
@ 2009-10-02 9:30 ` Matt Helsley
[not found] ` <20091002093050.GA4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: Matt Helsley @ 2009-10-02 9:30 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Tue, Sep 29, 2009 at 06:56:06PM -0400, Oren Laadan wrote:
>
>
> Matt Helsley wrote:
> > Save/restore epoll items during checkpoint/restart respectively.
> > kmalloc failures should be dealt with more kindly than just error-out
> > because epoll is made to poll many thousands of file descriptors.
> > Subsequent patches will change epoll c/r to "chunk" its output/input
> > respectively.
>
> [...]
>
> >
> > @@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> > }
> >
> > ret = deferqueue_run(ctx->files_deferq);
> > - ckpt_debug("files_deferq ran %d entries\n", ret);
> > - if (ret > 0)
> > + if (ret > 0) {
> > + ckpt_debug("file checkpoint deferred %d work items\n", ret);
> > ret = 0;
> > + }
> > +
>
> With your permission, I'll do this hunk as a separate patch (and
> the restore counterpart too). So you can remove from your patch.
Removed.
<snip>
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + int ret = 0;
> > +
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> > + struct epitem *epi;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
>
> s/ckpt_obj_collect/ckpt_collect_file/
OK. Seems like we might recurse here though if the file is an epoll
file and there's a cycle in the epoll sets (e.g. epoll set A includes the fd
for epoll set B, epoll set B includes the fd for epoll set A).
So I think I'll do:
if (is_file_epoll(epi->ffd.file))
continue; /* Don't recurse */
ret = ckpt_collect_file(ctx, epi->ffd.file);
That works properly since epoll files are anonymous, can't be mmap'd, and
hence must be reachable from the file descriptor table.
I've also started a testcase to cover this scenario.
>
> > + if (ret < 0)
> > + break;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + return ret;
> > +}
> > +
> > +struct epoll_deferq_entry {
> > + struct ckpt_ctx *ctx;
> > + struct file *epfile;
> > +};
> > +
> > +static int ep_items_checkpoint(void *data)
> > +{
> > + struct epoll_deferq_entry *ep_dq_entry = data;
> > + struct ckpt_ctx *ctx;
> > + struct file *file;
> > + struct ckpt_hdr_eventpoll_items *h;
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + __s32 epfile_objref;
> > + int i, ret;
> > +
> > + file = ep_dq_entry->epfile;
> > + ctx = ep_dq_entry->ctx;
> > +
> > + epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> > + BUG_ON(epfile_objref <= 0);
> > +
> > +
>
> Nit: extraneous newline :p
Removed
>
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> > + mutex_unlock(&ep->mtx);
> > +
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> > + CKPT_HDR_EPOLL_ITEMS);
> > + if (!h)
> > + return -ENOMEM;
> > +
> > + h->num_items = i;
> > + h->epfile_objref = epfile_objref;
> > +
> > + ret = 0;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> > + struct epitem *epi;
> > + int objref;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> > + if (objref <= 0) {
>
> (Should never return 0)
So? I don't think that implies the test should be changed.
>
> > + ret = -EBUSY; /* checkpoint obj leak */
> > + break;
> > + }
> > + h->items[i].fd = epi->ffd.fd;
> > + h->items[i].file_objref = objref;
> > + h->items[i].events = epi->event.events;
> > + h->items[i].data = epi->event.data;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + if (!ret && (i != h->num_items))
> > + /*
> > + * We raced with another thread between our first and second
>
> Not a thread (cannot be a thread, because we checkpoint whole thread
> groups). Any process who shared access to this file pointer.
Good point I'll s/thread/task/ here.
>
> > + * walks of the rbtree such that there weren't the same number
> > + * of items. This means there is a checkpoint "leak".
> > + */
> > + ret = -EBUSY;
> > + if (ret == -EBUSY)
> > + ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
>
> It would be useful (for the user) to know which fd/item caused the
> leak, or whether the leak was because i != h->num_items (You could
> use two ckpt_write_err(), one inside the mutex and on here).
That's a good idea. Note the "fd" in the item isn't necessarily what's
in the fd table and the index "i" isn't a stable indicator of where we
are in the epoll set. So I'm going to have it print out the path to the
"file" rather than either of those things.
>
> Also, the prototype of ckpt_write_err() has changed, see the code.
I noticed that -- that's why I put in the "". However I see that it's
much more complicated than the prototype implied. Here's my idiotic
thought process:
extern int ckpt_write_err(struct ckpt_ctx *ctx, char *ptr, char *fmt,
...);
Gosh it looks like an arbitrary string with the incredibly redundant
name "ptr" which tells me absolutely nothing about what goes there.
Now that you've pointed this out I've dug a little deeper. Calling it the
"pre-format" string is also not very informative -- I can guess that it
goes before "fmt" based on its position in the arguments. The
best information comes from the comment above __ckpt_generate_fmt():
* PREFMT is constructed from @prefmt by subtituting format snippets
* according to the contents of @prefmt. The format characters in
* @prefmt can be E (error), O (objref), P (pointer), S (string) and
* V (variable/symbol). For example, E will generate a "err %d" in
* PREFMT (see prefmt_array below).
...
In other words it has a very specific encoding which neither the
prototype nor the function itself hint at or bother to describe. I had to
dig down through the vararg layers to find that description.
There needs to be information near the prototype in checkpoint.h and near
the function itself which shouts to anyone reading it that "ptr" is some
oddly-encoded string.
It would also be better if each prefmt element required a % before it -- then
reviewers would probably recognize it as a "second" format string with
strange conversion specifications (%E, %O, %P, %V, %S).
I can't help but think there's more improvement possible. Some less
well-thought-out ideas:
It might be even better to remove the prefmt string and have ckpt_write_err
variants for specialized areas of the code. If area "foo" could use an
err, objref, and file flags you might define:
int ckpt_write_foo_err(struct ckpt_ctx *ctx, int err, __s32 objref, int
flags, char *fmt, ...);
Otherwise I wonder if it would be better to join the prefmt and fmt
strings by just defining our own, new, conversion specifiers.
>
> > + else if (!ret)
> > + ret = ckpt_write_obj(ctx, &h->h);
>
> What other value could @ret have ?
None as the code stands right now. Maybe I should change it to something
like:
if (ret) {
ckpt_write_err(...);
} else
ckpt_write_obj(...);
That should handle any errors -- not just -EBUSY -- and avoid an extra
test. Actually I've looked into the error paths and done a complete
rewrite of them to use ckpt_write_err() properly and more verbosely.
It's more expensive and complex though.
>
> > + ckpt_hdr_put(ctx, &h->h);
> > + return ret;
> > +}
> > +
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct ckpt_hdr_file *h;
> > + struct epoll_deferq_entry ep_dq_entry;
> > + int ret = -ENOMEM;
> > +
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> > + if (!h)
> > + return -ENOMEM;
> > + h->f_type = CKPT_FILE_EPOLL;
> > + ret = checkpoint_file_common(ctx, file, h);
> > + if (ret < 0)
> > + goto out;
> > + ret = ckpt_write_obj(ctx, &h->h);
> > + if (ret < 0)
> > + goto out;
> > +
> > + /*
> > + * Defer saving the epoll items until all of the ffd.file pointers
> > + * have an objref; after the file table has been checkpointed.
> > + */
> > + ep_dq_entry.ctx = ctx;
> > + ep_dq_entry.epfile = file;
> > + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> > + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > + return ret;
> > +}
> > +
> > +static int ep_items_restore(void *data)
> > +{
> > + struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
> > + struct ckpt_hdr_eventpoll_items *h;
> > + struct eventpoll *ep;
> > + struct file *epfile = NULL;
> > + int ret, i = 0, remaining_watches;
> > +
> > + h = ckpt_read_obj(ctx, 0,
> > + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
>
> s/ckpt_read_obj/ckpt_read_buf_type/
> Saves you the @type and @len test below.
Good. That means I can get rid of the portion of the patch that removed
"static" from ckpt_read_obj too...
Again, the prototype in checkpoint.h is misleading. It says the second
argument is "len", not "max" (or better: "maxlen"). Naming is critical --
even for prototype parameters in headers.
I think I see a bug in either ckpt_read_obj or its kerneldocs:
/**
* ckpt_read_obj - allocate and read an object (ckpt_hdr followed by
* payload)
* @ctx: checkpoint context
* @h: object descriptor
* @len: desired payload length (if 0, flexible)
* @max: maximum payload length
*
* Return: new buffer allocated on success, error pointer otherwise
*/
static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
{
struct ckpt_hdr hh;
struct ckpt_hdr *h;
int ret;
again:
ret = ckpt_kread(ctx, &hh, sizeof(hh));
if (ret < 0)
return ERR_PTR(ret);
_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
hh.type, hh.len, len, max);
if (hh.len < sizeof(*h))
return ERR_PTR(-EINVAL);
if (hh.type == CKPT_HDR_OBJREF) {
ret = _ckpt_read_objref(ctx, &hh);
if (ret < 0)
return ERR_PTR(ret);
goto again;
}
/* if len specified, enforce, else if maximum specified, enforce */
if ((len && hh.len != len) || (!len && max && hh.len > max))
return ERR_PTR(-EINVAL);
The last test really should be:
if ((len && hh.len != len) ||
(!len && max && (hh.len - sizeof(*h)) > max))
return ERR_PTR(-EINVAL);
Otherwise it's not really the maximum payload length and the "bug" is
just the comment.
>
> > + if (IS_ERR(h))
> > + return PTR_ERR(h);
> > +
> > + ret = -EINVAL;
> > + if ((h->h.type != CKPT_HDR_EPOLL_ITEMS) ||
> > + (h->h.len < sizeof(*h)))
> > + goto out;
> > +
> > + /* Make sure the items match the size we expect */
> > + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> > + goto out;
> > +
> > + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> > + BUG_ON(IS_ERR(epfile));
> > + BUG_ON(!is_file_epoll(epfile));
> > +
> > + /* Make sure there are enough watches left. */
> > + ret = -ENOSPC;
> > + ep = epfile->private_data;
> > + remaining_watches = (max_user_watches -
> > + atomic_read(&ep->user->epoll_watches));
> > + if (h->num_items > remaining_watches)
> > + goto out;
>
> I unsure if this is necessary: running out of watches will be
> detected anyway later on. I wouldn't worry about early detection.
I'm tired of arguing over this one so OK, I've removed it.
>
> > +
> > + ret = 0;
> > + /* Restore the epoll items/watches */
> > + for (i = 0; !ret && i < h->num_items; i++) {
> > + struct epoll_event epev;
> > + struct file *tfile;
> > +
> > + /* Get the file* for the target file */
> > + if (h->items[i].file_objref <= 0) {
> > + ret = -EINVAL;
> > + break;
> > + }
>
> I should probably change the code elsewhere too, but this test
> is unnecessary because ckpt_obj_fetch() will complain anyway.
I don't think it will complain since I don't see anything in the read or hash
code that checks for negative objrefs. However moving this into
ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
kfree() does, so I'll remove this test.
>
> > + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> > + CKPT_OBJ_FILE);
> > + if (IS_ERR(tfile)) {
> > + ret = PTR_ERR(tfile);
> > + break;
> > + }
> > +
> > + epev.events = h->items[i].events;
> > + epev.data = h->items[i].data;
> > +
> > + ret = do_epoll_ctl(EPOLL_CTL_ADD, h->items[i].fd,
> > + epfile, tfile, &epev);
> > + }
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > + return ret;
> > +}
> > +
> > +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > + struct ckpt_hdr_file *h)
> > +{
> > + struct file *epfile;
> > + int epfd, ret;
> > +
> > + if (h->h.type != CKPT_HDR_FILE ||
> > + h->h.len != sizeof(*h) ||
> > + h->f_type != CKPT_FILE_EPOLL)
> > + return ERR_PTR(-EINVAL);
> > +
> > + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> > + if (epfd < 0)
> > + return ERR_PTR(epfd);
> > + epfile = fget(epfd);
> > + BUG_ON(!epfile);
> > +
> > + /*
> > + * Needed before we can properly restore the watches and enforce the
> > + * limit on watch numbers.
> > + */
> > + ret = restore_file_common(ctx, epfile, h);
> > + if (ret < 0)
> > + goto fput_out;
> > +
> > + /*
> > + * Defer restoring the epoll items until the file table is
> > + * fully restored. Ensures that valid file objrefs will resolve.
> > + */
> > + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
> > + if (ret < 0) {
> > +fput_out:
> > + fput(epfile);
> > + epfile = ERR_PTR(ret);
> > + }
> > + sys_close(epfd); /* harmless even if an error occured */
>
> Nit: you can move this up to right after fget().
Good idea.
Thanks for the review.
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <20091002093050.GA4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-10-02 18:22 ` Oren Laadan
[not found] ` <4AC644C8.2070905-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Oren Laadan @ 2009-10-02 18:22 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Matt Helsley wrote:
> On Tue, Sep 29, 2009 at 06:56:06PM -0400, Oren Laadan wrote:
>>
>> Matt Helsley wrote:
>>> Save/restore epoll items during checkpoint/restart respectively.
>>> kmalloc failures should be dealt with more kindly than just error-out
>>> because epoll is made to poll many thousands of file descriptors.
>>> Subsequent patches will change epoll c/r to "chunk" its output/input
>>> respectively.
[...]
>>> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
>>> + struct epitem *epi;
>>> +
>>> + epi = rb_entry(rbp, struct epitem, rbn);
>>> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
>> s/ckpt_obj_collect/ckpt_collect_file/
>
> OK. Seems like we might recurse here though if the file is an epoll
> file and there's a cycle in the epoll sets (e.g. epoll set A includes the fd
> for epoll set B, epoll set B includes the fd for epoll set A).
> So I think I'll do:
I doubt if an epoll file can epoll an epoll file ...
(eg. see line 1265 in fs/eventpoll.c)
>
> if (is_file_epoll(epi->ffd.file))
> continue; /* Don't recurse */
> ret = ckpt_collect_file(ctx, epi->ffd.file);
>
> That works properly since epoll files are anonymous, can't be mmap'd, and
> hence must be reachable from the file descriptor table.
>
> I've also started a testcase to cover this scenario.
>
[...]
>>> + * walks of the rbtree such that there weren't the same number
>>> + * of items. This means there is a checkpoint "leak".
>>> + */
>>> + ret = -EBUSY;
>>> + if (ret == -EBUSY)
>>> + ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
>> It would be useful (for the user) to know which fd/item caused the
>> leak, or whether the leak was because i != h->num_items (You could
>> use two ckpt_write_err(), one inside the mutex and on here).
>
> That's a good idea. Note the "fd" in the item isn't necessarily what's
> in the fd table and the index "i" isn't a stable indicator of where we
> are in the epoll set. So I'm going to have it print out the path to the
> "file" rather than either of those things.
>
Good point. Then please note both - the (original) fd may be a
useful fact for the developer trying to understand what failed.
>> Also, the prototype of ckpt_write_err() has changed, see the code.
>
> I noticed that -- that's why I put in the "". However I see that it's
> much more complicated than the prototype implied. Here's my idiotic
> thought process:
>
> extern int ckpt_write_err(struct ckpt_ctx *ctx, char *ptr, char *fmt,
> ...);
>
> Gosh it looks like an arbitrary string with the incredibly redundant
> name "ptr" which tells me absolutely nothing about what goes there.
>
> Now that you've pointed this out I've dug a little deeper. Calling it the
> "pre-format" string is also not very informative -- I can guess that it
> goes before "fmt" based on its position in the arguments. The
> best information comes from the comment above __ckpt_generate_fmt():
>
> * PREFMT is constructed from @prefmt by subtituting format snippets
> * according to the contents of @prefmt. The format characters in
> * @prefmt can be E (error), O (objref), P (pointer), S (string) and
> * V (variable/symbol). For example, E will generate a "err %d" in
> * PREFMT (see prefmt_array below).
> ...
>
> In other words it has a very specific encoding which neither the
> prototype nor the function itself hint at or bother to describe. I had to
> dig down through the vararg layers to find that description.
>
> There needs to be information near the prototype in checkpoint.h and near
> the function itself which shouts to anyone reading it that "ptr" is some
> oddly-encoded string.
You are correct. Will add.
>
> It would also be better if each prefmt element required a % before it -- then
> reviewers would probably recognize it as a "second" format string with
> strange conversion specifications (%E, %O, %P, %V, %S).
Adding "%" sounds useful. I'll do that.
>
> I can't help but think there's more improvement possible. Some less
> well-thought-out ideas:
>
> It might be even better to remove the prefmt string and have ckpt_write_err
> variants for specialized areas of the code. If area "foo" could use an
> err, objref, and file flags you might define:
>
> int ckpt_write_foo_err(struct ckpt_ctx *ctx, int err, __s32 objref, int
> flags, char *fmt, ...);
I thought about it, but wished to avoid proliferation of these
variants.
>
> Otherwise I wonder if it would be better to join the prefmt and fmt
> strings by just defining our own, new, conversion specifiers.
>
Thought about it, but that would limit us in the conversion specifiers
that we can use, because many are used by printf already. Perhaps one
way to do it is use
"%Zxyz"
where "%Z" tells the following letters indicate a ckpt_write_err()
format, and then "xyz" are the letters from the current @prefmt.
Thoughts ?
[...]
>>> +
>>> + h = ckpt_read_obj(ctx, 0,
>>> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
>> s/ckpt_read_obj/ckpt_read_buf_type/
>> Saves you the @type and @len test below.
>
> Good. That means I can get rid of the portion of the patch that removed
> "static" from ckpt_read_obj too...
>
> Again, the prototype in checkpoint.h is misleading. It says the second
> argument is "len", not "max" (or better: "maxlen"). Naming is critical --
> even for prototype parameters in headers.
Yes, I forgot to update the prototype in the header.
>
> I think I see a bug in either ckpt_read_obj or its kerneldocs:
>
> /**
> * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by
> * payload)
> * @ctx: checkpoint context
> * @h: object descriptor
> * @len: desired payload length (if 0, flexible)
> * @max: maximum payload length
> *
> * Return: new buffer allocated on success, error pointer otherwise
> */
> static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> {
> struct ckpt_hdr hh;
> struct ckpt_hdr *h;
> int ret;
>
> again:
> ret = ckpt_kread(ctx, &hh, sizeof(hh));
> if (ret < 0)
> return ERR_PTR(ret);
> _ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
> hh.type, hh.len, len, max);
> if (hh.len < sizeof(*h))
> return ERR_PTR(-EINVAL);
>
> if (hh.type == CKPT_HDR_OBJREF) {
> ret = _ckpt_read_objref(ctx, &hh);
> if (ret < 0)
> return ERR_PTR(ret);
> goto again;
> }
>
> /* if len specified, enforce, else if maximum specified, enforce */
> if ((len && hh.len != len) || (!len && max && hh.len > max))
> return ERR_PTR(-EINVAL);
>
> The last test really should be:
>
> if ((len && hh.len != len) ||
> (!len && max && (hh.len - sizeof(*h)) > max))
> return ERR_PTR(-EINVAL);
>
> Otherwise it's not really the maximum payload length and the "bug" is
> just the comment.
The bug is in the comment: @max is intended to tell the max (total)
buffer size returned to the user. Will fix.
[...]
>>> + ret = 0;
>>> + /* Restore the epoll items/watches */
>>> + for (i = 0; !ret && i < h->num_items; i++) {
>>> + struct epoll_event epev;
>>> + struct file *tfile;
>>> +
>>> + /* Get the file* for the target file */
>>> + if (h->items[i].file_objref <= 0) {
>>> + ret = -EINVAL;
>>> + break;
>>> + }
>> I should probably change the code elsewhere too, but this test
>> is unnecessary because ckpt_obj_fetch() will complain anyway.
>
> I don't think it will complain since I don't see anything in the read or hash
> code that checks for negative objrefs. However moving this into
What is "this" that you want to move ?
> ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
> kfree() does, so I'll remove this test.
ckpt_obj_fetch() won't complain about a negative value a-priori, but
the search is certain to fail. Nevertheless, I'll tighten the restart
related calls in objhash to ensure that.
[...]
>
> Thanks for the review.
Thanks for the cross-review :)
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC644C8.2070905-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-10-02 19:38 ` Oren Laadan
[not found] ` <4AC656A8.8070103-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-02 20:27 ` Matt Helsley
1 sibling, 1 reply; 12+ messages in thread
From: Oren Laadan @ 2009-10-02 19:38 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
>>> I should probably change the code elsewhere too, but this test
>>> is unnecessary because ckpt_obj_fetch() will complain anyway.
>> I don't think it will complain since I don't see anything in the read or hash
>> code that checks for negative objrefs. However moving this into
>
> What is "this" that you want to move ?
>
>> ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
>> kfree() does, so I'll remove this test.
>
> ckpt_obj_fetch() won't complain about a negative value a-priori, but
> the search is certain to fail. Nevertheless, I'll tighten the restart
> related calls in objhash to ensure that.
I take it back: ckpt_obj_fetch() returns -EINVAL when an objref isn't
found, not that the original value was invalid.
Oren
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC644C8.2070905-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-02 19:38 ` Oren Laadan
@ 2009-10-02 20:27 ` Matt Helsley
[not found] ` <20091002202726.GC4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
1 sibling, 1 reply; 12+ messages in thread
From: Matt Helsley @ 2009-10-02 20:27 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Fri, Oct 02, 2009 at 02:22:00PM -0400, Oren Laadan wrote:
>
>
> Matt Helsley wrote:
> > On Tue, Sep 29, 2009 at 06:56:06PM -0400, Oren Laadan wrote:
> >>
> >> Matt Helsley wrote:
> >>> Save/restore epoll items during checkpoint/restart respectively.
> >>> kmalloc failures should be dealt with more kindly than just error-out
> >>> because epoll is made to poll many thousands of file descriptors.
> >>> Subsequent patches will change epoll c/r to "chunk" its output/input
> >>> respectively.
>
> [...]
>
> >>> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> >>> + struct epitem *epi;
> >>> +
> >>> + epi = rb_entry(rbp, struct epitem, rbn);
> >>> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> >> s/ckpt_obj_collect/ckpt_collect_file/
> >
> > OK. Seems like we might recurse here though if the file is an epoll
> > file and there's a cycle in the epoll sets (e.g. epoll set A includes the fd
> > for epoll set B, epoll set B includes the fd for epoll set A).
> > So I think I'll do:
>
> I doubt if an epoll file can epoll an epoll file ...
> (eg. see line 1265 in fs/eventpoll.c)
I am certain it can. The testcase I have proves it and the man page mentions
it too. Since it has a poll op I believe it can be epoll'd (line 691 of
fs/eventpoll.c).
I haven't double checked, but I believe the line you cited just prevents the
epoll set from including its own fd.
<snip>
> >>> + ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
> >> It would be useful (for the user) to know which fd/item caused the
> >> leak, or whether the leak was because i != h->num_items (You could
> >> use two ckpt_write_err(), one inside the mutex and on here).
> >
> > That's a good idea. Note the "fd" in the item isn't necessarily what's
> > in the fd table and the index "i" isn't a stable indicator of where we
> > are in the epoll set. So I'm going to have it print out the path to the
> > "file" rather than either of those things.
> >
>
> Good point. Then please note both - the (original) fd may be a
> useful fact for the developer trying to understand what failed.
True. I suppose if the fd is inconsistent with the file's path then
they'll have enough information to eventually figure it out.
<snip>
> > I can't help but think there's more improvement possible. Some less
> > well-thought-out ideas:
> >
> > It might be even better to remove the prefmt string and have ckpt_write_err
> > variants for specialized areas of the code. If area "foo" could use an
> > err, objref, and file flags you might define:
> >
> > int ckpt_write_foo_err(struct ckpt_ctx *ctx, int err, __s32 objref, int
> > flags, char *fmt, ...);
>
> I thought about it, but wished to avoid proliferation of these
> variants.
Makes sense.
> > Otherwise I wonder if it would be better to join the prefmt and fmt
> > strings by just defining our own, new, conversion specifiers.
> >
>
> Thought about it, but that would limit us in the conversion specifiers
> that we can use, because many are used by printf already. Perhaps one
> way to do it is use
> "%Zxyz"
> where "%Z" tells the following letters indicate a ckpt_write_err()
> format, and then "xyz" are the letters from the current @prefmt.
Would %( work better? The ( suggests that there's more to it.
<snip>
> > I think I see a bug in either ckpt_read_obj or its kerneldocs:
> >
> > /**
> > * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by
> > * payload)
> > * @ctx: checkpoint context
> > * @h: object descriptor
> > * @len: desired payload length (if 0, flexible)
> > * @max: maximum payload length
> > *
> > * Return: new buffer allocated on success, error pointer otherwise
> > */
> > static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> > {
> > struct ckpt_hdr hh;
> > struct ckpt_hdr *h;
> > int ret;
> >
> > again:
> > ret = ckpt_kread(ctx, &hh, sizeof(hh));
> > if (ret < 0)
> > return ERR_PTR(ret);
> > _ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
> > hh.type, hh.len, len, max);
> > if (hh.len < sizeof(*h))
> > return ERR_PTR(-EINVAL);
> >
> > if (hh.type == CKPT_HDR_OBJREF) {
> > ret = _ckpt_read_objref(ctx, &hh);
> > if (ret < 0)
> > return ERR_PTR(ret);
> > goto again;
> > }
> >
> > /* if len specified, enforce, else if maximum specified, enforce */
> > if ((len && hh.len != len) || (!len && max && hh.len > max))
> > return ERR_PTR(-EINVAL);
> >
> > The last test really should be:
> >
> > if ((len && hh.len != len) ||
> > (!len && max && (hh.len - sizeof(*h)) > max))
> > return ERR_PTR(-EINVAL);
> >
> > Otherwise it's not really the maximum payload length and the "bug" is
> > just the comment.
>
> The bug is in the comment: @max is intended to tell the max (total)
> buffer size returned to the user. Will fix.
OK. That means I need to fix my code too then. Thanks for fixing these
up.
>
> [...]
>
> >>> + ret = 0;
> >>> + /* Restore the epoll items/watches */
> >>> + for (i = 0; !ret && i < h->num_items; i++) {
> >>> + struct epoll_event epev;
> >>> + struct file *tfile;
> >>> +
> >>> + /* Get the file* for the target file */
> >>> + if (h->items[i].file_objref <= 0) {
> >>> + ret = -EINVAL;
> >>> + break;
> >>> + }
> >> I should probably change the code elsewhere too, but this test
> >> is unnecessary because ckpt_obj_fetch() will complain anyway.
> >
> > I don't think it will complain since I don't see anything in the read or hash
> > code that checks for negative objrefs. However moving this into
>
> What is "this" that you want to move ?
I thought it would be necessary to move the < 0 test above into the
objhash code because...
>
> > ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
> > kfree() does, so I'll remove this test.
>
> ckpt_obj_fetch() won't complain about a negative value a-priori, but
> the search is certain to fail. Nevertheless, I'll tighten the restart
> related calls in objhash to ensure that.
Hmm, I didn't see how it was certain to fail. I guess I'll have to look
again.
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC656A8.8070103-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-10-02 20:36 ` Matt Helsley
[not found] ` <20091002203609.GD4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Matt Helsley @ 2009-10-02 20:36 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Fri, Oct 02, 2009 at 03:38:16PM -0400, Oren Laadan wrote:
>
> >>> I should probably change the code elsewhere too, but this test
> >>> is unnecessary because ckpt_obj_fetch() will complain anyway.
> >> I don't think it will complain since I don't see anything in the read or hash
> >> code that checks for negative objrefs. However moving this into
> >
> > What is "this" that you want to move ?
> >
> >> ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
> >> kfree() does, so I'll remove this test.
> >
> > ckpt_obj_fetch() won't complain about a negative value a-priori, but
> > the search is certain to fail. Nevertheless, I'll tighten the restart
> > related calls in objhash to ensure that.
>
> I take it back: ckpt_obj_fetch() returns -EINVAL when an objref isn't
> found, not that the original value was invalid.
Right, so then the question is whether it's possible to insert a negative
objref by modifying the checkpoint image before restart. As far as I can tell
that will work. Are you saying we don't need to care about that?
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <20091002203609.GD4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-10-02 21:18 ` Oren Laadan
0 siblings, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2009-10-02 21:18 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Matt Helsley wrote:
> On Fri, Oct 02, 2009 at 03:38:16PM -0400, Oren Laadan wrote:
>>>>> I should probably change the code elsewhere too, but this test
>>>>> is unnecessary because ckpt_obj_fetch() will complain anyway.
>>>> I don't think it will complain since I don't see anything in the read or hash
>>>> code that checks for negative objrefs. However moving this into
>>> What is "this" that you want to move ?
>>>
>>>> ckpt_obj_fetch() would get rid of alot of code much like passing NULL into
>>>> kfree() does, so I'll remove this test.
>>> ckpt_obj_fetch() won't complain about a negative value a-priori, but
>>> the search is certain to fail. Nevertheless, I'll tighten the restart
>>> related calls in objhash to ensure that.
>> I take it back: ckpt_obj_fetch() returns -EINVAL when an objref isn't
>> found, not that the original value was invalid.
>
> Right, so then the question is whether it's possible to insert a negative
> objref by modifying the checkpoint image before restart. As far as I can tell
> that will work. Are you saying we don't need to care about that?
You are correct, and I said I'll fix that ("Nevertheless..."). Fix
is queued already for ckpt-v18-dev.
I took back the incorrect suggestion that it's ok to _always_ let
ckpt_obj_fetch() do the work, as it depends on the context:
- if you use it to fetch an object you expect to already have been
inserted, then it's good enough.
- if you use it to fetch an object to decide if it's the first
encounter of that object, then you need to explicitly test for
invalid @objref before calling ckpt_obj_fetch().
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <20091002202726.GC4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-10-02 21:22 ` Oren Laadan
[not found] ` <4AC66F08.6050405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Oren Laadan @ 2009-10-02 21:22 UTC (permalink / raw)
To: Matt Helsley; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Matt Helsley wrote:
[...]
>>>> s/ckpt_obj_collect/ckpt_collect_file/
>>> OK. Seems like we might recurse here though if the file is an epoll
>>> file and there's a cycle in the epoll sets (e.g. epoll set A includes the fd
>>> for epoll set B, epoll set B includes the fd for epoll set A).
>>> So I think I'll do:
>> I doubt if an epoll file can epoll an epoll file ...
>> (eg. see line 1265 in fs/eventpoll.c)
>
> I am certain it can. The testcase I have proves it and the man page mentions
> it too. Since it has a poll op I believe it can be epoll'd (line 691 of
> fs/eventpoll.c).
>
> I haven't double checked, but I believe the line you cited just prevents the
> epoll set from including its own fd.
Duh ... you are totally correct. Sorry for the noise.
[...]
>>> Otherwise I wonder if it would be better to join the prefmt and fmt
>>> strings by just defining our own, new, conversion specifiers.
>>>
>> Thought about it, but that would limit us in the conversion specifiers
>> that we can use, because many are used by printf already. Perhaps one
>> way to do it is use
>> "%Zxyz"
>> where "%Z" tells the following letters indicate a ckpt_write_err()
>> format, and then "xyz" are the letters from the current @prefmt.
>
> Would %( work better? The ( suggests that there's more to it.
Sounds cool. Care to work out the details with a patch ?
Oren.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
[not found] ` <4AC66F08.6050405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-10-02 21:35 ` Matt Helsley
0 siblings, 0 replies; 12+ messages in thread
From: Matt Helsley @ 2009-10-02 21:35 UTC (permalink / raw)
To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Fri, Oct 02, 2009 at 05:22:16PM -0400, Oren Laadan wrote:
>
>
> Matt Helsley wrote:
>
> [...]
>
> >>>> s/ckpt_obj_collect/ckpt_collect_file/
> >>> OK. Seems like we might recurse here though if the file is an epoll
> >>> file and there's a cycle in the epoll sets (e.g. epoll set A includes the fd
> >>> for epoll set B, epoll set B includes the fd for epoll set A).
> >>> So I think I'll do:
> >> I doubt if an epoll file can epoll an epoll file ...
> >> (eg. see line 1265 in fs/eventpoll.c)
> >
> > I am certain it can. The testcase I have proves it and the man page mentions
> > it too. Since it has a poll op I believe it can be epoll'd (line 691 of
> > fs/eventpoll.c).
> >
> > I haven't double checked, but I believe the line you cited just prevents the
> > epoll set from including its own fd.
>
> Duh ... you are totally correct. Sorry for the noise.
No problem -- whoever's right or wrong, it's always good to have folks
who are trying to help check these things.
>
> [...]
>
> >>> Otherwise I wonder if it would be better to join the prefmt and fmt
> >>> strings by just defining our own, new, conversion specifiers.
> >>>
> >> Thought about it, but that would limit us in the conversion specifiers
> >> that we can use, because many are used by printf already. Perhaps one
> >> way to do it is use
> >> "%Zxyz"
> >> where "%Z" tells the following letters indicate a ckpt_write_err()
> >> format, and then "xyz" are the letters from the current @prefmt.
> >
> > Would %( work better? The ( suggests that there's more to it.
>
> Sounds cool. Care to work out the details with a patch ?
I've got enough on my plate for now -- I've got to update and send out
the next epoll patch. I've got the user-cr kernel header patches to send
out. And I'm looking at what to do for unlinked files, inotify, eventfd,
timerfd, and signalfd (all but the first two should be even simpler than
epoll).
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2009-10-02 21:35 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-28 19:01 [PATCH 2/2] Add checkpoint/restart support for epoll files Matt Helsley
[not found] ` <1254164482-2193-2-git-send-email-matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 20:33 ` Serge E. Hallyn
2009-09-29 22:56 ` Oren Laadan
[not found] ` <4AC29086.8080407-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-29 23:31 ` Oren Laadan
2009-10-02 9:30 ` Matt Helsley
[not found] ` <20091002093050.GA4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-10-02 18:22 ` Oren Laadan
[not found] ` <4AC644C8.2070905-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-02 19:38 ` Oren Laadan
[not found] ` <4AC656A8.8070103-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-02 20:36 ` Matt Helsley
[not found] ` <20091002203609.GD4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-10-02 21:18 ` Oren Laadan
2009-10-02 20:27 ` Matt Helsley
[not found] ` <20091002202726.GC4189-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-10-02 21:22 ` Oren Laadan
[not found] ` <4AC66F08.6050405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-02 21:35 ` Matt Helsley
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.