* [PATCH 1/2] Check for valid destructor pointer before calling it.
@ 2009-08-20 5:17 Matt Helsley
[not found] ` <6d896e12f316e5ff5f44bd13ae1482dab7f9253a.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 11+ messages in thread
From: Matt Helsley @ 2009-08-20 5:17 UTC (permalink / raw)
To: Containers
Not every use of deferqueue will have a destructor function so
we need to check it before calling it.
Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
kernel/deferqueue.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/kernel/deferqueue.c b/kernel/deferqueue.c
index 3fb388b..1204c8b 100644
--- a/kernel/deferqueue.c
+++ b/kernel/deferqueue.c
@@ -53,7 +53,8 @@ void deferqueue_destroy(struct deferqueue_head *h)
pr_debug("%s: freeing non-empty queue\n", __func__);
list_for_each_entry_safe(dq, n, &h->list, list) {
- dq->destructor(dq->data);
+ if (dq->destructor)
+ dq->destructor(dq->data);
list_del(&dq->list);
kfree(dq);
}
--
1.5.6.3
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <6d896e12f316e5ff5f44bd13ae1482dab7f9253a.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-08-20 5:17 ` Matt Helsley
[not found] ` <e820d207a3b1a3228ded29bce9b29f2179ab598b.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-20 12:28 ` [PATCH 1/2] Check for valid destructor pointer before calling it Oren Laadan
1 sibling, 1 reply; 11+ messages in thread
From: Matt Helsley @ 2009-08-20 5:17 UTC (permalink / raw)
To: Containers
Save/restore epoll items during checkpoint/restart respectively.
Tests for the cr_tests suite to follow. Tests pass on i386.
TODOs (search the patch for "TODO") that could probably use some
comments:
What to do when there's a "possible checkpoint obj leak"? (search patch
for this string to see what I'm talking about)
Ensure get_current_user will be correct (a userns question/issue?).
kmalloc failures should be dealt with more kindly than just error-out
because epoll is made to poll many thousands of file
descriptors.
This seems like a more general problem with some of the
ckpt_hdr* functions than an epoll problem but...
Pick better errnos for some cases.
Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
---
checkpoint/files.c | 35 +++++
checkpoint/restart.c | 2 +-
fs/eventpoll.c | 280 +++++++++++++++++++++++++++++++++++++-
include/linux/checkpoint.h | 1 +
include/linux/checkpoint_hdr.h | 14 ++
include/linux/checkpoint_types.h | 2 +
include/linux/eventpoll.h | 14 ++-
7 files changed, 345 insertions(+), 3 deletions(-)
diff --git a/checkpoint/files.c b/checkpoint/files.c
index 204055b..8f86dcc 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,8 @@
#include <linux/syscalls.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+#include <linux/eventpoll.h>
#include <net/sock.h>
@@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
goto out;
ckpt_debug("nfds %d\n", nfds);
+ ctx->files_deferq = deferqueue_create();
+ if (!ctx->files_deferq) {
+ ret = -ENOMEM;
+ goto out;
+ }
for (n = 0; n < nfds; n++) {
ret = checkpoint_file_desc(ctx, files, fdtable[n]);
if (ret < 0)
break;
}
+ if (!ret) {
+ ret = deferqueue_run(ctx->files_deferq);
+ if (ret > 0) {
+ pr_warning("c/r: files deferqueue had %d entries\n", ret);
+ ret = 0;
+ }
+ }
+ deferqueue_destroy(ctx->files_deferq);
out:
kfree(fdtable);
return ret;
@@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_SOCKET,
.restore = sock_file_restore,
},
+#ifdef CONFIG_EPOLL
+ /* epoll */
+ {
+ .file_name = "EPOLL",
+ .file_type = CKPT_FILE_EPOLL,
+ .restore = ep_file_restore,
+ },
+#endif
};
static struct file *do_restore_file(struct ckpt_ctx *ctx)
@@ -692,11 +715,23 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
if (ret < 0)
goto out;
+ ret = -ENOMEM;
+ ctx->files_deferq = deferqueue_create();
+ if (!ctx->files_deferq)
+ goto out;
for (i = 0; i < h->fdt_nfds; i++) {
ret = restore_file_desc(ctx);
if (ret < 0)
break;
}
+ if (!ret) {
+ ret = deferqueue_run(ctx->files_deferq);
+ if (ret > 0) {
+ pr_warning("c/r: files deferqueue had %d entries\n", ret);
+ ret = 0;
+ }
+ }
+ deferqueue_destroy(ctx->files_deferq);
out:
ckpt_hdr_put(ctx, h);
if (!ret) {
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 4fdae78..3a7d914 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
*
* Return: new buffer allocated on success, error pointer otherwise
*/
-static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
{
struct ckpt_hdr hh;
struct ckpt_hdr *h;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c0..7f7070f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
return pollflags != -1 ? pollflags : 0;
}
+#ifdef CONFIG_CHECKPOINT
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
+#else
+#define ep_eventpoll_checkpoint NULL
+#endif
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
- .poll = ep_eventpoll_poll
+ .poll = ep_eventpoll_poll,
+ .checkpoint = ep_eventpoll_checkpoint,
+ .collect = ep_file_collect,
};
/* Fast test to see if the file is an evenpoll file */
@@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
#endif /* HAVE_SET_RESTORE_SIGMASK */
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct rb_node *rbp;
+ struct eventpoll *ep;
+ int ret = 0;
+
+ if (!is_file_epoll(file))
+ return 0;
+
+ ep = file->private_data;
+ mutex_lock(&ep->mtx);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ struct epitem *epi;
+
+ epi = rb_entry(rbp, struct epitem, rbn);
+ ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ break;
+ }
+ mutex_unlock(&ep->mtx);
+ return ret;
+}
+
+struct epoll_deferq_entry {
+ struct ckpt_ctx *ctx;
+ struct file *epfile;
+};
+
+static int ep_items_checkpoint(void *data)
+{
+ struct epoll_deferq_entry *ep_dq_entry = data;
+ struct ckpt_ctx *ctx;
+ struct file *file;
+ struct ckpt_eventpoll_items *h;
+ struct rb_node *rbp;
+ struct eventpoll *ep;
+ int i, ret = -ENOMEM;
+
+ file = ep_dq_entry->epfile;
+ ctx = ep_dq_entry->ctx;
+
+ ep = file->private_data;
+ mutex_lock(&ep->mtx);
+ for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
+ mutex_unlock(&ep->mtx);
+
+ /* TODO likely allocation failure when lots of epoll items */
+ h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
+ CKPT_HDR_FILE_EPOLL_ITEMS);
+ if (!h)
+ goto out;
+
+ ret = -ENODEV;
+ h->num_items = i;
+ h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
+ if (h->epfile_objref <= 0)
+ goto out;
+
+ ret = 0;
+ mutex_lock(&ep->mtx);
+ for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
+ struct epitem *epi;
+ int objref;
+
+ epi = rb_entry(rbp, struct epitem, rbn);
+ objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+ if (objref <= 0) {
+ /* TODO error -- possible checkpoint obj leak */
+ ret = -ENODEV;
+ break;
+ }
+ h->items[i].fd = epi->ffd.fd;
+ h->items[i].file_objref = objref;
+ h->items[i].events = epi->event.events;
+ h->items[i].data = epi->event.data;
+ }
+ mutex_unlock(&ep->mtx);
+ if (h && !ret)
+ ret = ckpt_write_obj(ctx, &h->h);
+ if (!ret && (i != h->num_items)) {
+ /* TODO error -- possible checkpoint obj leak */
+ }
+out:
+ if (h)
+ ckpt_hdr_put(ctx, &h->h);
+ return ret;
+}
+
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct ckpt_hdr_file *h;
+ struct epoll_deferq_entry ep_dq_entry;
+ int ret = -ENOMEM;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ goto out_print;
+ h->f_type = CKPT_FILE_EPOLL;
+ ret = checkpoint_file_common(ctx, file, h);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, &h->h);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Defer saving the epoll items until all of the ffd.file pointers
+ * have an objref; after the file table has been checkpointed.
+ */
+ ep_dq_entry.ctx = ctx;
+ ep_dq_entry.epfile = file;
+ ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
+ sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
+out:
+ ckpt_hdr_put(ctx, h);
+out_print:
+ return ret;
+}
+
+static int ep_items_restore(void *data)
+{
+ struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
+ struct ckpt_eventpoll_items *h;
+ struct eventpoll *ep;
+ struct file *epfile = NULL;
+ int ret, i = 0, remaining_watches;
+
+ /*
+ * TODO possible kmalloc failure due to too many watches.
+ */
+ h = ckpt_read_obj(ctx, 0,
+ sizeof(*h) + max_user_watches*sizeof(h->items[0]));
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ret = -EINVAL;
+ if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
+ (h->h.len < sizeof(*h)))
+ goto out;
+
+ /* Make sure the items match the size we expect */
+ if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
+ goto out;
+
+ epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(epfile)) {
+ ret = PTR_ERR(epfile);
+ goto out;
+ }
+ ret = -ENOMSG;
+ if (!is_file_epoll(epfile))
+ goto out;
+
+ ep = epfile->private_data;
+
+ ret = -ENOSPC;
+ remaining_watches = (max_user_watches -
+ atomic_read(&ep->user->epoll_watches));
+ if (h->num_items > remaining_watches)
+ goto out;
+
+ ret = 0;
+ /* Restore the epoll items/watches */
+ for (i = 0; !ret && i < h->num_items; i++) {
+ /*
+ * Loop body like multiple epoll_ctl(ep, ADD, event)
+ * calls except we've already done much of the checking.
+ */
+ struct epoll_event epev;
+ struct epitem *epi;
+ struct file *tfile;
+
+ epev.events = h->items[i].events;
+ epev.data = h->items[i].data;
+
+ /* Get the file* for the target file */
+ if (h->items[i].file_objref <= 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
+ CKPT_OBJ_FILE);
+ if (IS_ERR(tfile)) {
+ ret = PTR_ERR(tfile);
+ break;
+ }
+
+ /* The target file must support poll */
+ if (!tfile->f_op || !tfile->f_op->poll) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Cannot add an epoll file descriptor inside itself. */
+ if (epfile == tfile) {
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_lock(&ep->mtx);
+ epi = ep_find(ep, tfile, h->items[i].fd);
+ if (!epi) {
+ epev.events |= POLLERR | POLLHUP;
+ ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
+ } else
+ ret = -EEXIST;
+ mutex_unlock(&ep->mtx);
+ }
+out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/* TODO confirm that get_current_user() has been restored */
+struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *h)
+{
+ struct file *epfile;
+ int epfd, ret;
+
+ if (h->h.type != CKPT_HDR_FILE ||
+ h->h.len != sizeof(*h) ||
+ h->f_type != CKPT_FILE_EPOLL)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * TODO Normally h->f_flags contains flags that epoll_create() won't
+ * accept. Right now we pass only those flags it will accept here
+ * and restore the rest during the "common" file restore. Check
+ * to make sure we're not missing anything.
+ */
+ epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
+ if (epfd < 0)
+ return ERR_PTR(epfd);
+ epfile = fget(epfd);
+ if (!epfile)
+ return ERR_PTR(-ENOENT); /* TODO pick better error? */
+
+ ret = restore_file_common(ctx, epfile, h);
+ if (ret < 0)
+ goto fput_out;
+
+ /*
+ * Now we have the file and file descriptor but the epoll set is empty.
+ * Defer restoring the epoll set until we encounter its corresponding
+ * items. Note that this effectively counts the number of
+ * ckpt_eventpoll_items blocks we should expect -- we rely on the
+ * epfile_objref of those blocks to associate them with the proper
+ * file.
+ */
+ ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
+ ep_items_restore, NULL);
+ if (ret < 0) {
+fput_out:
+ fput(epfile);
+ epfile = ERR_PTR(ret);
+ }
+ sys_close(epfd);
+ return epfile;
+}
+
+#endif /* CONFIG_CHECKPOINT */
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 761cad5..053c6c0 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -69,6 +69,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
extern int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len);
extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
extern int ckpt_read_payload(struct ckpt_ctx *ctx,
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 4d5c22a..3a3e530 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,7 @@ enum {
CKPT_HDR_FILE_NAME,
CKPT_HDR_FILE,
CKPT_HDR_PIPE_BUF,
+ CKPT_HDR_FILE_EPOLL_ITEMS, /* Follows file-table */
CKPT_HDR_MM = 401,
CKPT_HDR_VMA,
@@ -342,6 +343,7 @@ enum file_type {
CKPT_FILE_PIPE,
CKPT_FILE_FIFO,
CKPT_FILE_SOCKET,
+ CKPT_FILE_EPOLL,
CKPT_FILE_MAX
};
@@ -426,6 +428,18 @@ struct ckpt_hdr_file_socket {
struct ckpt_hdr_socket socket;
} __attribute__((aligned(8)));
+struct ckpt_eventpoll_items {
+ struct ckpt_hdr h;
+ __s32 epfile_objref;
+ __u32 num_items;
+ struct {
+ __u64 data;
+ __u32 fd;
+ __s32 file_objref;
+ __u32 events;
+ } items[0];
+} __attribute__((aligned(8)));
+
struct ckpt_hdr_utsns {
struct ckpt_hdr h;
char sysname[__NEW_UTS_LEN + 1];
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index e98251b..51cdd0c 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -48,6 +48,8 @@ struct ckpt_ctx {
struct ckpt_obj_hash *obj_hash; /* repository for shared objects */
struct deferqueue_head *deferqueue; /* queue of deferred work */
+ struct deferqueue_head *files_deferq; /* deferred work to do after
+ saving file table */
struct path fs_mnt; /* container root (FIXME) */
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..ff3de38 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -95,11 +95,23 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *h);
+#endif
#else
static inline void eventpoll_init_file(struct file *file) {}
static inline void eventpoll_release(struct file *file) {}
-
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+static inline struct file* ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr)
+{
+ return NULL;
+}
+#endif
#endif
#endif /* #ifdef __KERNEL__ */
--
1.5.6.3
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 1/2] Check for valid destructor pointer before calling it.
[not found] ` <6d896e12f316e5ff5f44bd13ae1482dab7f9253a.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-20 5:17 ` [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files Matt Helsley
@ 2009-08-20 12:28 ` Oren Laadan
1 sibling, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2009-08-20 12:28 UTC (permalink / raw)
To: Matt Helsley; +Cc: Containers
Matt Helsley wrote:
> Not every use of deferqueue will have a destructor function so
> we need to check it before calling it.
>
> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Yup.
Acked-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> ---
> kernel/deferqueue.c | 3 ++-
> 1 files changed, 2 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/deferqueue.c b/kernel/deferqueue.c
> index 3fb388b..1204c8b 100644
> --- a/kernel/deferqueue.c
> +++ b/kernel/deferqueue.c
> @@ -53,7 +53,8 @@ void deferqueue_destroy(struct deferqueue_head *h)
>
> pr_debug("%s: freeing non-empty queue\n", __func__);
> list_for_each_entry_safe(dq, n, &h->list, list) {
> - dq->destructor(dq->data);
> + if (dq->destructor)
> + dq->destructor(dq->data);
> list_del(&dq->list);
> kfree(dq);
> }
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <e820d207a3b1a3228ded29bce9b29f2179ab598b.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-08-20 16:33 ` Oren Laadan
[not found] ` <4A8D7AE2.1030705-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-08-24 21:27 ` Serge E. Hallyn
1 sibling, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2009-08-20 16:33 UTC (permalink / raw)
To: Matt Helsley; +Cc: Containers
Hi Matt,
I like the approach of the patch and use of deferqueue.
See comments below.
Matt Helsley wrote:
> Save/restore epoll items during checkpoint/restart respectively.
>
> Tests for the cr_tests suite to follow. Tests pass on i386.
>
> TODOs (search the patch for "TODO") that could probably use some
> comments:
>
> What to do when there's a "possible checkpoint obj leak"? (search patch
> for this string to see what I'm talking about)
Generally, complain with ckpt_write_err() and abort.
>
> Ensure get_current_user will be correct (a userns question/issue?).
>
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file
> descriptors.
> This seems like a more general problem with some of the
> ckpt_hdr* functions than an epoll problem but...
This is handled with pids by writing in chunks (only on checkpoint
side).
There is now better infrastructure to do chunks on restart
side too - first read the header's header (struct ckpt_hdr) without
the buffer, allocate a local buffer, followed by ckpt_read()s.
Or add ckpt_read_chunk() that will also allocate the buffer for
the next chunk. Or even a loop: ckpt_read_chunk_{start,next,end}() ?
>
> Pick better errnos for some cases.
>
> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
> ---
> checkpoint/files.c | 35 +++++
> checkpoint/restart.c | 2 +-
> fs/eventpoll.c | 280 +++++++++++++++++++++++++++++++++++++-
> include/linux/checkpoint.h | 1 +
> include/linux/checkpoint_hdr.h | 14 ++
> include/linux/checkpoint_types.h | 2 +
> include/linux/eventpoll.h | 14 ++-
> 7 files changed, 345 insertions(+), 3 deletions(-)
>
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 204055b..8f86dcc 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,8 @@
> #include <linux/syscalls.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +#include <linux/eventpoll.h>
> #include <net/sock.h>
>
>
> @@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> goto out;
>
> ckpt_debug("nfds %d\n", nfds);
> + ctx->files_deferq = deferqueue_create();
> + if (!ctx->files_deferq) {
> + ret = -ENOMEM;
> + goto out;
> + }
You can create/destroy the deferqueue once, in ckpt_ctx_alloc()
and ckpt_ctx_free(), respectively. It will also simplify the logic
here.
> for (n = 0; n < nfds; n++) {
> ret = checkpoint_file_desc(ctx, files, fdtable[n]);
> if (ret < 0)
> break;
If you s/break/goto out/ here, you can get rid if the 'if' clause
below (assuming ctx->files_deferq is alloc/free in ckpt_ctx_...)
> }
> + if (!ret) {
> + ret = deferqueue_run(ctx->files_deferq);
> + if (ret > 0) {
> + pr_warning("c/r: files deferqueue had %d entries\n", ret);
ckpt_debug(...); ?
> + ret = 0;
> + }
> + }
> + deferqueue_destroy(ctx->files_deferq);
> out:
> kfree(fdtable);
> return ret;
> @@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
> .file_type = CKPT_FILE_SOCKET,
> .restore = sock_file_restore,
> },
> +#ifdef CONFIG_EPOLL
> + /* epoll */
> + {
> + .file_name = "EPOLL",
> + .file_type = CKPT_FILE_EPOLL,
> + .restore = ep_file_restore,
> + },
> +#endif
> };
>
> static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -692,11 +715,23 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
> if (ret < 0)
> goto out;
>
> + ret = -ENOMEM;
> + ctx->files_deferq = deferqueue_create();
> + if (!ctx->files_deferq)
> + goto out;
Ditto for alloc/free of ctx->files_deferq.
> for (i = 0; i < h->fdt_nfds; i++) {
> ret = restore_file_desc(ctx);
> if (ret < 0)
> break;
Ditto for s/break/goto out/ to simplify logic.
> }
> + if (!ret) {
> + ret = deferqueue_run(ctx->files_deferq);
> + if (ret > 0) {
> + pr_warning("c/r: files deferqueue had %d entries\n", ret);
ckpt_debug(...); ?
> + ret = 0;
> + }
> + }
> + deferqueue_destroy(ctx->files_deferq);
> out:
> ckpt_hdr_put(ctx, h);
> if (!ret) {
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 4fdae78..3a7d914 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
> *
> * Return: new buffer allocated on success, error pointer otherwise
> */
> -static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> +void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> {
> struct ckpt_hdr hh;
> struct ckpt_hdr *h;
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 085c5c0..7f7070f 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
> return pollflags != -1 ? pollflags : 0;
> }
>
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
> +#else
> +#define ep_eventpoll_checkpoint NULL
> +#endif
> +
> /* File callbacks that implement the eventpoll file behaviour */
> static const struct file_operations eventpoll_fops = {
> .release = ep_eventpoll_release,
> - .poll = ep_eventpoll_poll
> + .poll = ep_eventpoll_poll,
> + .checkpoint = ep_eventpoll_checkpoint,
> + .collect = ep_file_collect,
> };
>
> /* Fast test to see if the file is an evenpoll file */
> @@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
>
> #endif /* HAVE_SET_RESTORE_SIGMASK */
>
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int ret = 0;
> +
> + if (!is_file_epoll(file))
> + return 0;
This should never happen, right ?
Perhaps BUG_ON() instead ?
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + struct epitem *epi;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (ret < 0)
> + break;
> + }
> + mutex_unlock(&ep->mtx);
> + return ret;
> +}
> +
> +struct epoll_deferq_entry {
> + struct ckpt_ctx *ctx;
> + struct file *epfile;
> +};
I suggest that:
1) Add 'int fd' here,
2) in ckpt_eventpoll_items use the fd (instead of file objref)
The motivation is security: if, on restart, you rely on a file objref,
then malicious user can change the file objref and the restart logic
may end up populating some other process's event poll items.
Instead, by using an fd, we ensure that the restarting process will
only modify a file that it actually owns.
> +
> +static int ep_items_checkpoint(void *data)
> +{
> + struct epoll_deferq_entry *ep_dq_entry = data;
> + struct ckpt_ctx *ctx;
> + struct file *file;
> + struct ckpt_eventpoll_items *h;
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int i, ret = -ENOMEM;
> +
> + file = ep_dq_entry->epfile;
> + ctx = ep_dq_entry->ctx;
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> + mutex_unlock(&ep->mtx);
> +
> + /* TODO likely allocation failure when lots of epoll items */
You can loop in chunks (a-la pids), all inside the mutex_lock().
> + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> + CKPT_HDR_FILE_EPOLL_ITEMS);
> + if (!h)
> + goto out;
return -ENOMEM;
> +
> + ret = -ENODEV;
> + h->num_items = i;
> + h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> + if (h->epfile_objref <= 0)
> + goto out;
This also should never happen. BUG_ON() here, too ?
(Also, this can go before the allocation of @h)
> +
> + ret = 0;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> + struct epitem *epi;
> + int objref;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (objref <= 0) {
> + /* TODO error -- possible checkpoint obj leak */
> + ret = -ENODEV;
This can probably go away (see next comment), but if it stays, then
call to ckpt_write_err() to explain what happens, and return -EBUSY.
> + break;
> + }
> + h->items[i].fd = epi->ffd.fd;
> + h->items[i].file_objref = objref;
Is there a reason to also save the file pointer ? I thought that
the fd suffices.
> + h->items[i].events = epi->event.events;
> + h->items[i].data = epi->event.data;
> + }
> + mutex_unlock(&ep->mtx);
> + if (h && !ret)
> + ret = ckpt_write_obj(ctx, &h->h);
Test for @h is unnecessary.
If you keep only fd (not file_objref) as suggested above, then test
for @ret is also unnecessary.
> + if (!ret && (i != h->num_items)) {
> + /* TODO error -- possible checkpoint obj leak */
> + }
Complain with ckpt_write_err(), return -EBUSY... (btw, this test
may go before ckpt_write_obj above).
> +out:
> + if (h)
Test for @h is unnecessary (with the change above).
> + ckpt_hdr_put(ctx, &h->h);
> + return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct ckpt_hdr_file *h;
> + struct epoll_deferq_entry ep_dq_entry;
> + int ret = -ENOMEM;
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> + if (!h)
> + goto out_print;
return -ENOMEM; (save label below)
> + h->f_type = CKPT_FILE_EPOLL;
> + ret = checkpoint_file_common(ctx, file, h);
> + if (ret < 0)
> + goto out;
> + ret = ckpt_write_obj(ctx, &h->h);
> + if (ret < 0)
> + goto out;
> +
> + /*
> + * Defer saving the epoll items until all of the ffd.file pointers
> + * have an objref; after the file table has been checkpointed.
> + */
> + ep_dq_entry.ctx = ctx;
> + ep_dq_entry.epfile = file;
> + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> + ckpt_hdr_put(ctx, h);
> +out_print:
> + return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
This is really obfuscated :(
How about ((struct epoll_deferq_entry *) data)->ctx ?
Actually, during restart the only data you save is @ctx... why at
all save an epoll_deferq_entry ? and ctx = (struct ckpt_ctx *) data;
> + struct ckpt_eventpoll_items *h;
> + struct eventpoll *ep;
> + struct file *epfile = NULL;
> + int ret, i = 0, remaining_watches;
> +
> + /*
> + * TODO possible kmalloc failure due to too many watches.
> + */
> + h = ckpt_read_obj(ctx, 0,
> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
Any reason to perfer ckpt_read_obj() over ckpt_read_buf_type() ?
(it will rid the check for type/len below).
Yep.. chunks is probably better. Such a mechanism would also be
useful for pending signals, and for reading in pids array.
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> +
> + ret = -EINVAL;
> + if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
> + (h->h.len < sizeof(*h)))
> + goto out;
> +
> + /* Make sure the items match the size we expect */
> + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> + goto out;
> +
> + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> + if (IS_ERR(epfile)) {
> + ret = PTR_ERR(epfile);
> + goto out;
> + }
As suggested above, use h->epfile_fd instead of epfile_objref:
epfile = fget(h->epfile_fd);
fput(epfile); /* safe since it's already in objhash */
or even BUG_ON(!ckpt_obj_fetch ...)
> + ret = -ENOMSG;
Maybe -EINVAL - data provided by user is invalid... ? (and the setting
of @ret above remains).
> + if (!is_file_epoll(epfile))
> + goto out;
> +
> + ep = epfile->private_data;
> +
> + ret = -ENOSPC;
> + remaining_watches = (max_user_watches -
> + atomic_read(&ep->user->epoll_watches));
> + if (h->num_items > remaining_watches)
> + goto out;
This is tested anyway by ep_insert() below. Besides, it's racy here
(user may have another restart in parallel, and the check-modify isn't
atomic. Unlike with ep_insert(), this check is for a bunch of fd's not
a single one.
> +
> + ret = 0;
> + /* Restore the epoll items/watches */
> + for (i = 0; !ret && i < h->num_items; i++) {
> + /*
> + * Loop body like multiple epoll_ctl(ep, ADD, event)
> + * calls except we've already done much of the checking.
> + */
> + struct epoll_event epev;
> + struct epitem *epi;
> + struct file *tfile;
> +
> + epev.events = h->items[i].events;
> + epev.data = h->items[i].data;
> +
> + /* Get the file* for the target file */
> + if (h->items[i].file_objref <= 0) {
> + ret = -EINVAL;
> + break;
> + }
> +
> + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> + CKPT_OBJ_FILE);
> + if (IS_ERR(tfile)) {
> + ret = PTR_ERR(tfile);
> + break;
> + }
Get the file from the fd using fget() instead ?
> +
> + /* The target file must support poll */
> + if (!tfile->f_op || !tfile->f_op->poll) {
> + ret = -EPERM;
> + break;
> + }
> +
> + /* Cannot add an epoll file descriptor inside itself. */
> + if (epfile == tfile) {
> + ret = -EINVAL;
> + break;
> + }
These last two tests are already done by epoll_ctl(). It also works
with the fd.
How about reusing epoll_ctl() - e.g. move the copy_from_user() to a
wrapper and the rest in do_epoll_ctl() - instead of using ep_insert()
directly ?
This will save you most of the logic above (including testing that
this is an epoll). It will make it less likely that new checks in
epoll_ctl() be left out from this code.
> +
> + mutex_lock(&ep->mtx);
> + epi = ep_find(ep, tfile, h->items[i].fd);
> + if (!epi) {
> + epev.events |= POLLERR | POLLHUP;
> + ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
> + } else
> + ret = -EEXIST;
> + mutex_unlock(&ep->mtx);
> + }
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +/* TODO confirm that get_current_user() has been restored */
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h)
> +{
> + struct file *epfile;
> + int epfd, ret;
> +
> + if (h->h.type != CKPT_HDR_FILE ||
> + h->h.len != sizeof(*h) ||
> + h->f_type != CKPT_FILE_EPOLL)
> + return ERR_PTR(-EINVAL);
> +
> + /*
> + * TODO Normally h->f_flags contains flags that epoll_create() won't
> + * accept. Right now we pass only those flags it will accept here
> + * and restore the rest during the "common" file restore. Check
> + * to make sure we're not missing anything.
> + */
Why is this 'TODO' ?
(FWIW, even the effect of this flag is restored in restore_file_common)
> + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> + if (epfd < 0)
> + return ERR_PTR(epfd);
> + epfile = fget(epfd);
> + if (!epfile)
> + return ERR_PTR(-ENOENT); /* TODO pick better error? */
Ughh... I wonder how this can happen ? Not even if the "bad guys" are
ptracing us... so this will probably be a BUG_ON() ?
I see in fs/pipe() I return -EBADF, but that one there, too, deserves
at the very least a nasty warning message, or even a BUG_ON.
> +
> + ret = restore_file_common(ctx, epfile, h);
> + if (ret < 0)
> + goto fput_out;
> +
> + /*
> + * Now we have the file and file descriptor but the epoll set is empty.
> + * Defer restoring the epoll set until we encounter its corresponding
> + * items. Note that this effectively counts the number of
> + * ckpt_eventpoll_items blocks we should expect -- we rely on the
> + * epfile_objref of those blocks to associate them with the proper
> + * file.
The second part of the comment is a bit confusing.
> + */
> + ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
> + ep_items_restore, NULL);
> + if (ret < 0) {
> +fput_out:
> + fput(epfile);
> + epfile = ERR_PTR(ret);
> + }
> + sys_close(epfd);
/* harmless even if an error occurred */
> + return epfile;
> +}
> +
> +#endif /* CONFIG_CHECKPOINT */
> +
> static int __init eventpoll_init(void)
> {
> struct sysinfo si;
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 761cad5..053c6c0 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -69,6 +69,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
> extern int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len);
> extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
> extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
> +extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
> extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
> extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
> extern int ckpt_read_payload(struct ckpt_ctx *ctx,
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 4d5c22a..3a3e530 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -76,6 +76,7 @@ enum {
> CKPT_HDR_FILE_NAME,
> CKPT_HDR_FILE,
> CKPT_HDR_PIPE_BUF,
> + CKPT_HDR_FILE_EPOLL_ITEMS, /* Follows file-table */
Nit: s/CKPT_HDR_FILE_EPOLL_ITEMS/CKPT_HDR_EPOLL_ITEMS/ ?
>
> CKPT_HDR_MM = 401,
> CKPT_HDR_VMA,
> @@ -342,6 +343,7 @@ enum file_type {
> CKPT_FILE_PIPE,
> CKPT_FILE_FIFO,
> CKPT_FILE_SOCKET,
> + CKPT_FILE_EPOLL,
> CKPT_FILE_MAX
> };
>
> @@ -426,6 +428,18 @@ struct ckpt_hdr_file_socket {
> struct ckpt_hdr_socket socket;
> } __attribute__((aligned(8)));
>
> +struct ckpt_eventpoll_items {
Nit: how about 'struct ckpt_hdr_eventpoll' ?
> + struct ckpt_hdr h;
> + __s32 epfile_objref;
> + __u32 num_items;
... also, making this a named struct (e.g. struct ckpt_hdr_eventpoll_item)
will make it friendlier to a 'chunks' implementation as mentioned above.
> + struct {
> + __u64 data;
> + __u32 fd;
> + __s32 file_objref;
> + __u32 events;
> + } items[0];
> +} __attribute__((aligned(8)));
So:
struct ckpt_hdr_eventpoll_item {
__u64 data;
__u32 fd;
__u32 events;
};
struct ckpt_hdr_eventpoll {
struct ckpt_hdr h;
__u32 epfile_fd;
__u32 num_items;
struct ckpt_hdr_eventpoll_item items[0];
};
> +
> struct ckpt_hdr_utsns {
> struct ckpt_hdr h;
> char sysname[__NEW_UTS_LEN + 1];
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index e98251b..51cdd0c 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -48,6 +48,8 @@ struct ckpt_ctx {
>
> struct ckpt_obj_hash *obj_hash; /* repository for shared objects */
> struct deferqueue_head *deferqueue; /* queue of deferred work */
> + struct deferqueue_head *files_deferq; /* deferred work to do after
> + saving file table */
>
> struct path fs_mnt; /* container root (FIXME) */
>
> diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
> index f6856a5..ff3de38 100644
> --- a/include/linux/eventpoll.h
> +++ b/include/linux/eventpoll.h
> @@ -95,11 +95,23 @@ static inline void eventpoll_release(struct file *file)
> eventpoll_release_file(file);
> }
>
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint_hdr.h>
To avoid dependency on checkpoint_hdr.h, you can instead:
struct ckpt_ctx;
struct ckpt_hdr_file;
> +extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h);
> +#endif
> #else
>
> static inline void eventpoll_init_file(struct file *file) {}
> static inline void eventpoll_release(struct file *file) {}
> -
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint_hdr.h>
Here too.
> +static inline struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *ptr)
> +{
> + return NULL;
Hmmm... the caller expects either a valid file pointer or an
error pointer:
return ERR_PTR(-ENOSYS);
> +}
> +#endif
> #endif
>
> #endif /* #ifdef __KERNEL__ */
Oren.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <e820d207a3b1a3228ded29bce9b29f2179ab598b.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-20 16:33 ` Oren Laadan
@ 2009-08-24 21:27 ` Serge E. Hallyn
[not found] ` <20090824212725.GA25030-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
1 sibling, 1 reply; 11+ messages in thread
From: Serge E. Hallyn @ 2009-08-24 21:27 UTC (permalink / raw)
To: Matt Helsley; +Cc: Containers
Quoting Matt Helsley (matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> Save/restore epoll items during checkpoint/restart respectively.
>
> Tests for the cr_tests suite to follow. Tests pass on i386.
>
> TODOs (search the patch for "TODO") that could probably use some
> comments:
>
> What to do when there's a "possible checkpoint obj leak"? (search patch
> for this string to see what I'm talking about)
>
> Ensure get_current_user will be correct (a userns question/issue?).
>
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file
> descriptors.
> This seems like a more general problem with some of the
> ckpt_hdr* functions than an epoll problem but...
>
> Pick better errnos for some cases.
>
> Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
> ---
> checkpoint/files.c | 35 +++++
> checkpoint/restart.c | 2 +-
> fs/eventpoll.c | 280 +++++++++++++++++++++++++++++++++++++-
> include/linux/checkpoint.h | 1 +
> include/linux/checkpoint_hdr.h | 14 ++
> include/linux/checkpoint_types.h | 2 +
> include/linux/eventpoll.h | 14 ++-
> 7 files changed, 345 insertions(+), 3 deletions(-)
>
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 204055b..8f86dcc 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,8 @@
> #include <linux/syscalls.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +#include <linux/eventpoll.h>
> #include <net/sock.h>
>
>
> @@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> goto out;
>
> ckpt_debug("nfds %d\n", nfds);
> + ctx->files_deferq = deferqueue_create();
> + if (!ctx->files_deferq) {
> + ret = -ENOMEM;
> + goto out;
> + }
> for (n = 0; n < nfds; n++) {
> ret = checkpoint_file_desc(ctx, files, fdtable[n]);
> if (ret < 0)
> break;
> }
> + if (!ret) {
> + ret = deferqueue_run(ctx->files_deferq);
> + if (ret > 0) {
> + pr_warning("c/r: files deferqueue had %d entries\n", ret);
> + ret = 0;
> + }
> + }
> + deferqueue_destroy(ctx->files_deferq);
> out:
> kfree(fdtable);
> return ret;
> @@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
> .file_type = CKPT_FILE_SOCKET,
> .restore = sock_file_restore,
> },
> +#ifdef CONFIG_EPOLL
> + /* epoll */
> + {
> + .file_name = "EPOLL",
> + .file_type = CKPT_FILE_EPOLL,
> + .restore = ep_file_restore,
> + },
> +#endif
> };
>
> static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -692,11 +715,23 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
> if (ret < 0)
> goto out;
>
> + ret = -ENOMEM;
> + ctx->files_deferq = deferqueue_create();
> + if (!ctx->files_deferq)
> + goto out;
> for (i = 0; i < h->fdt_nfds; i++) {
> ret = restore_file_desc(ctx);
> if (ret < 0)
> break;
> }
> + if (!ret) {
> + ret = deferqueue_run(ctx->files_deferq);
> + if (ret > 0) {
> + pr_warning("c/r: files deferqueue had %d entries\n", ret);
> + ret = 0;
> + }
> + }
> + deferqueue_destroy(ctx->files_deferq);
> out:
> ckpt_hdr_put(ctx, h);
> if (!ret) {
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 4fdae78..3a7d914 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
> *
> * Return: new buffer allocated on success, error pointer otherwise
> */
> -static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> +void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> {
> struct ckpt_hdr hh;
> struct ckpt_hdr *h;
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 085c5c0..7f7070f 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
> return pollflags != -1 ? pollflags : 0;
> }
>
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
> +#else
> +#define ep_eventpoll_checkpoint NULL
Don't forget ep_file_collect here.
> +#endif
> +
> /* File callbacks that implement the eventpoll file behaviour */
> static const struct file_operations eventpoll_fops = {
> .release = ep_eventpoll_release,
> - .poll = ep_eventpoll_poll
> + .poll = ep_eventpoll_poll,
> + .checkpoint = ep_eventpoll_checkpoint,
> + .collect = ep_file_collect,
> };
>
> /* Fast test to see if the file is an evenpoll file */
> @@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
>
> #endif /* HAVE_SET_RESTORE_SIGMASK */
>
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int ret = 0;
> +
> + if (!is_file_epoll(file))
> + return 0;
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + struct epitem *epi;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (ret < 0)
> + break;
> + }
> + mutex_unlock(&ep->mtx);
> + return ret;
> +}
> +
> +struct epoll_deferq_entry {
> + struct ckpt_ctx *ctx;
> + struct file *epfile;
> +};
> +
> +static int ep_items_checkpoint(void *data)
> +{
> + struct epoll_deferq_entry *ep_dq_entry = data;
> + struct ckpt_ctx *ctx;
> + struct file *file;
> + struct ckpt_eventpoll_items *h;
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int i, ret = -ENOMEM;
> +
> + file = ep_dq_entry->epfile;
> + ctx = ep_dq_entry->ctx;
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> + mutex_unlock(&ep->mtx);
> +
> + /* TODO likely allocation failure when lots of epoll items */
> + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> + CKPT_HDR_FILE_EPOLL_ITEMS);
> + if (!h)
> + goto out;
> +
> + ret = -ENODEV;
> + h->num_items = i;
> + h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> + if (h->epfile_objref <= 0)
> + goto out;
> +
> + ret = 0;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> + struct epitem *epi;
> + int objref;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (objref <= 0) {
> + /* TODO error -- possible checkpoint obj leak */
> + ret = -ENODEV;
> + break;
> + }
> + h->items[i].fd = epi->ffd.fd;
> + h->items[i].file_objref = objref;
> + h->items[i].events = epi->event.events;
> + h->items[i].data = epi->event.data;
> + }
> + mutex_unlock(&ep->mtx);
> + if (h && !ret)
> + ret = ckpt_write_obj(ctx, &h->h);
> + if (!ret && (i != h->num_items)) {
> + /* TODO error -- possible checkpoint obj leak */
> + }
> +out:
> + if (h)
> + ckpt_hdr_put(ctx, &h->h);
> + return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct ckpt_hdr_file *h;
> + struct epoll_deferq_entry ep_dq_entry;
> + int ret = -ENOMEM;
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> + if (!h)
> + goto out_print;
> + h->f_type = CKPT_FILE_EPOLL;
> + ret = checkpoint_file_common(ctx, file, h);
> + if (ret < 0)
> + goto out;
> + ret = ckpt_write_obj(ctx, &h->h);
> + if (ret < 0)
> + goto out;
> +
> + /*
> + * Defer saving the epoll items until all of the ffd.file pointers
> + * have an objref; after the file table has been checkpointed.
> + */
> + ep_dq_entry.ctx = ctx;
> + ep_dq_entry.epfile = file;
> + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> + ckpt_hdr_put(ctx, h);
> +out_print:
> + return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
> + struct ckpt_eventpoll_items *h;
> + struct eventpoll *ep;
> + struct file *epfile = NULL;
> + int ret, i = 0, remaining_watches;
> +
> + /*
> + * TODO possible kmalloc failure due to too many watches.
> + */
> + h = ckpt_read_obj(ctx, 0,
> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> +
> + ret = -EINVAL;
> + if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
> + (h->h.len < sizeof(*h)))
> + goto out;
> +
> + /* Make sure the items match the size we expect */
> + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> + goto out;
> +
> + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> + if (IS_ERR(epfile)) {
> + ret = PTR_ERR(epfile);
> + goto out;
> + }
> + ret = -ENOMSG;
> + if (!is_file_epoll(epfile))
> + goto out;
> +
> + ep = epfile->private_data;
> +
> + ret = -ENOSPC;
> + remaining_watches = (max_user_watches -
> + atomic_read(&ep->user->epoll_watches));
> + if (h->num_items > remaining_watches)
> + goto out;
> +
> + ret = 0;
> + /* Restore the epoll items/watches */
> + for (i = 0; !ret && i < h->num_items; i++) {
> + /*
> + * Loop body like multiple epoll_ctl(ep, ADD, event)
> + * calls except we've already done much of the checking.
> + */
> + struct epoll_event epev;
> + struct epitem *epi;
> + struct file *tfile;
> +
> + epev.events = h->items[i].events;
> + epev.data = h->items[i].data;
> +
> + /* Get the file* for the target file */
> + if (h->items[i].file_objref <= 0) {
> + ret = -EINVAL;
> + break;
> + }
> +
> + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> + CKPT_OBJ_FILE);
> + if (IS_ERR(tfile)) {
> + ret = PTR_ERR(tfile);
> + break;
> + }
> +
> + /* The target file must support poll */
> + if (!tfile->f_op || !tfile->f_op->poll) {
> + ret = -EPERM;
> + break;
> + }
> +
> + /* Cannot add an epoll file descriptor inside itself. */
> + if (epfile == tfile) {
> + ret = -EINVAL;
> + break;
> + }
> +
> + mutex_lock(&ep->mtx);
> + epi = ep_find(ep, tfile, h->items[i].fd);
> + if (!epi) {
> + epev.events |= POLLERR | POLLHUP;
> + ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
> + } else
> + ret = -EEXIST;
> + mutex_unlock(&ep->mtx);
> + }
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +/* TODO confirm that get_current_user() has been restored */
Not sure what you mean. At this point, the task's credentials
are still the ones used when it called sys_restart(). We won't
update them until the end of the restart operation. However the
normal file restore operations should have reset the file->f_cred
to the checkpointed value - including hierarchical user namespaces.
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h)
> +{
> + struct file *epfile;
> + int epfd, ret;
> +
> + if (h->h.type != CKPT_HDR_FILE ||
> + h->h.len != sizeof(*h) ||
> + h->f_type != CKPT_FILE_EPOLL)
> + return ERR_PTR(-EINVAL);
> +
> + /*
> + * TODO Normally h->f_flags contains flags that epoll_create() won't
> + * accept. Right now we pass only those flags it will accept here
> + * and restore the rest during the "common" file restore. Check
> + * to make sure we're not missing anything.
> + */
> + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> + if (epfd < 0)
> + return ERR_PTR(epfd);
> + epfile = fget(epfd);
> + if (!epfile)
> + return ERR_PTR(-ENOENT); /* TODO pick better error? */
> +
> + ret = restore_file_common(ctx, epfile, h);
> + if (ret < 0)
> + goto fput_out;
> +
> + /*
> + * Now we have the file and file descriptor but the epoll set is empty.
> + * Defer restoring the epoll set until we encounter its corresponding
> + * items. Note that this effectively counts the number of
> + * ckpt_eventpoll_items blocks we should expect -- we rely on the
> + * epfile_objref of those blocks to associate them with the proper
> + * file.
> + */
> + ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
> + ep_items_restore, NULL);
> + if (ret < 0) {
> +fput_out:
> + fput(epfile);
> + epfile = ERR_PTR(ret);
> + }
> + sys_close(epfd);
> + return epfile;
> +}
> +
> +#endif /* CONFIG_CHECKPOINT */
> +
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <20090824212725.GA25030-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-08-25 2:01 ` Matt Helsley
[not found] ` <20090825020146.GC8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
0 siblings, 1 reply; 11+ messages in thread
From: Matt Helsley @ 2009-08-25 2:01 UTC (permalink / raw)
To: Serge E. Hallyn; +Cc: Containers
On Mon, Aug 24, 2009 at 04:27:25PM -0500, Serge E. Hallyn wrote:
> Quoting Matt Helsley (matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> > Save/restore epoll items during checkpoint/restart respectively.
<snip>
> > diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> > index 085c5c0..7f7070f 100644
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
> > return pollflags != -1 ? pollflags : 0;
> > }
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> > +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
> > +#else
> > +#define ep_eventpoll_checkpoint NULL
>
> Don't forget ep_file_collect here.
Oops, good catch.
>
> > +#endif
> > +
> > /* File callbacks that implement the eventpoll file behaviour */
> > static const struct file_operations eventpoll_fops = {
> > .release = ep_eventpoll_release,
> > - .poll = ep_eventpoll_poll
> > + .poll = ep_eventpoll_poll,
> > + .checkpoint = ep_eventpoll_checkpoint,
> > + .collect = ep_file_collect,
> > };
> >
> > /* Fast test to see if the file is an evenpoll file */
> > @@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> >
> > #endif /* HAVE_SET_RESTORE_SIGMASK */
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +#include <linux/checkpoint.h>
> > +#include <linux/checkpoint_hdr.h>
> > +#include <linux/deferqueue.h>
> > +
> > +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + int ret = 0;
> > +
> > + if (!is_file_epoll(file))
> > + return 0;
> > +
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> > + struct epitem *epi;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> > + if (ret < 0)
> > + break;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + return ret;
> > +}
> > +
> > +struct epoll_deferq_entry {
> > + struct ckpt_ctx *ctx;
> > + struct file *epfile;
> > +};
> > +
> > +static int ep_items_checkpoint(void *data)
> > +{
> > + struct epoll_deferq_entry *ep_dq_entry = data;
> > + struct ckpt_ctx *ctx;
> > + struct file *file;
> > + struct ckpt_eventpoll_items *h;
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + int i, ret = -ENOMEM;
> > +
> > + file = ep_dq_entry->epfile;
> > + ctx = ep_dq_entry->ctx;
> > +
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> > + mutex_unlock(&ep->mtx);
> > +
> > + /* TODO likely allocation failure when lots of epoll items */
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> > + CKPT_HDR_FILE_EPOLL_ITEMS);
> > + if (!h)
> > + goto out;
> > +
> > + ret = -ENODEV;
> > + h->num_items = i;
> > + h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> > + if (h->epfile_objref <= 0)
> > + goto out;
> > +
> > + ret = 0;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> > + struct epitem *epi;
> > + int objref;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> > + if (objref <= 0) {
> > + /* TODO error -- possible checkpoint obj leak */
> > + ret = -ENODEV;
> > + break;
> > + }
> > + h->items[i].fd = epi->ffd.fd;
> > + h->items[i].file_objref = objref;
> > + h->items[i].events = epi->event.events;
> > + h->items[i].data = epi->event.data;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + if (h && !ret)
> > + ret = ckpt_write_obj(ctx, &h->h);
> > + if (!ret && (i != h->num_items)) {
> > + /* TODO error -- possible checkpoint obj leak */
> > + }
> > +out:
> > + if (h)
> > + ckpt_hdr_put(ctx, &h->h);
> > + return ret;
> > +}
> > +
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct ckpt_hdr_file *h;
> > + struct epoll_deferq_entry ep_dq_entry;
> > + int ret = -ENOMEM;
> > +
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> > + if (!h)
> > + goto out_print;
> > + h->f_type = CKPT_FILE_EPOLL;
> > + ret = checkpoint_file_common(ctx, file, h);
> > + if (ret < 0)
> > + goto out;
> > + ret = ckpt_write_obj(ctx, &h->h);
> > + if (ret < 0)
> > + goto out;
> > +
> > + /*
> > + * Defer saving the epoll items until all of the ffd.file pointers
> > + * have an objref; after the file table has been checkpointed.
> > + */
> > + ep_dq_entry.ctx = ctx;
> > + ep_dq_entry.epfile = file;
> > + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> > + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > +out_print:
> > + return ret;
> > +}
> > +
> > +static int ep_items_restore(void *data)
> > +{
> > + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
> > + struct ckpt_eventpoll_items *h;
> > + struct eventpoll *ep;
> > + struct file *epfile = NULL;
> > + int ret, i = 0, remaining_watches;
> > +
> > + /*
> > + * TODO possible kmalloc failure due to too many watches.
> > + */
> > + h = ckpt_read_obj(ctx, 0,
> > + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
> > + if (IS_ERR(h))
> > + return PTR_ERR(h);
> > +
> > + ret = -EINVAL;
> > + if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
> > + (h->h.len < sizeof(*h)))
> > + goto out;
> > +
> > + /* Make sure the items match the size we expect */
> > + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> > + goto out;
> > +
> > + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> > + if (IS_ERR(epfile)) {
> > + ret = PTR_ERR(epfile);
> > + goto out;
> > + }
> > + ret = -ENOMSG;
> > + if (!is_file_epoll(epfile))
> > + goto out;
> > +
> > + ep = epfile->private_data;
> > +
> > + ret = -ENOSPC;
> > + remaining_watches = (max_user_watches -
> > + atomic_read(&ep->user->epoll_watches));
> > + if (h->num_items > remaining_watches)
> > + goto out;
> > +
> > + ret = 0;
> > + /* Restore the epoll items/watches */
> > + for (i = 0; !ret && i < h->num_items; i++) {
> > + /*
> > + * Loop body like multiple epoll_ctl(ep, ADD, event)
> > + * calls except we've already done much of the checking.
> > + */
> > + struct epoll_event epev;
> > + struct epitem *epi;
> > + struct file *tfile;
> > +
> > + epev.events = h->items[i].events;
> > + epev.data = h->items[i].data;
> > +
> > + /* Get the file* for the target file */
> > + if (h->items[i].file_objref <= 0) {
> > + ret = -EINVAL;
> > + break;
> > + }
> > +
> > + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> > + CKPT_OBJ_FILE);
> > + if (IS_ERR(tfile)) {
> > + ret = PTR_ERR(tfile);
> > + break;
> > + }
> > +
> > + /* The target file must support poll */
> > + if (!tfile->f_op || !tfile->f_op->poll) {
> > + ret = -EPERM;
> > + break;
> > + }
> > +
> > + /* Cannot add an epoll file descriptor inside itself. */
> > + if (epfile == tfile) {
> > + ret = -EINVAL;
> > + break;
> > + }
> > +
> > + mutex_lock(&ep->mtx);
> > + epi = ep_find(ep, tfile, h->items[i].fd);
> > + if (!epi) {
> > + epev.events |= POLLERR | POLLHUP;
> > + ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
> > + } else
> > + ret = -EEXIST;
> > + mutex_unlock(&ep->mtx);
> > + }
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > + return ret;
> > +}
> > +
> > +/* TODO confirm that get_current_user() has been restored */
>
> Not sure what you mean. At this point, the task's credentials
> are still the ones used when it called sys_restart(). We won't
> update them until the end of the restart operation. However the
> normal file restore operations should have reset the file->f_cred
> to the checkpointed value - including hierarchical user namespaces.
I wanted to check to ensure that we're properly enforcing the epoll
watch limits for the restarted tasks. So the patch looks good because
it restore the "common" file pieces before the epoll items/watches.
See below if you're curious where/how that happens. I think I can
remove this TODO now.
> > +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > + struct ckpt_hdr_file *h)
> > +{
> > + struct file *epfile;
> > + int epfd, ret;
> > +
> > + if (h->h.type != CKPT_HDR_FILE ||
> > + h->h.len != sizeof(*h) ||
> > + h->f_type != CKPT_FILE_EPOLL)
> > + return ERR_PTR(-EINVAL);
> > +
> > + /*
> > + * TODO Normally h->f_flags contains flags that epoll_create() won't
> > + * accept. Right now we pass only those flags it will accept here
> > + * and restore the rest during the "common" file restore. Check
> > + * to make sure we're not missing anything.
> > + */
> > + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> > + if (epfd < 0)
> > + return ERR_PTR(epfd);
> > + epfile = fget(epfd);
> > + if (!epfile)
> > + return ERR_PTR(-ENOENT); /* TODO pick better error? */
> > +
> > + ret = restore_file_common(ctx, epfile, h);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This should restore epfile->f_cred and...
> > + if (ret < 0)
> > + goto fput_out;
> > +
> > + /*
> > + * Now we have the file and file descriptor but the epoll set is empty.
> > + * Defer restoring the epoll set until we encounter its corresponding
> > + * items. Note that this effectively counts the number of
> > + * ckpt_eventpoll_items blocks we should expect -- we rely on the
> > + * epfile_objref of those blocks to associate them with the proper
> > + * file.
> > + */
> > + ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
> > + ep_items_restore, NULL);
This defers restoration of the items/watches.
> > + if (ret < 0) {
> > +fput_out:
> > + fput(epfile);
> > + epfile = ERR_PTR(ret);
> > + }
> > + sys_close(epfd);
> > + return epfile;
> > +}
> > +
> > +#endif /* CONFIG_CHECKPOINT */
> > +
Thanks for the review!
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <20090825020146.GC8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-08-25 4:17 ` Oren Laadan
[not found] ` <4A9365C4.3010305-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
0 siblings, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2009-08-25 4:17 UTC (permalink / raw)
To: Matt Helsley; +Cc: Containers
Matt Helsley wrote:
> On Mon, Aug 24, 2009 at 04:27:25PM -0500, Serge E. Hallyn wrote:
>> Quoting Matt Helsley (matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
>>> Save/restore epoll items during checkpoint/restart respectively.
[...]
>>> +/* TODO confirm that get_current_user() has been restored */
>> Not sure what you mean. At this point, the task's credentials
>> are still the ones used when it called sys_restart(). We won't
>> update them until the end of the restart operation. However the
>> normal file restore operations should have reset the file->f_cred
>> to the checkpointed value - including hierarchical user namespaces.
>
> I wanted to check to ensure that we're properly enforcing the epoll
> watch limits for the restarted tasks. So the patch looks good because
> it restore the "common" file pieces before the epoll items/watches.
> See below if you're curious where/how that happens. I think I can
> remove this TODO now.
Good point. Me thinks it deserves a comment in the code ?
Oren.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <4A8D7AE2.1030705-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-08-25 5:17 ` Matt Helsley
[not found] ` <20090825051723.GD8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
0 siblings, 1 reply; 11+ messages in thread
From: Matt Helsley @ 2009-08-25 5:17 UTC (permalink / raw)
To: Oren Laadan; +Cc: Containers
On Thu, Aug 20, 2009 at 12:33:38PM -0400, Oren Laadan wrote:
> Hi Matt,
>
> I like the approach of the patch and use of deferqueue.
> See comments below.
>
> Matt Helsley wrote:
> > Save/restore epoll items during checkpoint/restart respectively.
> >
> > Tests for the cr_tests suite to follow. Tests pass on i386.
> >
> > TODOs (search the patch for "TODO") that could probably use some
> > comments:
> >
> > What to do when there's a "possible checkpoint obj leak"? (search patch
> > for this string to see what I'm talking about)
>
> Generally, complain with ckpt_write_err() and abort.
OK
>
> >
> > Ensure get_current_user will be correct (a userns question/issue?).
> >
> > kmalloc failures should be dealt with more kindly than just error-out
> > because epoll is made to poll many thousands of file
> > descriptors.
> > This seems like a more general problem with some of the
> > ckpt_hdr* functions than an epoll problem but...
>
> This is handled with pids by writing in chunks (only on checkpoint
> side).
>
> There is now better infrastructure to do chunks on restart
> side too - first read the header's header (struct ckpt_hdr) without
> the buffer, allocate a local buffer, followed by ckpt_read()s.
>
> Or add ckpt_read_chunk() that will also allocate the buffer for
> the next chunk. Or even a loop: ckpt_read_chunk_{start,next,end}() ?
OK, I'll look into this. I think I will keep this patch without
"chunking" and add a second patch which will do the chunking.
Please consider all other comments re: chunks OK'd.
>
> >
> > Pick better errnos for some cases.
> >
> > Signed-off-by: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> > Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
> > ---
> > checkpoint/files.c | 35 +++++
> > checkpoint/restart.c | 2 +-
> > fs/eventpoll.c | 280 +++++++++++++++++++++++++++++++++++++-
> > include/linux/checkpoint.h | 1 +
> > include/linux/checkpoint_hdr.h | 14 ++
> > include/linux/checkpoint_types.h | 2 +
> > include/linux/eventpoll.h | 14 ++-
> > 7 files changed, 345 insertions(+), 3 deletions(-)
> >
> > diff --git a/checkpoint/files.c b/checkpoint/files.c
> > index 204055b..8f86dcc 100644
> > --- a/checkpoint/files.c
> > +++ b/checkpoint/files.c
> > @@ -21,6 +21,8 @@
> > #include <linux/syscalls.h>
> > #include <linux/checkpoint.h>
> > #include <linux/checkpoint_hdr.h>
> > +#include <linux/deferqueue.h>
> > +#include <linux/eventpoll.h>
> > #include <net/sock.h>
> >
> >
> > @@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> > goto out;
> >
> > ckpt_debug("nfds %d\n", nfds);
> > + ctx->files_deferq = deferqueue_create();
> > + if (!ctx->files_deferq) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
>
> You can create/destroy the deferqueue once, in ckpt_ctx_alloc()
> and ckpt_ctx_free(), respectively. It will also simplify the logic
> here.
OK. Consider other cleanups related to this OK'd.
>
> > for (n = 0; n < nfds; n++) {
> > ret = checkpoint_file_desc(ctx, files, fdtable[n]);
> > if (ret < 0)
> > break;
>
> If you s/break/goto out/ here, you can get rid if the 'if' clause
> below (assuming ctx->files_deferq is alloc/free in ckpt_ctx_...)
OK.
>
> > }
> > + if (!ret) {
> > + ret = deferqueue_run(ctx->files_deferq);
> > + if (ret > 0) {
> > + pr_warning("c/r: files deferqueue had %d entries\n", ret);
>
> ckpt_debug(...); ?
OK
>
> > + ret = 0;
> > + }
> > + }
> > + deferqueue_destroy(ctx->files_deferq);
> > out:
> > kfree(fdtable);
> > return ret;
> > @@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
> > .file_type = CKPT_FILE_SOCKET,
> > .restore = sock_file_restore,
> > },
> > +#ifdef CONFIG_EPOLL
> > + /* epoll */
> > + {
> > + .file_name = "EPOLL",
> > + .file_type = CKPT_FILE_EPOLL,
> > + .restore = ep_file_restore,
> > + },
> > +#endif
> > };
> >
> > static struct file *do_restore_file(struct ckpt_ctx *ctx)
> > @@ -692,11 +715,23 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
> > if (ret < 0)
> > goto out;
> >
> > + ret = -ENOMEM;
> > + ctx->files_deferq = deferqueue_create();
> > + if (!ctx->files_deferq)
> > + goto out;
>
> Ditto for alloc/free of ctx->files_deferq.
>
> > for (i = 0; i < h->fdt_nfds; i++) {
> > ret = restore_file_desc(ctx);
> > if (ret < 0)
> > break;
>
> Ditto for s/break/goto out/ to simplify logic.
>
> > }
> > + if (!ret) {
> > + ret = deferqueue_run(ctx->files_deferq);
> > + if (ret > 0) {
> > + pr_warning("c/r: files deferqueue had %d entries\n", ret);
>
> ckpt_debug(...); ?
>
> > + ret = 0;
> > + }
> > + }
> > + deferqueue_destroy(ctx->files_deferq);
> > out:
> > ckpt_hdr_put(ctx, h);
> > if (!ret) {
> > diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> > index 4fdae78..3a7d914 100644
> > --- a/checkpoint/restart.c
> > +++ b/checkpoint/restart.c
> > @@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
> > *
> > * Return: new buffer allocated on success, error pointer otherwise
> > */
> > -static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> > +void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> > {
> > struct ckpt_hdr hh;
> > struct ckpt_hdr *h;
> > diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> > index 085c5c0..7f7070f 100644
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
> > return pollflags != -1 ? pollflags : 0;
> > }
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> > +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
> > +#else
> > +#define ep_eventpoll_checkpoint NULL
> > +#endif
> > +
> > /* File callbacks that implement the eventpoll file behaviour */
> > static const struct file_operations eventpoll_fops = {
> > .release = ep_eventpoll_release,
> > - .poll = ep_eventpoll_poll
> > + .poll = ep_eventpoll_poll,
> > + .checkpoint = ep_eventpoll_checkpoint,
> > + .collect = ep_file_collect,
> > };
> >
> > /* Fast test to see if the file is an evenpoll file */
> > @@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> >
> > #endif /* HAVE_SET_RESTORE_SIGMASK */
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +#include <linux/checkpoint.h>
> > +#include <linux/checkpoint_hdr.h>
> > +#include <linux/deferqueue.h>
> > +
> > +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + int ret = 0;
> > +
> > + if (!is_file_epoll(file))
> > + return 0;
>
> This should never happen, right ?
> Perhaps BUG_ON() instead ?
It's redundant so I removed it. I was unconditionally calling this on
each file when collecting the file table as a bad simulation of your
.collect file operation. Now that that's in I can remove this check.
>
> > +
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> > + struct epitem *epi;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> > + if (ret < 0)
> > + break;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + return ret;
> > +}
> > +
> > +struct epoll_deferq_entry {
> > + struct ckpt_ctx *ctx;
> > + struct file *epfile;
> > +};
>
> I suggest that:
>
> 1) Add 'int fd' here,
> 2) in ckpt_eventpoll_items use the fd (instead of file objref)
>
> The motivation is security: if, on restart, you rely on a file objref,
> then malicious user can change the file objref and the restart logic
> may end up populating some other process's event poll items.
>
> Instead, by using an fd, we ensure that the restarting process will
> only modify a file that it actually owns.
The fd saved with the items is not necessarily matched with the file*
anymore. For example userspace could do:
fda = open("/dev/null", O_RDONLY);
epoll_ctl(efd, EPOLL_CTL_ADD, fda, ...);
fdb = dup(fda);
dup2(0, fda);
<checkpoint here>
epoll_ctl(efd, EPOLL_CTL_MOD, fda, ...);
Saving the fd would associate stdin with the epoll item and not
/dev/null.
So we can't use the fd in place of the file reference.
>
>
> > +
> > +static int ep_items_checkpoint(void *data)
> > +{
> > + struct epoll_deferq_entry *ep_dq_entry = data;
> > + struct ckpt_ctx *ctx;
> > + struct file *file;
> > + struct ckpt_eventpoll_items *h;
> > + struct rb_node *rbp;
> > + struct eventpoll *ep;
> > + int i, ret = -ENOMEM;
> > +
> > + file = ep_dq_entry->epfile;
> > + ctx = ep_dq_entry->ctx;
> > +
> > + ep = file->private_data;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> > + mutex_unlock(&ep->mtx);
> > +
> > + /* TODO likely allocation failure when lots of epoll items */
>
> You can loop in chunks (a-la pids), all inside the mutex_lock().
>
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> > + CKPT_HDR_FILE_EPOLL_ITEMS);
> > + if (!h)
> > + goto out;
>
> return -ENOMEM;
> > +
> > + ret = -ENODEV;
> > + h->num_items = i;
> > + h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> > + if (h->epfile_objref <= 0)
> > + goto out;
>
> This also should never happen. BUG_ON() here, too ?
OK
> (Also, this can go before the allocation of @h)
Good idea!
>
> > +
> > + ret = 0;
> > + mutex_lock(&ep->mtx);
> > + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> > + struct epitem *epi;
> > + int objref;
> > +
> > + epi = rb_entry(rbp, struct epitem, rbn);
> > + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> > + if (objref <= 0) {
> > + /* TODO error -- possible checkpoint obj leak */
> > + ret = -ENODEV;
>
> This can probably go away (see next comment), but if it stays, then
> call to ckpt_write_err() to explain what happens, and return -EBUSY.
OK
>
> > + break;
> > + }
> > + h->items[i].fd = epi->ffd.fd;
> > + h->items[i].file_objref = objref;
>
> Is there a reason to also save the file pointer ? I thought that
> the fd suffices.
The fd does not suffice as demonstrated above.
>
> > + h->items[i].events = epi->event.events;
> > + h->items[i].data = epi->event.data;
> > + }
> > + mutex_unlock(&ep->mtx);
> > + if (h && !ret)
> > + ret = ckpt_write_obj(ctx, &h->h);
>
> Test for @h is unnecessary.
OK
>
> If you keep only fd (not file_objref) as suggested above, then test
> for @ret is also unnecessary.
The file ref is needed.
>
> > + if (!ret && (i != h->num_items)) {
> > + /* TODO error -- possible checkpoint obj leak */
> > + }
>
> Complain with ckpt_write_err(), return -EBUSY... (btw, this test
> may go before ckpt_write_obj above).
OK
> > +out:
> > + if (h)
>
> Test for @h is unnecessary (with the change above).
OK
>
> > + ckpt_hdr_put(ctx, &h->h);
> > + return ret;
> > +}
> > +
> > +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> > +{
> > + struct ckpt_hdr_file *h;
> > + struct epoll_deferq_entry ep_dq_entry;
> > + int ret = -ENOMEM;
> > +
> > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> > + if (!h)
> > + goto out_print;
> return -ENOMEM; (save label below)
Good idea.
>
> > + h->f_type = CKPT_FILE_EPOLL;
> > + ret = checkpoint_file_common(ctx, file, h);
> > + if (ret < 0)
> > + goto out;
> > + ret = ckpt_write_obj(ctx, &h->h);
> > + if (ret < 0)
> > + goto out;
> > +
> > + /*
> > + * Defer saving the epoll items until all of the ffd.file pointers
> > + * have an objref; after the file table has been checkpointed.
> > + */
> > + ep_dq_entry.ctx = ctx;
> > + ep_dq_entry.epfile = file;
> > + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> > + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > +out_print:
> > + return ret;
> > +}
> > +
> > +static int ep_items_restore(void *data)
> > +{
> > + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
>
> This is really obfuscated :(
Tell that to the author of deferqueue. ;) Since deferqueue does memcpy()
there's really not a better way to pass the ctx pointer.
> How about ((struct epoll_deferq_entry *) data)->ctx ?
Because I don't need struct epoll_deferq_entry during restore, as you
point out below.
> Actually, during restart the only data you save is @ctx... why at
> all save an epoll_deferq_entry ? and ctx = (struct ckpt_ctx *) data;
I don't. I need the pointer-to-a-pointer because of the way deferqueue
works. It doesn't simply pass a pointer but makes a copy of what the
pointer points to. Hence a pointer to the ctx pointer is passed.
> > + struct ckpt_eventpoll_items *h;
> > + struct eventpoll *ep;
> > + struct file *epfile = NULL;
> > + int ret, i = 0, remaining_watches;
> > +
> > + /*
> > + * TODO possible kmalloc failure due to too many watches.
> > + */
> > + h = ckpt_read_obj(ctx, 0,
> > + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
>
> Any reason to perfer ckpt_read_obj() over ckpt_read_buf_type() ?
> (it will rid the check for type/len below).
So I could read it all at once rather than fiddle with reading the
header and chunking all of the input. I wanted to get something simple
working first and then make it work with thousands of watches.
> Yep.. chunks is probably better. Such a mechanism would also be
> useful for pending signals, and for reading in pids array.
>
> > + if (IS_ERR(h))
> > + return PTR_ERR(h);
> > +
> > + ret = -EINVAL;
> > + if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
> > + (h->h.len < sizeof(*h)))
> > + goto out;
> > +
> > + /* Make sure the items match the size we expect */
> > + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> > + goto out;
> > +
> > + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> > + if (IS_ERR(epfile)) {
> > + ret = PTR_ERR(epfile);
> > + goto out;
> > + }
>
> As suggested above, use h->epfile_fd instead of epfile_objref:
> epfile = fget(h->epfile_fd);
> fput(epfile); /* safe since it's already in objhash */
> or even BUG_ON(!ckpt_obj_fetch ...)
I would except the fd isn't passed in, looking it up with a manual walk
of the fd table is a bad idea, and there could be multiple fds
(via dup*()) for each epoll file. Otherwise an fd would work here (but
not for the items).
>
> > + ret = -ENOMSG;
>
> Maybe -EINVAL - data provided by user is invalid... ? (and the setting
> of @ret above remains).
OK
>
> > + if (!is_file_epoll(epfile))
> > + goto out;
> > +
> > + ep = epfile->private_data;
> > +
> > + ret = -ENOSPC;
> > + remaining_watches = (max_user_watches -
> > + atomic_read(&ep->user->epoll_watches));
> > + if (h->num_items > remaining_watches)
> > + goto out;
>
> This is tested anyway by ep_insert() below. Besides, it's racy here
> (user may have another restart in parallel, and the check-modify isn't
> atomic. Unlike with ep_insert(), this check is for a bunch of fd's not
> a single one.
Yes, it's racy. However, we're restarting so we're primarily adding watches.
It's also possible we're restarting thousands of watches only to fail
when we add the last few. This avoids all that useless work when possible.
> > +
> > + ret = 0;
> > + /* Restore the epoll items/watches */
> > + for (i = 0; !ret && i < h->num_items; i++) {
> > + /*
> > + * Loop body like multiple epoll_ctl(ep, ADD, event)
> > + * calls except we've already done much of the checking.
> > + */
> > + struct epoll_event epev;
> > + struct epitem *epi;
> > + struct file *tfile;
> > +
> > + epev.events = h->items[i].events;
> > + epev.data = h->items[i].data;
> > +
> > + /* Get the file* for the target file */
> > + if (h->items[i].file_objref <= 0) {
> > + ret = -EINVAL;
> > + break;
> > + }
> > +
> > + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> > + CKPT_OBJ_FILE);
> > + if (IS_ERR(tfile)) {
> > + ret = PTR_ERR(tfile);
> > + break;
> > + }
>
> Get the file from the fd using fget() instead ?
No -- we need the file and we can't rely on the fd table.
> > +
> > + /* The target file must support poll */
> > + if (!tfile->f_op || !tfile->f_op->poll) {
> > + ret = -EPERM;
> > + break;
> > + }
> > +
> > + /* Cannot add an epoll file descriptor inside itself. */
> > + if (epfile == tfile) {
> > + ret = -EINVAL;
> > + break;
> > + }
>
> These last two tests are already done by epoll_ctl(). It also works
> with the fd.
>
> How about reusing epoll_ctl() - e.g. move the copy_from_user() to a
> wrapper and the rest in do_epoll_ctl() - instead of using ep_insert()
> directly ?
OK except I moved the fd lookup into the wrapper.
>
> This will save you most of the logic above (including testing that
> this is an epoll). It will make it less likely that new checks in
> epoll_ctl() be left out from this code.
Yup.
>
> > +
> > + mutex_lock(&ep->mtx);
> > + epi = ep_find(ep, tfile, h->items[i].fd);
> > + if (!epi) {
> > + epev.events |= POLLERR | POLLHUP;
> > + ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
> > + } else
> > + ret = -EEXIST;
> > + mutex_unlock(&ep->mtx);
> > + }
> > +out:
> > + ckpt_hdr_put(ctx, h);
> > + return ret;
> > +}
> > +
> > +/* TODO confirm that get_current_user() has been restored */
> > +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > + struct ckpt_hdr_file *h)
> > +{
> > + struct file *epfile;
> > + int epfd, ret;
> > +
> > + if (h->h.type != CKPT_HDR_FILE ||
> > + h->h.len != sizeof(*h) ||
> > + h->f_type != CKPT_FILE_EPOLL)
> > + return ERR_PTR(-EINVAL);
> > +
> > + /*
> > + * TODO Normally h->f_flags contains flags that epoll_create() won't
> > + * accept. Right now we pass only those flags it will accept here
> > + * and restore the rest during the "common" file restore. Check
> > + * to make sure we're not missing anything.
> > + */
>
> Why is this 'TODO' ?
I wanted to doublecheck and make sure userspace couldn't get around any
flags restrictions.
> (FWIW, even the effect of this flag is restored in restore_file_common)
Yup.
>
> > + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> > + if (epfd < 0)
> > + return ERR_PTR(epfd);
> > + epfile = fget(epfd);
> > + if (!epfile)
> > + return ERR_PTR(-ENOENT); /* TODO pick better error? */
>
> Ughh... I wonder how this can happen ? Not even if the "bad guys" are
> ptracing us... so this will probably be a BUG_ON() ?
OK.
> I see in fs/pipe() I return -EBADF, but that one there, too, deserves
> at the very least a nasty warning message, or even a BUG_ON.
>
> > +
> > + ret = restore_file_common(ctx, epfile, h);
> > + if (ret < 0)
> > + goto fput_out;
> > +
> > + /*
> > + * Now we have the file and file descriptor but the epoll set is empty.
> > + * Defer restoring the epoll set until we encounter its corresponding
> > + * items. Note that this effectively counts the number of
> > + * ckpt_eventpoll_items blocks we should expect -- we rely on the
> > + * epfile_objref of those blocks to associate them with the proper
> > + * file.
>
> The second part of the comment is a bit confusing.
OK
>
> > + */
> > + ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
> > + ep_items_restore, NULL);
> > + if (ret < 0) {
> > +fput_out:
> > + fput(epfile);
> > + epfile = ERR_PTR(ret);
> > + }
> > + sys_close(epfd);
> /* harmless even if an error occurred */
OK
>
> > + return epfile;
> > +}
> > +
> > +#endif /* CONFIG_CHECKPOINT */
> > +
> > static int __init eventpoll_init(void)
> > {
> > struct sysinfo si;
> > diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> > index 761cad5..053c6c0 100644
> > --- a/include/linux/checkpoint.h
> > +++ b/include/linux/checkpoint.h
> > @@ -69,6 +69,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
> > extern int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len);
> > extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
> > extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
> > +extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
> > extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
> > extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
> > extern int ckpt_read_payload(struct ckpt_ctx *ctx,
> > diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> > index 4d5c22a..3a3e530 100644
> > --- a/include/linux/checkpoint_hdr.h
> > +++ b/include/linux/checkpoint_hdr.h
> > @@ -76,6 +76,7 @@ enum {
> > CKPT_HDR_FILE_NAME,
> > CKPT_HDR_FILE,
> > CKPT_HDR_PIPE_BUF,
> > + CKPT_HDR_FILE_EPOLL_ITEMS, /* Follows file-table */
>
> Nit: s/CKPT_HDR_FILE_EPOLL_ITEMS/CKPT_HDR_EPOLL_ITEMS/ ?
OK
>
> >
> > CKPT_HDR_MM = 401,
> > CKPT_HDR_VMA,
> > @@ -342,6 +343,7 @@ enum file_type {
> > CKPT_FILE_PIPE,
> > CKPT_FILE_FIFO,
> > CKPT_FILE_SOCKET,
> > + CKPT_FILE_EPOLL,
> > CKPT_FILE_MAX
> > };
> >
> > @@ -426,6 +428,18 @@ struct ckpt_hdr_file_socket {
> > struct ckpt_hdr_socket socket;
> > } __attribute__((aligned(8)));
> >
> > +struct ckpt_eventpoll_items {
>
> Nit: how about 'struct ckpt_hdr_eventpoll' ?
OK, but not for the items...
I'm also debating renaming: s/item/watch/
My guess is "watch" is the name that epoll gives userspace while "item" is
the kernel representation of a "watch"...
>
> > + struct ckpt_hdr h;
> > + __s32 epfile_objref;
> > + __u32 num_items;
>
> ... also, making this a named struct (e.g. struct ckpt_hdr_eventpoll_item)
> will make it friendlier to a 'chunks' implementation as mentioned above.
Yup.
>
> > + struct {
> > + __u64 data;
> > + __u32 fd;
> > + __s32 file_objref;
> > + __u32 events;
> > + } items[0];
> > +} __attribute__((aligned(8)));
>
> So:
>
> struct ckpt_hdr_eventpoll_item {
^^^^^
I strongly dislike this name change idea. There's no header for each item and
it's obviously part of the checkpoint image. If anything, too many
things have "_hdr_" in their names when they lack a ckpt_hdr or don't
need one: ckpt_hdr_const, ckpt_hdr_pids, ckpt_hdr_socket, ckpt_hdr_sigset,
ckpt_hdr_sigaction, ckpt_hdr_siginfo, ckpt_hdr_rlimit, ckpt_hdr_ipc_perms*)
(*We shouldn't need two adjacent headers for ckpt_hdr_ipc_{shm,msg,sem}.)
> __u64 data;
> __u32 fd;
> __u32 events;
> };
>
> struct ckpt_hdr_eventpoll {
> struct ckpt_hdr h;
> __u32 epfile_fd;
Can't see a good way to use/get the fd.
> __u32 num_items;
> struct ckpt_hdr_eventpoll_item items[0];
> };
>
> > +
> > struct ckpt_hdr_utsns {
> > struct ckpt_hdr h;
> > char sysname[__NEW_UTS_LEN + 1];
> > diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> > index e98251b..51cdd0c 100644
> > --- a/include/linux/checkpoint_types.h
> > +++ b/include/linux/checkpoint_types.h
> > @@ -48,6 +48,8 @@ struct ckpt_ctx {
> >
> > struct ckpt_obj_hash *obj_hash; /* repository for shared objects */
> > struct deferqueue_head *deferqueue; /* queue of deferred work */
> > + struct deferqueue_head *files_deferq; /* deferred work to do after
> > + saving file table */
> >
> > struct path fs_mnt; /* container root (FIXME) */
> >
> > diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
> > index f6856a5..ff3de38 100644
> > --- a/include/linux/eventpoll.h
> > +++ b/include/linux/eventpoll.h
> > @@ -95,11 +95,23 @@ static inline void eventpoll_release(struct file *file)
> > eventpoll_release_file(file);
> > }
> >
> > +#ifdef CONFIG_CHECKPOINT
> > +#include <linux/checkpoint_hdr.h>
>
> To avoid dependency on checkpoint_hdr.h, you can instead:
>
> struct ckpt_ctx;
> struct ckpt_hdr_file;
OK
>
> > +extern struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > + struct ckpt_hdr_file *h);
> > +#endif
> > #else
> >
> > static inline void eventpoll_init_file(struct file *file) {}
> > static inline void eventpoll_release(struct file *file) {}
> > -
> > +#ifdef CONFIG_CHECKPOINT
> > +#include <linux/checkpoint_hdr.h>
>
> Here too.
I moved it to a common spot rather than declare them twice.
>
> > +static inline struct file* ep_file_restore(struct ckpt_ctx *ctx,
> > + struct ckpt_hdr_file *ptr)
> > +{
> > + return NULL;
>
> Hmmm... the caller expects either a valid file pointer or an
> error pointer:
>
> return ERR_PTR(-ENOSYS);
Good catch.
>
>
> > +}
> > +#endif
> > #endif
> >
> > #endif /* #ifdef __KERNEL__ */
>
> Oren.
Thanks for the review!
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <4A9365C4.3010305-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-08-25 12:09 ` Matt Helsley
0 siblings, 0 replies; 11+ messages in thread
From: Matt Helsley @ 2009-08-25 12:09 UTC (permalink / raw)
To: Oren Laadan; +Cc: Containers
On Tue, Aug 25, 2009 at 12:17:08AM -0400, Oren Laadan wrote:
>
>
> Matt Helsley wrote:
> > On Mon, Aug 24, 2009 at 04:27:25PM -0500, Serge E. Hallyn wrote:
> >> Quoting Matt Helsley (matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> >>> Save/restore epoll items during checkpoint/restart respectively.
>
> [...]
>
> >>> +/* TODO confirm that get_current_user() has been restored */
> >> Not sure what you mean. At this point, the task's credentials
> >> are still the ones used when it called sys_restart(). We won't
> >> update them until the end of the restart operation. However the
> >> normal file restore operations should have reset the file->f_cred
> >> to the checkpointed value - including hierarchical user namespaces.
> >
> > I wanted to check to ensure that we're properly enforcing the epoll
> > watch limits for the restarted tasks. So the patch looks good because
> > it restore the "common" file pieces before the epoll items/watches.
> > See below if you're curious where/how that happens. I think I can
> > remove this TODO now.
>
> Good point. Me thinks it deserves a comment in the code ?
Easy enough. :)
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <20090825051723.GD8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
@ 2009-08-25 17:42 ` Oren Laadan
[not found] ` <4A94227C.9050101-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
0 siblings, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2009-08-25 17:42 UTC (permalink / raw)
To: Matt Helsley; +Cc: Containers
Matt Helsley wrote:
> On Thu, Aug 20, 2009 at 12:33:38PM -0400, Oren Laadan wrote:
>> Hi Matt,
>>
>> I like the approach of the patch and use of deferqueue.
>> See comments below.
>>
>> Matt Helsley wrote:
>>> Save/restore epoll items during checkpoint/restart respectively.
>>>
>>> Tests for the cr_tests suite to follow. Tests pass on i386.
>>>
>>> TODOs (search the patch for "TODO") that could probably use some
>>> comments:
>>>
[...]
>>> +
>>> +struct epoll_deferq_entry {
>>> + struct ckpt_ctx *ctx;
>>> + struct file *epfile;
>>> +};
>> I suggest that:
>>
>> 1) Add 'int fd' here,
>> 2) in ckpt_eventpoll_items use the fd (instead of file objref)
>>
>> The motivation is security: if, on restart, you rely on a file objref,
>> then malicious user can change the file objref and the restart logic
>> may end up populating some other process's event poll items.
>>
>> Instead, by using an fd, we ensure that the restarting process will
>> only modify a file that it actually owns.
>
> The fd saved with the items is not necessarily matched with the file*
> anymore. For example userspace could do:
>
> fda = open("/dev/null", O_RDONLY);
> epoll_ctl(efd, EPOLL_CTL_ADD, fda, ...);
> fdb = dup(fda);
> dup2(0, fda);
> <checkpoint here>
> epoll_ctl(efd, EPOLL_CTL_MOD, fda, ...);
>
> Saving the fd would associate stdin with the epoll item and not
> /dev/null.
>
> So we can't use the fd in place of the file reference.
You are totally right. I was confused by the fact that epoll data
does save the original fd, and assumed it was "updated" at close()
and dup() time. But not...
... which makes me wonder: is it impossible for a user to delete
the original fd from the epoll using epoll_ctl(.., EPOLL_CTL_DEL...) ?
This is related to my concern during restart: the input can provide
_any_ file objref that is valid at that time during restart, and
plug in files owned by different processes than intended.
I'm unsure to what extent this is a security concern, and what's
the right approach to solve it.
Looking at ep_cmp_ffd(), it uses both the file pointer and the fd
to find a matching entry in the RB tree; So I'd think that in that
case you need to save both the original FD and the objref of the
file, no ?
>>> +static int ep_items_restore(void *data)
>>> +{
>>> + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
>> This is really obfuscated :(
>
> Tell that to the author of deferqueue. ;) Since deferqueue does memcpy()
> there's really not a better way to pass the ctx pointer.
Next time I'll use my right to remain silent :p
Seriously, maybe we can add a wrapper for this (common ?) case of
a passing a single pointer as data. Do you think this interface
may be usedful:
deferqueue_add_ptr(head, ptr, func, dtor)
{
return deferqueue(head, &ptr, sizeof(ptr), func, dtor);
}
and
void *deferqueue_data_ptr(void *data)
{
return *((void **) data);
}
[...]
>>> + struct ckpt_eventpoll_items *h;
>>> + struct eventpoll *ep;
>>> + struct file *epfile = NULL;
>>> + int ret, i = 0, remaining_watches;
>>> +
>>> + /*
>>> + * TODO possible kmalloc failure due to too many watches.
>>> + */
>>> + h = ckpt_read_obj(ctx, 0,
>>> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
>> Any reason to perfer ckpt_read_obj() over ckpt_read_buf_type() ?
>> (it will rid the check for type/len below).
>
> So I could read it all at once rather than fiddle with reading the
> header and chunking all of the input. I wanted to get something simple
> working first and then make it work with thousands of watches.
Hmm.. ckpt_read_buf_type() calls ckpt_read_obj() like you do, then
checks that the type matches. So you are left with the h->h.len sanity
check only.
[...]
>>> + if (!is_file_epoll(epfile))
>>> + goto out;
>>> +
>>> + ep = epfile->private_data;
>>> +
>>> + ret = -ENOSPC;
>>> + remaining_watches = (max_user_watches -
>>> + atomic_read(&ep->user->epoll_watches));
>>> + if (h->num_items > remaining_watches)
>>> + goto out;
>> This is tested anyway by ep_insert() below. Besides, it's racy here
>> (user may have another restart in parallel, and the check-modify isn't
>> atomic. Unlike with ep_insert(), this check is for a bunch of fd's not
>> a single one.
>
> Yes, it's racy. However, we're restarting so we're primarily adding watches.
> It's also possible we're restarting thousands of watches only to fail
> when we add the last few. This avoids all that useless work when possible.
Is this concern worth taking the code out of its "usual" place (the
common epoll code) and making a special case here ?
>>
>> struct ckpt_hdr_eventpoll_item {
> ^^^^^
> I strongly dislike this name change idea. There's no header for each item and
> it's obviously part of the checkpoint image. If anything, too many
> things have "_hdr_" in their names when they lack a ckpt_hdr or don't
> need one: ckpt_hdr_const, ckpt_hdr_pids, ckpt_hdr_socket, ckpt_hdr_sigset,
> ckpt_hdr_sigaction, ckpt_hdr_siginfo, ckpt_hdr_rlimit, ckpt_hdr_ipc_perms*)
I have no strong opinion in either way, and what you say makes sense.
I do strongly prefer a single common convention to all "ckpt_hdr"-less
structures (and one to all "ckpt_hdr"-full ones). When the patches for
this show up, I'll pull them :)
>
> (*We shouldn't need two adjacent headers for ckpt_hdr_ipc_{shm,msg,sem}.)
Right. I'll remove the ckpt-hdr from ckpt_{hdr_}ipc_perms.
Oren.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.
[not found] ` <4A94227C.9050101-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-08-26 9:00 ` Matt Helsley
0 siblings, 0 replies; 11+ messages in thread
From: Matt Helsley @ 2009-08-26 9:00 UTC (permalink / raw)
To: Oren Laadan; +Cc: Containers
On Tue, Aug 25, 2009 at 01:42:20PM -0400, Oren Laadan wrote:
>
>
> Matt Helsley wrote:
> > On Thu, Aug 20, 2009 at 12:33:38PM -0400, Oren Laadan wrote:
> >> Hi Matt,
> >>
> >> I like the approach of the patch and use of deferqueue.
> >> See comments below.
> >>
> >> Matt Helsley wrote:
> >>> Save/restore epoll items during checkpoint/restart respectively.
> >>>
> >>> Tests for the cr_tests suite to follow. Tests pass on i386.
> >>>
> >>> TODOs (search the patch for "TODO") that could probably use some
> >>> comments:
> >>>
>
> [...]
>
> >>> +
> >>> +struct epoll_deferq_entry {
> >>> + struct ckpt_ctx *ctx;
> >>> + struct file *epfile;
> >>> +};
> >> I suggest that:
> >>
> >> 1) Add 'int fd' here,
> >> 2) in ckpt_eventpoll_items use the fd (instead of file objref)
> >>
> >> The motivation is security: if, on restart, you rely on a file objref,
> >> then malicious user can change the file objref and the restart logic
> >> may end up populating some other process's event poll items.
> >>
> >> Instead, by using an fd, we ensure that the restarting process will
> >> only modify a file that it actually owns.
> >
> > The fd saved with the items is not necessarily matched with the file*
> > anymore. For example userspace could do:
> >
> > fda = open("/dev/null", O_RDONLY);
> > epoll_ctl(efd, EPOLL_CTL_ADD, fda, ...);
> > fdb = dup(fda);
> > dup2(0, fda);
> > <checkpoint here>
> > epoll_ctl(efd, EPOLL_CTL_MOD, fda, ...);
> >
> > Saving the fd would associate stdin with the epoll item and not
> > /dev/null.
> >
> > So we can't use the fd in place of the file reference.
>
> You are totally right. I was confused by the fact that epoll data
> does save the original fd, and assumed it was "updated" at close()
> and dup() time. But not...
>
> ... which makes me wonder: is it impossible for a user to delete
> the original fd from the epoll using epoll_ctl(.., EPOLL_CTL_DEL...) ?
>
> This is related to my concern during restart: the input can provide
> _any_ file objref that is valid at that time during restart, and
> plug in files owned by different processes than intended.
>
> I'm unsure to what extent this is a security concern, and what's
> the right approach to solve it.
If it is a security concern (and I don't think it's any more a concern
than the other objrefs) then my hunch is that namespaces ought to be
able to help somehow, but I haven't really thought that through.
> Looking at ep_cmp_ffd(), it uses both the file pointer and the fd
> to find a matching entry in the RB tree; So I'd think that in that
> case you need to save both the original FD and the objref of the
> file, no ?
Yup. I believe that's what's in the patch I posted. Each item has the
fd, events, and data that it was registered with plus a file objref.
>
> >>> +static int ep_items_restore(void *data)
> >>> +{
> >>> + struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
> >> This is really obfuscated :(
> >
> > Tell that to the author of deferqueue. ;) Since deferqueue does memcpy()
> > there's really not a better way to pass the ctx pointer.
>
> Next time I'll use my right to remain silent :p
>
> Seriously, maybe we can add a wrapper for this (common ?) case of
> a passing a single pointer as data. Do you think this interface
> may be usedful:
>
> deferqueue_add_ptr(head, ptr, func, dtor)
> {
> return deferqueue(head, &ptr, sizeof(ptr), func, dtor);
> }
>
> and
>
> void *deferqueue_data_ptr(void *data)
> {
> return *((void **) data);
> }
>
> [...]
Seems like a great idea to me.
> >>> + struct ckpt_eventpoll_items *h;
> >>> + struct eventpoll *ep;
> >>> + struct file *epfile = NULL;
> >>> + int ret, i = 0, remaining_watches;
> >>> +
> >>> + /*
> >>> + * TODO possible kmalloc failure due to too many watches.
> >>> + */
> >>> + h = ckpt_read_obj(ctx, 0,
> >>> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
> >> Any reason to perfer ckpt_read_obj() over ckpt_read_buf_type() ?
> >> (it will rid the check for type/len below).
> >
> > So I could read it all at once rather than fiddle with reading the
> > header and chunking all of the input. I wanted to get something simple
> > working first and then make it work with thousands of watches.
>
> Hmm.. ckpt_read_buf_type() calls ckpt_read_obj() like you do, then
> checks that the type matches. So you are left with the h->h.len sanity
> check only.
>
> [...]
So I looked at how pids are handled again. There's a slight difference
in that I also record the epfile objref and number of "items", so I
found I couldn't use similar code. I'm still working on this part of the
patch but I've already incorporated your suggestions for the next post.
> >>> + if (!is_file_epoll(epfile))
> >>> + goto out;
> >>> +
> >>> + ep = epfile->private_data;
> >>> +
> >>> + ret = -ENOSPC;
> >>> + remaining_watches = (max_user_watches -
> >>> + atomic_read(&ep->user->epoll_watches));
> >>> + if (h->num_items > remaining_watches)
> >>> + goto out;
> >> This is tested anyway by ep_insert() below. Besides, it's racy here
> >> (user may have another restart in parallel, and the check-modify isn't
> >> atomic. Unlike with ep_insert(), this check is for a bunch of fd's not
> >> a single one.
> >
> > Yes, it's racy. However, we're restarting so we're primarily adding watches.
> > It's also possible we're restarting thousands of watches only to fail
> > when we add the last few. This avoids all that useless work when possible.
>
> Is this concern worth taking the code out of its "usual" place (the
> common epoll code) and making a special case here ?
I don't think it's worth that.
Incidentally, the next version will factor out and re-use half of epoll_ctl().
The main benefit will be maintenance since it saves only about 5 lines.
> >>
> >> struct ckpt_hdr_eventpoll_item {
> > ^^^^^
> > I strongly dislike this name change idea. There's no header for each item and
> > it's obviously part of the checkpoint image. If anything, too many
> > things have "_hdr_" in their names when they lack a ckpt_hdr or don't
> > need one: ckpt_hdr_const, ckpt_hdr_pids, ckpt_hdr_socket, ckpt_hdr_sigset,
> > ckpt_hdr_sigaction, ckpt_hdr_siginfo, ckpt_hdr_rlimit, ckpt_hdr_ipc_perms*)
>
> I have no strong opinion in either way, and what you say makes sense.
> I do strongly prefer a single common convention to all "ckpt_hdr"-less
> structures (and one to all "ckpt_hdr"-full ones). When the patches for
> this show up, I'll pull them :)
OK, I think this might deserve a code comment in the patches:
/*
* All struct ckpt_hdr_* structs have a struct ckpt_hdr (usually as
* their first member).
*/
>
> >
> > (*We shouldn't need two adjacent headers for ckpt_hdr_ipc_{shm,msg,sem}.)
>
> Right. I'll remove the ckpt-hdr from ckpt_{hdr_}ipc_perms.
Great, thanks.
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2009-08-26 9:00 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-08-20 5:17 [PATCH 1/2] Check for valid destructor pointer before calling it Matt Helsley
[not found] ` <6d896e12f316e5ff5f44bd13ae1482dab7f9253a.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-20 5:17 ` [PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files Matt Helsley
[not found] ` <e820d207a3b1a3228ded29bce9b29f2179ab598b.1250745409.git.matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-20 16:33 ` Oren Laadan
[not found] ` <4A8D7AE2.1030705-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-08-25 5:17 ` Matt Helsley
[not found] ` <20090825051723.GD8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-08-25 17:42 ` Oren Laadan
[not found] ` <4A94227C.9050101-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-08-26 9:00 ` Matt Helsley
2009-08-24 21:27 ` Serge E. Hallyn
[not found] ` <20090824212725.GA25030-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-08-25 2:01 ` Matt Helsley
[not found] ` <20090825020146.GC8078-52DBMbEzqgQ/wnmkkaCWp/UQ3DHhIser@public.gmane.org>
2009-08-25 4:17 ` Oren Laadan
[not found] ` <4A9365C4.3010305-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-08-25 12:09 ` Matt Helsley
2009-08-20 12:28 ` [PATCH 1/2] Check for valid destructor pointer before calling it Oren Laadan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.