* [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-06 12:26 ` Josef Bacik
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
` (19 subsequent siblings)
20 siblings, 1 reply; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
These three are used in a subsequent patch to allow the kernel c/r
code to call vfs_read/write() to read and write data to and from the
checkpoint image.
This patch makes the following changes:
1) Move kernel_write() from fs/splice.c to fs/exec.c to be near
kernel_read()
2) Make kernel_read/write() iterate if they face partial reads or
writes, and retry if they face -EAGAIN.
3) Adjust prototypes of kernel_read/write() to use size_t and ssize_t
4) Move file_pos_read/write() to include/linux/fs.h
Changelog [ckpt-v21]
- Introduce kernel_write(), fix kernel_read()
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/exec.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++----
fs/read_write.c | 10 -------
fs/splice.c | 17 +------------
include/linux/fs.h | 13 +++++++++-
4 files changed, 77 insertions(+), 32 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa1..7bacb6a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -693,23 +693,82 @@ exit:
}
EXPORT_SYMBOL(open_exec);
-int kernel_read(struct file *file, loff_t offset,
- char *addr, unsigned long count)
+static ssize_t _kernel_read(struct file *file, loff_t offset,
+ char __user *ubuf, size_t count)
{
- mm_segment_t old_fs;
+ ssize_t nread;
+ size_t nleft;
loff_t pos = offset;
- int result;
+
+ for (nleft = count; nleft; nleft -= nread) {
+ nread = vfs_read(file, ubuf, nleft, &pos);
+ if (nread <= 0) {
+ if (nread == -EAGAIN) {
+ nread = 0;
+ continue;
+ } else if (nread == 0)
+ break;
+ else
+ return nread;
+ }
+ ubuf += nread;
+ }
+ return count - nleft;
+}
+
+ssize_t kernel_read(struct file *file, loff_t offset,
+ char *addr, size_t count)
+{
+ mm_segment_t old_fs;
+ ssize_t result;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- result = vfs_read(file, (void __user *)addr, count, &pos);
+ result = _kernel_read(file, offset, (void __user *)addr, count);
set_fs(old_fs);
return result;
}
EXPORT_SYMBOL(kernel_read);
+static ssize_t _kernel_write(struct file *file, loff_t offset,
+ const char __user *ubuf, size_t count)
+{
+ ssize_t nwrite;
+ size_t nleft;
+ loff_t pos = offset;
+
+ for (nleft = count; nleft; nleft -= nwrite) {
+ nwrite = vfs_write(file, ubuf, nleft, &pos);
+ if (nwrite < 0) {
+ if (nwrite == -EAGAIN) {
+ nwrite = 0;
+ continue;
+ } else
+ return nwrite;
+ }
+ ubuf += nwrite;
+ }
+ return count - nleft;
+}
+
+ssize_t kernel_write(struct file *file, loff_t offset,
+ const char *addr, size_t count)
+{
+ mm_segment_t old_fs;
+ ssize_t result;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ result = _kernel_write(file, offset, (void __user *)addr, count);
+ set_fs(old_fs);
+ return result;
+}
+
+EXPORT_SYMBOL(kernel_write);
+
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d..67b7d83 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -361,16 +361,6 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
EXPORT_SYMBOL(vfs_write);
-static inline loff_t file_pos_read(struct file *file)
-{
- return file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
- file->f_pos = pos;
-}
-
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct file *file;
diff --git a/fs/splice.c b/fs/splice.c
index 9313b61..188e17d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -538,21 +538,6 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
return res;
}
-static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
- loff_t pos)
-{
- mm_segment_t old_fs;
- ssize_t res;
-
- old_fs = get_fs();
- set_fs(get_ds());
- /* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_write(file, (const char __user *)buf, count, &pos);
- set_fs(old_fs);
-
- return res;
-}
-
ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
@@ -1011,7 +996,7 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
return ret;
data = buf->ops->map(pipe, buf, 0);
- ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+ ret = kernel_write(sd->u.file, sd->pos, data + buf->offset, sd->len);
buf->ops->unmap(pipe, buf, data);
return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39d57bc..9e8b171 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1548,6 +1548,16 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
struct iovec *fast_pointer,
struct iovec **ret_pointer);
+static inline loff_t file_pos_read(struct file *file)
+{
+ return file->f_pos;
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+ file->f_pos = pos;
+}
+
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
@@ -2127,7 +2137,8 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
int open_flag, int mode, int acc_mode);
extern int may_open(struct path *, int, int);
-extern int kernel_read(struct file *, loff_t, char *, unsigned long);
+extern ssize_t kernel_read(struct file *, loff_t, char *, size_t);
+extern ssize_t kernel_write(struct file *, loff_t, const char *, size_t);
extern struct file * open_exec(const char *);
/* fs/dcache.c -- generic fs support functions */
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 020/100] c/r: documentation
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
2010-05-01 14:15 ` [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write() Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-06 20:27 ` Randy Dunlap
2010-05-01 14:15 ` [PATCH v21 022/100] c/r: basic infrastructure for checkpoint/restart Oren Laadan
` (18 subsequent siblings)
20 siblings, 1 reply; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
Covers application checkpoint/restart, overall design, interfaces,
usage, shared objects, and checkpoint image format.
Changelog[v19-rc1]:
- Update documentation and examples for new syscalls API
- [Liu Alexander] Fix typos
- [Serge Hallyn] Update checkpoint image format
Changelog[v16]:
- Update documentation
- Unify into readme.txt and usage.txt
Changelog[v14]:
- Discard the 'h.parent' field
- New image format (shared objects appear before they are referenced
unless they are compound)
Changelog[v8]:
- Split into multiple files in Documentation/checkpoint/...
- Extend documentation, fix typos and comments from feedback
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
Documentation/checkpoint/checkpoint.c | 38 +++
Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
Documentation/checkpoint/self_checkpoint.c | 69 +++++
Documentation/checkpoint/self_restart.c | 40 +++
Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
5 files changed, 764 insertions(+), 0 deletions(-)
create mode 100644 Documentation/checkpoint/checkpoint.c
create mode 100644 Documentation/checkpoint/readme.txt
create mode 100644 Documentation/checkpoint/self_checkpoint.c
create mode 100644 Documentation/checkpoint/self_restart.c
create mode 100644 Documentation/checkpoint/usage.txt
diff --git a/Documentation/checkpoint/checkpoint.c b/Documentation/checkpoint/checkpoint.c
new file mode 100644
index 0000000..8560f30
--- /dev/null
+++ b/Documentation/checkpoint/checkpoint.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_checkpoint, pid, fd, flags);
+}
+
+int main(int argc, char *argv[])
+{
+ pid_t pid;
+ int ret;
+
+ if (argc != 2) {
+ printf("usage: ckpt PID\n");
+ exit(1);
+ }
+
+ pid = atoi(argv[1]);
+ if (pid <= 0) {
+ printf("invalid pid\n");
+ exit(1);
+ }
+
+ ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+
+ if (ret < 0)
+ perror("checkpoint");
+ else
+ printf("checkpoint id %d\n", ret);
+
+ return (ret > 0 ? 0 : 1);
+}
diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
new file mode 100644
index 0000000..4fa5560
--- /dev/null
+++ b/Documentation/checkpoint/readme.txt
@@ -0,0 +1,370 @@
+
+ Checkpoint-Restart support in the Linux kernel
+ ==========================================================
+
+Copyright (C) 2008-2010 Oren Laadan
+
+Author: Oren Laadan <orenl@cs.columbia.edu>
+
+License: The GNU Free Documentation License, Version 1.2
+ (dual licensed under the GPL v2)
+
+Contributors: Oren Laadan <orenl@cs.columbia.edu>
+ Serge Hallyn <serue@us.ibm.com>
+ Dan Smith <danms@us.ibm.com>
+ Matt Helsley <matthltc@us.ibm.com>
+ Nathan Lynch <ntl@pobox.com>
+ Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
+ Dave Hansen <dave@linux.vnet.ibm.com>
+
+
+Introduction
+============
+
+Application checkpoint/restart [C/R] is the ability to save the state
+of a running application so that it can later resume its execution
+from the time at which it was checkpointed. An application can be
+migrated by checkpointing it on one machine and restarting it on
+another. C/R can provide many potential benefits:
+
+* Failure recovery: by rolling back to a previous checkpoint
+
+* Improved response time: by restarting applications from checkpoints
+ instead of from scratch.
+
+* Improved system utilization: by suspending long running CPU
+ intensive jobs and resuming them when load decreases.
+
+* Fault resilience: by migrating applications off faulty hosts.
+
+* Dynamic load balancing: by migrating applications to less loaded
+ hosts.
+
+* Improved service availability and administration: by migrating
+ applications before host maintenance so that they continue to run
+ with minimal downtime
+
+* Time-travel: by taking periodic checkpoints and restarting from
+ any previous checkpoint.
+
+Compared to hypervisor approaches, application C/R is more lightweight
+since it need only save the state associated with applications, while
+operating system data structures (e.g. buffer cache, drivers state
+and the like) are uninteresting.
+
+
+Overall design
+==============
+
+Checkpoint and restart are done in the kernel as much as possible.
+Two new system calls are introduced to provide C/R: sys_checkpoint()
+and sys_restart(). They both operate on a process tree (hierarchy),
+either a whole container or a subtree of a container.
+
+Checkpointing entire containers ensures that there are no dependencies
+on anything outside the container, which guarantees that a matching
+restart will succeed (assuming that the file system state remains
+consistent). However, it requires that users will always run the tasks
+that they wish to checkpoint inside containers. This is ideal for,
+e.g., private virtual servers and the like.
+
+In contrast, when checkpointing a subtree of a container it is up to
+the user to ensure that dependencies either don't exist or can be
+safely ignored. This is useful, for instance, for HPC scenarios or
+even a user that would like to periodically checkpoint a long-running
+batch job.
+
+An additional system call, a la madvise(), is planned, so that tasks
+can advise the kernel how to handle specific resources. For instance,
+a task could ask to skip a memory area at checkpoint to save space,
+or to use a preset file descriptor at restart instead of restoring it
+from the checkpoint image. It will provide the flexibility that is
+particularly useful to address the needs of a diverse crowd of users
+and use-cases.
+
+Syscall sys_checkpoint() is given a pid that indicates the top of the
+hierarchy, a file descriptor to store the image, and flags. The code
+serializes internal user- and kernel-state and writes it out to the
+file descriptor. The resulting image is stream-able. The processes are
+expected to be frozen for the duration of the checkpoint.
+
+In general, a checkpoint consists of 5 steps:
+1. Pre-dump
+2. Freeze the container/subtree
+3. Save tasks' and kernel state <-- sys_checkpoint()
+4. Thaw (or kill) the container/subtree
+5. Post-dump
+
+Step 3 is done by calling sys_checkpoint(). Steps 1 and 5 are an
+optimization to reduce application downtime. In particular, "pre-dump"
+works before freezing the container, e.g. the pre-copy for live
+migration, and "post-dump" works after the container resumes
+execution, e.g. write-back the data to secondary storage.
+
+The kernel exports a relatively opaque 'blob' of data to userspace
+which can then be handed to the new kernel at restart time. The
+'blob' contains data and state of select portions of kernel structures
+such as VMAs and mm_structs, as well as copies of the actual memory
+that the tasks use. Any changes in this blob's format between kernel
+revisions can be handled by an in-userspace conversion program.
+
+To restart, userspace first creates a process hierarchy that matches
+that of the checkpoint, and each task calls sys_restart(). The syscall
+reads the saved kernel state from a file descriptor, and re-creates
+the resources that the tasks need to resume execution. The restart
+code is executed by each task that is restored in the new hierarchy to
+reconstruct its own state.
+
+In general, a restart consists of 3 steps:
+1. Create hierarchy
+2. Restore tasks' and kernel state <-- sys_restart()
+3. Resume userspace (or freeze tasks)
+
+Because the process hierarchy is created in userspace during restart,
+the restarting tasks have the flexibility to prepare before calling
+sys_restart().
+
+
+Checkpoint image format
+=======================
+
+The checkpoint image format is built of records that consist of a
+pre-header identifying its contents, followed by a payload. This
+format allows userspace tools to easily parse and skip through the
+image without requiring intimate knowledge of the data. It will also
+be handy to enable parallel checkpointing in the future where multiple
+threads interleave data from multiple processes into a single stream.
+
+The pre-header is defined by 'struct ckpt_hdr' as follows: @type
+identifies the type of the payload, @len tells its length in bytes
+including the pre-header.
+
+struct ckpt_hdr {
+ __s32 type;
+ __s32 len;
+};
+
+The pre-header must be the first component in all other headers. For
+instance, the task data is saved in 'struct ckpt_hdr_task', which
+looks something like this:
+
+struct ckpt_hdr_task {
+ struct ckpt_hdr h;
+ __u32 pid;
+ ...
+};
+
+THE IMAGE FORMAT IS EXPECTED TO CHANGE over time as more features are
+supported, or as existing features change in the kernel and require
+their representation to be adjusted. Any such changes will be handled by
+in-userspace conversion tools.
+
+The general format of the checkpoint image is as follows:
+* Image header
+* Container configuration
+* Task hierarchy
+* Tasks' state
+* Image trailer
+
+The image always begins with a general header that holds a magic
+number, an architecture identifier (little endian format), a format
+version number (@rev), followed by information about the kernel
+(currently version and UTS data). It also holds the time of the
+checkpoint and the flags given to sys_checkpoint(). This header is
+followed by an arch-specific header.
+
+The container configuration section contains information that is
+global to the container. Security (LSM) configuration is one example.
+Network configuration and container-wide mounts may also go here, so
+that the userspace restart coordinator can re-create a suitable
+environment.
+
+The task hierarchy comes next so that userspace tools can read it
+early (even from a stream) and re-create the restarting tasks. This is
+basically an array of all checkpointed tasks, and their relationships
+(parent, siblings, threads, etc).
+
+Then the state of all tasks is saved, in the order that they appear in
+the tasks array above. For each state, we save data like task_struct,
+namespaces, open files, memory layout, memory contents, cpu state,
+signals and signal handlers, etc. For resources that are shared among
+multiple processes, we first checkpoint said resource (and only once),
+and in the task data we give a reference to it. More about shared
+resources below.
+
+Finally, the image always ends with a trailer that holds a (different)
+magic number, serving for sanity check.
+
+
+Shared objects
+==============
+
+Many resources may be shared by multiple tasks (e.g. file descriptors,
+memory address space, etc), or even have multiple references from
+other resources (e.g. a single inode that represents two ends of a
+pipe).
+
+Shared objects are tracked using a hash table (objhash) to ensure that
+they are only checkpointed or restored once. To handle a shared
+object, it is first looked up in the hash table, to determine if it is
+the first encounter or a recurring appearance. The hash table itself
+is not saved as part of the checkpoint image: it is constructed
+dynamically during both checkpoint and restart, and discarded at the
+end of the operation.
+
+During checkpoint, when a shared object is encountered for the first
+time, it is inserted to the hash table, indexed by its kernel address.
+It is assigned an identifier (@objref) in order of appearance, and
+then its state is saved. Subsequent lookups of that object in the hash
+will yield that entry, in which case only the @objref is saved, as
+opposed to the entire state of the object.
+
+During restart, shared objects are indexed by their @objref as given
+during the checkpoint. On the first appearance of each shared object,
+a new resource will be created and its state restored from the image.
+Then the object is added to the hash table. Subsequent lookups of the
+same unique identifier in the hash table will yield that entry, and
+then the existing object instance is reused instead of creating
+a new one.
+
+The hash grabs a reference to each object that is inserted, and
+maintains this reference for the entire lifetime of the hash. Thus,
+it is always safe to reference an object that is stored in the hash.
+The hash is "one-way" in the sense that objects that are added are
+never deleted from the hash until the hash is discarded. This, in
+turn, happens only when the checkpoint (or restart) terminates.
+
+Shared objects are thus saved when they are first seen, and _before_
+the parent object that uses them. Therefore by the time the parent
+object needs them, they should already be in the objhash. The one
+exception is when more than a single shared resource will be restarted
+at once (e.g. like the two ends of a pipe, or all the namespaces in an
+nsproxy). In this case the parent object is dumped first followed by
+the individual sub-resources.
+
+The checkpoint image is stream-able, meaning that restarting from it
+may not require lseek(). This is enforced at checkpoint time, by
+carefully selecting the order of shared objects, to respect the rule
+that an object is always saved before the objects that refer to it.
+
+
+Memory contents format
+======================
+
+The memory contents of a given memory address space (->mm) are dumped
+as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
+This header details the vma properties, and a reference to a file
+(if file backed) or an inode (or shared memory) object.
+
+The vma header is followed by the actual contents - but only those
+pages that need to be saved, i.e. dirty pages. They are written in
+chunks of data, where each chunk contains a header that indicates
+the number of pages in the chunk, followed by an array of virtual
+addresses and then an array of actual page contents. The last chunk
+holds zero pages.
+
+To illustrate this, consider a single simple task with two vmas: one
+is file mapped with two dumped pages, and the other is anonymous with
+three dumped pages. The memory dump will look like this:
+
+ ckpt_hdr + ckpt_hdr_vma
+ ckpt_hdr_pgarr (nr_pages = 2)
+ addr1, addr2
+ page1, page2
+ ckpt_hdr_pgarr (nr_pages = 0)
+ ckpt_hdr + ckpt_hdr_vma
+ ckpt_hdr_pgarr (nr_pages = 3)
+ addr3, addr4, addr5
+ page3, page4, page5
+ ckpt_hdr_pgarr (nr_pages = 0)
+
+
+Error handling
+==============
+
+Both checkpoint and restart operations may fail due to a variety of
+reasons. Using a simple, single return value from the system call is
+insufficient to report the reason of a failure.
+
+Instead, both sys_checkpoint() and sys_restart() accept an additional
+argument - a file descriptor to which the kernel writes diagnostic
+and debugging information. Both the checkpoint and restart userspace
+utilities have options to specify a filename to store this log.
+
+In addition, checkpoint provides informative status report upon
+failure in the checkpoint image in the form of (one or more) error
+objects, 'struct ckpt_hdr_err'. An error object consists of a
+mandatory pre-header followed by a null character ('\0'), and then a
+string that describes the error. By default, if an error occurs, this
+will be the last object written to the checkpoint image.
+
+Upon failure, the caller can examine the image (e.g. with 'ckptinfo')
+and extract the detailed error message. The leading '\0' is useful if
+one wants to seek back from the end of the checkpoint image, instead
+of parsing the entire image separately.
+
+
+Security
+========
+
+The main question is whether sys_checkpoint() and sys_restart()
+require privileged or unprivileged operation.
+
+Early versions checked capable(CAP_SYS_ADMIN) assuming that we would
+attempt to remove the need for privilege, so that all users could
+safely use it. Arnd Bergmann pointed out that it'd make more sense to
+let unprivileged users use them now, so that we'll be more careful
+about the security as patches roll in.
+
+Checkpoint: the main concern is whether a task that performs the
+checkpoint of another task has sufficient privileges to access its
+state. We address this by requiring that the checkpointer task will be
+able to ptrace the target task, by means of ptrace_may_access() with
+access mode.
+
+Restart: the main concern is that we may allow an unprivileged user to
+feed the kernel with random data. To this end, the restart works in a
+way that does not skip the usual security checks. Task credentials,
+i.e. euid, reuid, and LSM security contexts currently come from the
+caller, not the checkpoint image. As credentials are restored too,
+the ability of a task that calls sys_restart() to setresuid/setresgid
+to those values must be checked.
+
+Keeping the restart procedure to operate within the limits of the
+caller's credentials means that there are various scenarios that cannot
+be supported. For instance, a setuid program that opened a protected
+log file and then dropped privileges will fail the restart, because
+the user won't have enough credentials to reopen the file. In these
+cases, we should probably treat restarting like inserting a kernel
+module: surely the user can cause havoc by providing incorrect data,
+but then again we must trust the root account.
+
+So that's why we don't want CAP_SYS_ADMIN required up-front. That way
+we will be forced to more carefully review each of those features.
+However, this can be controlled with a sysctl-variable.
+
+
+Kernel interfaces
+=================
+
+* To checkpoint a vma, the 'struct vm_operations_struct' needs to
+ provide a method ->checkpoint:
+ int checkpoint(struct ckpt_ctx *, struct vm_area_struct *)
+ Restart requires a matching (exported) restore:
+ int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
+
+* To checkpoint a file, the 'struct file_operations' needs to provide
+ the methods ->checkpoint and ->collect:
+ int checkpoint(struct ckpt_ctx *, struct file *)
+ int collect(struct ckpt_ctx *, struct file *)
+ Restart requires a matching (exported) restore:
+ int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
+ For most file systems, generic_file_{checkpoint,restore}() can be
+ used.
+
+* To checkpoint a socket, the 'struct proto_ops' needs to provide
+ the methods ->checkpoint, ->collect and ->restore:
+ int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+ int collect(struct ckpt_ctx *ctx, struct socket *sock);
+ int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
+
diff --git a/Documentation/checkpoint/self_checkpoint.c b/Documentation/checkpoint/self_checkpoint.c
new file mode 100644
index 0000000..27dba0d
--- /dev/null
+++ b/Documentation/checkpoint/self_checkpoint.c
@@ -0,0 +1,69 @@
+/*
+ * self_checkpoint.c: demonstrate self-checkpoint
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_checkpoint, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+#define OUTFILE "/tmp/cr-self.out"
+
+int main(int argc, char *argv[])
+{
+ pid_t pid = getpid();
+ FILE *file;
+ int i, ret;
+
+ close(0);
+ close(2);
+
+ unlink(OUTFILE);
+ file = fopen(OUTFILE, "w+");
+ if (!file) {
+ perror("open");
+ exit(1);
+ }
+ if (dup2(0, 2) < 0) {
+ perror("dup2");
+ exit(1);
+ }
+
+ fprintf(file, "hello, world!\n");
+ fflush(file);
+
+ for (i = 0; i < 1000; i++) {
+ sleep(1);
+ fprintf(file, "count %d\n", i);
+ fflush(file);
+
+ if (i != 2)
+ continue;
+ ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+ if (ret < 0) {
+ fprintf(file, "ckpt: %s\n", strerror(errno));
+ exit(2);
+ }
+
+ fprintf(file, "checkpoint ret: %d\n", ret);
+ fflush(file);
+ }
+
+ return 0;
+}
diff --git a/Documentation/checkpoint/self_restart.c b/Documentation/checkpoint/self_restart.c
new file mode 100644
index 0000000..647ce51
--- /dev/null
+++ b/Documentation/checkpoint/self_restart.c
@@ -0,0 +1,40 @@
+/*
+ * self_restart.c: demonstrate self-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define _GNU_SOURCE /* or _BSD_SOURCE or _SVID_SOURCE */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int restart(pid_t pid, int fd, unsigned long flags)
+{
+ return syscall(__NR_restart, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+int main(int argc, char *argv[])
+{
+ pid_t pid = getpid();
+ int ret;
+
+ ret = restart(pid, STDIN_FILENO, RESTART_TASKSELF);
+ if (ret < 0)
+ perror("restart");
+
+ printf("should not reach here !\n");
+
+ return 0;
+}
diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
new file mode 100644
index 0000000..c6fc045
--- /dev/null
+++ b/Documentation/checkpoint/usage.txt
@@ -0,0 +1,247 @@
+
+ How to use Checkpoint-Restart
+ =========================================
+
+
+API
+===
+
+The API consists of three new system calls:
+
+* long checkpoint(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Checkpoint a (sub-)container whose root task is identified by @pid,
+ to the open file indicated by @fd. If @logfd isn't -1, it indicates
+ an open file to which error and debug messages are written. @flags
+ may be one or more of:
+ - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
+ (other values are not allowed).
+
+ Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
+ it returns from a restart, and -1 if an error occurs. The ckptid will
+ uniquely identify a checkpoint image, for as long as the checkpoint
+ is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
+ partial checkpoint, residing in kernel memory).
+
+* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Restart a process hierarchy from a checkpoint image that is read from
+ the blob stored in the file indicated by @fd. If @logfd isn't -1, it
+ indicates an open file to which error and debug messages are written.
+ @flags will have future meaning (must be 0 for now). @pid indicates
+ the root of the hierarchy as seen in the coordinator's pid-namespace,
+ and is expected to be a child of the coordinator. @flags may be one
+ or more of:
+ - RESTART_TASKSELF : (self) restart of a single process
+ - RESTART_FROZEN : processes remain frozen once restart completes
+ - RESTART_GHOST : process is a ghost (placeholder for a pid)
+ (Note that this argument may mean 'ckptid' to identify an in-kernel
+ checkpoint image, with some @flags in the future).
+
+ Returns: -1 if an error occurs, 0 on success when restarting from a
+ "self" checkpoint, and return value of system call at the time of the
+ checkpoint when restarting from an "external" checkpoint.
+
+ (If a process was frozen for checkpoint while in userspace, it will
+ resume running in userspace exactly where it was interrupted. If it
+ was frozen while in kernel doing a syscall, it will return what the
+ syscall returned when interrupted/completed, and proceed from there
+ as if it had only been frozen and then thawed. Finally, if it did a
+ self-checkpoint, it will resume to the first instruction after the
+ call to checkpoint(2), having returned 0, to indicate whether the
+ return is from the checkpoint or a restart).
+
+* int clone_with_pids(unsigned long clone_flags, void *news,
+ int *parent_tidptr, int *child_tidptr,
+ struct target_pid_set *pid_set)
+
+ struct target_pid_set {
+ int num_pids;
+ pid_t *target_pids;
+ }
+
+ Container restart requires that a task have the same pid it had when
+ it was checkpointed. When containers are nested the tasks within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ clone_with_pids(), intended for use during restart, is similar to
+ clone(), except that it takes a 'target_pid_set' parameter. This
+ parameter lets caller choose specific pid numbers for the child
+ process, in the process's active and ancestor pid namespaces.
+
+ Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for
+ now, to prevent unprivileged processes from misusing this interface.
+
+ If a target-pid is 0, the kernel continues to assign a pid for the
+ process in that namespace. If a requested pid is taken, the system
+ call fails with -EBUSY. If 'pid_set.num_pids' exceeds the current
+ nesting level of pid namespaces, the system call fails with -EINVAL.
+
+
+Sysctl/proc
+===========
+
+/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
+ controls whether c/r operation is allowed for unprivileged users
+
+
+Operation
+=========
+
+The granularity of a checkpoint usually is a process hierarchy. The
+'pid' argument is interpreted in the caller's pid namespace. So to
+checkpoint a container whose init task (pid 1 in that pidns) appears
+as pid 3497 in the caller's pidns, the caller must use pid 3497. Passing
+pid 1 will attempt to checkpoint the caller's container, and if the
+caller isn't privileged and init is owned by root, it will fail.
+
+Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
+which does not refer to a container's init task, then sys_checkpoint()
+would return -EINVAL.
+
+We assume that during checkpoint and restart the container state is
+quiescent. During checkpoint, this means that all affected tasks are
+frozen (or otherwise stopped). During restart, this means that all
+affected tasks are executing the sys_restart() call. In both cases, if
+there are other tasks possibly sharing state with the container, they
+must not modify it during the operation. It is the responsibility of
+the caller to follow this requirement.
+
+If the assumption that all tasks are frozen and that there is no other
+sharing doesn't hold - then the results of the operation are undefined
+(just as, e.g. not calling execve() immediately after vfork() produces
+undefined results). In particular, either checkpoint will fail, or it
+may produce a checkpoint image that can't be restarted, or (unlikely)
+the restart may produce a container whose state does not match that of
+the original container.
+
+
+User tools
+==========
+
+* checkpoint(1): a tool to perform a checkpoint of a container/subtree
+* restart(1): a tool to restart a container/subtree
+* ckptinfo: a tool to examine a checkpoint image
+
+It is best to use the dedicated user tools for checkpoint and restart.
+
+If you insist, then here is a code snippet that illustrates how a
+checkpoint is initiated by a process inside a container - the logic is
+similar to fork():
+ ...
+ ckptid = checkpoint(0, ...);
+ switch (ckptid) {
+ case -1:
+ perror("checkpoint failed");
+ break;
+ default:
+ fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ckptid);
+ /* proceed with execution after checkpoint */
+ ...
+ break;
+ case 0:
+ fprintf(stderr, "returned after restart\n");
+ /* proceed with action required following a restart */
+ ...
+ break;
+ }
+ ...
+
+And to initiate a restart, the process in an empty container can use
+logic similar to execve():
+ ...
+ if (restart(pid, ...) < 0)
+ perror("restart failed");
+ /* only get here if restart failed */
+ ...
+
+Note, that the code also supports "self" checkpoint, where a process
+can checkpoint itself. This mode does not capture the relationships of
+the task with other tasks, or any shared resources. It is useful for
+applications that wish to be able to save and restore their state.
+They will either not use (or care about) shared resources, or they
+will be aware of the operations and adapt suitably after a restart.
+The code above can also be used for "self" checkpoint.
+
+
+You may find the following sample programs useful:
+
+* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
+* self_checkpoint.c: a simple test program doing self-checkpoint
+* self_restart.c: restarts a (self-) checkpoint image from stdin
+
+See also the utilities 'checkpoint' and 'restart' (from user-cr).
+
+
+"External" checkpoint
+=====================
+
+To do "external" checkpoint, you need to first freeze that other task
+using the freezer cgroup.
+
+Restart does not preserve the original PID yet, (because we haven't
+solved yet the fork-with-specific-pid issue). In a real scenario, you
+probably want to first create a new namespace, and have the init
+task there call 'sys_restart()'.
+
+I tested it this way:
+ $ ./test &
+ [1] 3493
+
+ $ echo 3493 > /cgroup/0/tasks
+ $ echo FROZEN > /cgroup/0/freezer.state
+ $ ./checkpoint 3493 > ckpt.image
+
+ $ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+ $ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+ $ echo THAWED > /cgroup/0/freezer.state
+
+ $ ./self_restart < ckpt.image
+Now compare the output of the two output files.
+
+
+"Self" checkpoint
+================
+
+To do self-checkpoint, you can incorporate the code from
+self_checkpoint.c into your application.
+
+Here is how to test the self-checkpoint:
+ $ ./self_checkpoint > self.image &
+ [1] 3512
+
+ $ sleep 3
+ $ mv /tmp/cr-self.out /tmp/cr-self.out.orig
+ $ cp /tmp/cr-self.out.orig /tmp/cr-self.out
+
+ $ cat /tmp/cr-self.out
+ hello, world!
+ count 0
+ count 1
+ count 2
+ checkpoint ret: 1
+ count 3
+ ...
+
+ $ sed -i 's/count/xxxxx/g' /tmp/cr-self.out
+
+ $ ./self_restart < self.image &
+
+Now compare the output of the two output files.
+ $ cat /tmp/cr-self.out
+ hello, world!
+ xxxxx 0
+ xxxxx 1
+ xxxxx 2
+ checkpoint ret: 0
+ count 3
+ ...
+
+
+Note how in test.c we close stdin, stdout, stderr - that's because
+currently we only support regular files (not ttys/ptys).
+
+If you check the output of ps, you'll see that "self_restart" changed
+its name to "test" or "self_checkpoint", as expected.
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 022/100] c/r: basic infrastructure for checkpoint/restart
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
2010-05-01 14:15 ` [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write() Oren Laadan
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 036/100] c/r: introduce vfs_fcntl() Oren Laadan
` (17 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-mm, linux-fsdevel, netdev
Add those interfaces, as well as helpers needed to easily manage the
file format. The code is roughly broken out as follows:
kernel/checkpoint/sys.c - user/kernel data transfer, as well as setup
of the c/r context (a per-checkpoint data structure for housekeeping)
kernel/checkpoint/checkpoint.c - output wrappers and checkpoint handling
kernel/checkpoint/restart.c - input wrappers and restart handling
kernel/checkpoint/process.c - c/r of task data
For now, we can only checkpoint the 'current' task ("self" checkpoint),
and the 'pid' argument to the syscall is ignored.
Patches to add the per-architecture support as well as the actual
work to do the memory checkpoint follow in subsequent patches.
Changelog[v21]:
- Complain if checkpoint_hdr.h included without CONFIG_CHECKPOINT
- Do not include checkpoint_hdr.h explicitly
- Consolidate ckpt_read/write with kernel_read/write
- Reorganize code:move checkpoint/* to kernel/checkpoint/*
- [Christoffer Dall] Fix trivial bug in ckpt_msg macro
Changelog[v20]:
- Export key symbols to enable c/r from kernel modules
Changelog[v19]:
- [Serge Hallyn] Use ckpt_err() for bad header values
Changelog[v19-rc3]:
- sys_{checkpoint,restart} to use ptregs prototype
Changelog[v19-rc1]:
- Set ctx->errno in do_ckpt_msg() if needed
- Document prototype of ckpt_write_err in header
- Update prototype of ckpt_read_obj()
- Fix up headers so we can munge them for use by userspace
- [Matt Helsley] Check for empty string for _ckpt_write_err()
- [Matt Helsley] Add cpp definitions for enums
- [Serge Hallyn] Add global section container to image format
- [Matt Helsley] Fix total byte read/write count for large images
- ckpt_read_buf_type() to accept max payload (excludes ckpt_hdr)
- [Serge Hallyn] Define new api for error and debug logging
- Use logfd in sys_{checkpoint,restart}
Changelog[v18]:
- Detect error-headers in input data on restart, and abort.
- Standard format for checkpoint error strings (and documentation)
- [Matt Helsley] Rename headerless struct ckpt_hdr_* to struct ckpt_*
- [Dan Smith] Add an errno validation function
- Add ckpt_read_payload(): read a variable-length object (no header)
- Add ckpt_read_string(): same for strings (ensures null-terminated)
- Add ckpt_read_consume(): consumes next object without processing
Changelog[v17]:
- Fix compilation for architectures that don't support checkpoint
- Save/restore t->{set,clear}_child_tid
- Restart(2) isn't idempotent: must return -EINTR if interrupted
- ckpt_debug does not depend on DYNAMIC_DEBUG, on by default
- Export generic checkpoint headers to userspace
- Fix comment for prototype of sys_restart
- Have ckpt_debug() print global-pid and __LINE__
- Only save and test kernel constants once (in header)
Changelog[v16]:
- Split ctx->flags to ->uflags (user flags) and ->kflags (kernel flags)
- Introduce __ckpt_write_err() and ckpt_write_err() to report errors
- Allow @ptr == NULL to write (or read) header only without payload
- Introduce _ckpt_read_obj_type()
Changelog[v15]:
- Replace header buffer in ckpt_ctx (hbuf,hpos) with kmalloc/kfree()
Changelog[v14]:
- Cleanup interface to get/put hdr buffers
- Merge checkpoint and restart code into a single file (per subsystem)
- Take uts_sem around access to uts->{release,version,machine}
- Embed ckpt_hdr in all ckpt_hdr_...., cleanup read/write helpers
- Define sys_checkpoint(0,...) as asking for a self-checkpoint (Serge)
- Revert use of 'pr_fmt' to avoid tainting whom includes us (Nathan Lynch)
- Explicitly indicate length of UTS fields in header
- Discard field 'h->parent' from ckpt_hdr
Changelog[v12]:
- ckpt_kwrite/ckpt_kread() again use vfs_read(), vfs_write() (safer)
- Split ckpt_write/ckpt_read() to two parts: _ckpt_write/read() helper
- Befriend with sparse : explicit conversion to 'void __user *'
- Redefine 'pr_fmt' instead of using special ckpt_debug()
Changelog[v10]:
- add ckpt_write_buffer(), ckpt_read_buffer() and ckpt_read_buf_type()
- force end-of-string in ckpt_read_string() (fix possible DoS)
Changelog[v9]:
- ckpt_kwrite/ckpt_kread() use file->f_op->write() directly
- Drop ckpt_uwrite/ckpt_uread() since they aren't used anywhere
Changelog[v6]:
- Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
(although it's not really needed)
Changelog[v5]:
- Rename headers files s/ckpt/checkpoint/
Changelog[v2]:
- Added utsname->{release,version,machine} to checkpoint header
- Pad header structures to 64 bits to ensure compatibility
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
arch/x86/include/asm/unistd_32.h | 2 -
arch/x86/kernel/syscall_table_32.S | 2 -
include/linux/Kbuild | 3 +
include/linux/checkpoint.h | 202 ++++++++++++++++
include/linux/checkpoint_hdr.h | 135 +++++++++++
include/linux/checkpoint_types.h | 44 ++++
include/linux/magic.h | 3 +
include/linux/syscalls.h | 4 -
kernel/checkpoint/Makefile | 6 +-
kernel/checkpoint/checkpoint.c | 213 +++++++++++++++++
kernel/checkpoint/process.c | 101 ++++++++
kernel/checkpoint/restart.c | 460 +++++++++++++++++++++++++++++++++++
kernel/checkpoint/sys.c | 461 +++++++++++++++++++++++++++++++++++-
lib/Kconfig.debug | 13 +
14 files changed, 1632 insertions(+), 17 deletions(-)
create mode 100644 include/linux/checkpoint.h
create mode 100644 include/linux/checkpoint_hdr.h
create mode 100644 include/linux/checkpoint_types.h
create mode 100644 kernel/checkpoint/checkpoint.c
create mode 100644 kernel/checkpoint/process.c
create mode 100644 kernel/checkpoint/restart.c
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 007d7cd..cb67842 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -344,8 +344,6 @@
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
#define __NR_eclone 338
-#define __NR_checkpoint 339
-#define __NR_restart 340
#ifdef __KERNEL__
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 2d5a6b0..0c92570 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,5 +338,3 @@ ENTRY(sys_call_table)
.long sys_perf_event_open
.long sys_recvmmsg
.long ptregs_eclone
- .long sys_checkpoint
- .long sys_restart /* 340 */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e2ea0b2..71bb8d1 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -45,6 +45,9 @@ header-y += bsg.h
header-y += can.h
header-y += cciss_defs.h
header-y += cdk.h
+header-y += checkpoint.h
+header-y += checkpoint_hdr.h
+header-y += checkpoint_types.h
header-y += chio.h
header-y += coda_psdev.h
header-y += coff.h
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
new file mode 100644
index 0000000..4bb5b8d
--- /dev/null
+++ b/include/linux/checkpoint.h
@@ -0,0 +1,202 @@
+#ifndef _LINUX_CHECKPOINT_H_
+#define _LINUX_CHECKPOINT_H_
+/*
+ * Generic checkpoint-restart
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define CHECKPOINT_VERSION 3
+
+/* misc user visible */
+#define CHECKPOINT_FD_NONE -1
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CHECKPOINT
+
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/err.h>
+
+/* syscall helpers */
+extern long do_sys_checkpoint(pid_t pid, int fd,
+ unsigned long flags, int logfd);
+extern long do_sys_restart(pid_t pid, int fd,
+ unsigned long flags, int logfd);
+
+/* ckpt_ctx: kflags */
+#define CKPT_CTX_CHECKPOINT_BIT 0
+#define CKPT_CTX_RESTART_BIT 1
+#define CKPT_CTX_ERROR_BIT 3
+
+#define CKPT_CTX_CHECKPOINT (1 << CKPT_CTX_CHECKPOINT_BIT)
+#define CKPT_CTX_RESTART (1 << CKPT_CTX_RESTART_BIT)
+#define CKPT_CTX_ERROR (1 << CKPT_CTX_ERROR_BIT)
+
+
+extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, size_t count);
+extern int ckpt_kread(struct ckpt_ctx *ctx, void *buf, size_t count);
+
+extern void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int n);
+extern void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr);
+extern void *ckpt_hdr_get(struct ckpt_ctx *ctx, int n);
+extern void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int n, int type);
+
+extern int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h);
+extern int ckpt_write_obj_type(struct ckpt_ctx *ctx,
+ void *ptr, int len, int type);
+extern int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len);
+
+extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
+ void *ptr, int len, int type);
+extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
+extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type);
+extern int ckpt_read_payload(struct ckpt_ctx *ctx,
+ void **ptr, int max, int type);
+extern char *ckpt_read_string(struct ckpt_ctx *ctx, int max);
+extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
+
+extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
+extern long do_restart(struct ckpt_ctx *ctx, pid_t pid);
+
+/* task */
+extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_task(struct ckpt_ctx *ctx);
+
+static inline int ckpt_validate_errno(int errno)
+{ /* Range check for an errno value. */
+ return (errno >= 0) && (errno < MAX_ERRNO); /* 1 iff 0 <= errno < MAX_ERRNO, else 0 */
+}
+
+/* debugging flags */
+#define CKPT_DBASE 0x1 /* anything */
+#define CKPT_DSYS 0x2 /* generic (system) */
+#define CKPT_DRW 0x4 /* image read/write */
+
+#define CKPT_DDEFAULT 0xffff /* default debug level */
+
+#ifndef CKPT_DFLAG
+#define CKPT_DFLAG 0xffff /* everything */
+#endif
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+extern unsigned long ckpt_debug_level;
+
+/*
+ * This is deprecated
+ */
+/* use this to select a specific debug level */
+#define _ckpt_debug(level, fmt, args...) \
+ do { \
+ if (ckpt_debug_level & (level)) \
+ printk(KERN_DEBUG "[%d:%d:c/r:%s:%d] " fmt, \
+ current->pid, \
+ current->nsproxy ? \
+ task_pid_vnr(current) : -1, \
+ __func__, __LINE__, ## args); \
+ } while (0)
+
+/*
+ * CKPT_DBASE is the base flags, doesn't change
+ * CKPT_DFLAG is to be redefined in each source file
+ */
+#define ckpt_debug(fmt, args...) \
+ _ckpt_debug(CKPT_DBASE | CKPT_DFLAG, fmt, ## args)
+
+#else
+
+/*
+ * This is deprecated
+ */
+#define _ckpt_debug(level, fmt, args...) do { } while (0)
+#define ckpt_debug(fmt, args...) do { } while (0)
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
+
+/*
+ * prototypes for the new logging api
+ */
+
+extern void ckpt_msg_lock(struct ckpt_ctx *ctx);
+extern void ckpt_msg_unlock(struct ckpt_ctx *ctx);
+
+extern void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+extern void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+
+/*
+ * Append formatted msg to ctx->msg[ctx->msg_len].
+ * Must be called after expanding format.
+ * May be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...);
+
+/*
+ * Write ctx->msg to all relevant places.
+ * Must not be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_complete(struct ckpt_ctx *ctx);
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will not write the message out to the applicable files, so
+ * the caller will have to use _ckpt_msg_complete() to finish up.
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must be called with ckpt_msg_lock held.
+ */
+#define _ckpt_msg(ctx, fmt, args...) do { \
+ _do_ckpt_msg(ctx, 0, fmt, ##args); \
+} while (0)
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+#define ckpt_msg(ctx, fmt, args...) do { \
+ do_ckpt_msg(ctx, 0, fmt, ##args); \
+} while (0)
+
+/*
+ * Report an error.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @err is the error value
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+
+#define ckpt_err(ctx, err, fmt, args...) do { \
+ do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+/*
+ * Same as ckpt_err() but
+ * must be called with ctx->msg_mutex held
+ * can be called under spinlock
+ * must be followed by a call to _ckpt_msg_complete()
+ */
+#define _ckpt_err(ctx, err, fmt, args...) do { \
+ _do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#endif /* CONFIG_CHECKPOINT */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_H_ */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
new file mode 100644
index 0000000..7ccebc7
--- /dev/null
+++ b/include/linux/checkpoint_hdr.h
@@ -0,0 +1,135 @@
+#ifndef _CHECKPOINT_CKPT_HDR_H_
+#define _CHECKPOINT_CKPT_HDR_H_
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008-2010 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#ifndef __KERNEL__
+#include <sys/types.h>
+#include <linux/types.h>
+#endif
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+
+#ifndef CONFIG_CHECKPOINT
+#error linux/checkpoint_hdr.h included directly (without CONFIG_CHECKPOINT)
+#endif
+
+#endif
+
+#include <linux/utsname.h>
+
+/*
+ * To maintain compatibility between 32-bit and 64-bit architecture flavors,
+ * keep data 64-bit aligned: use padding for structure members, and use
+ * __attribute__((aligned (8))) for the entire structure.
+ *
+ * Quoting Arnd Bergmann:
+ * "This structure has an odd multiple of 32-bit members, which means
+ * that if you put it into a larger structure that also contains 64-bit
+ * members, the larger structure may get different alignment on x86-32
+ * and x86-64, which you might want to avoid. I can't tell if this is
+ * an actual problem here. ... In this case, I'm pretty sure that
+ * sizeof(ckpt_hdr_task) on x86-32 is different from x86-64, since it
+ * will be 32-bit aligned on x86-32."
+ */
+
+/*
+ * header format: 'struct ckpt_hdr' must prefix all other headers. Therefore
+ * when a header is passed around, the information about it (type, size)
+ * is readily available. Structs that include a struct ckpt_hdr are named
+ * struct ckpt_hdr_* by convention (usually the struct ckpt_hdr is the first
+ * member).
+ */
+struct ckpt_hdr {
+ __u32 type;
+ __u32 len;
+} __attribute__((aligned(8)));
+
+/* header types */
+enum {
+ CKPT_HDR_HEADER = 1,
+#define CKPT_HDR_HEADER CKPT_HDR_HEADER
+ CKPT_HDR_CONTAINER,
+#define CKPT_HDR_CONTAINER CKPT_HDR_CONTAINER
+ CKPT_HDR_BUFFER,
+#define CKPT_HDR_BUFFER CKPT_HDR_BUFFER
+ CKPT_HDR_STRING,
+#define CKPT_HDR_STRING CKPT_HDR_STRING
+
+ CKPT_HDR_TASK = 101,
+#define CKPT_HDR_TASK CKPT_HDR_TASK
+
+ CKPT_HDR_TAIL = 9001,
+#define CKPT_HDR_TAIL CKPT_HDR_TAIL
+
+ CKPT_HDR_ERROR = 9999,
+#define CKPT_HDR_ERROR CKPT_HDR_ERROR
+};
+
+/* kernel constants */
+struct ckpt_const {
+ /* task */
+ __u16 task_comm_len;
+ /* uts */
+ __u16 uts_release_len;
+ __u16 uts_version_len;
+ __u16 uts_machine_len;
+} __attribute__((aligned(8)));
+
+/* checkpoint image header: generic ckpt_hdr, magic, version fields, kernel constants, time and flags */
+struct ckpt_hdr_header {
+ struct ckpt_hdr h;
+ __u64 magic; /* presumably CHECKPOINT_MAGIC_HEAD from linux/magic.h - confirm against the writer */
+
+ __u16 _padding; /* NOTE(review): with the four fields below that is five __u16 (10 bytes) before the 8-byte-aligned 'constants', so implicit padding follows 'rev' - confirm intended */
+
+ __u16 major;
+ __u16 minor;
+ __u16 patch;
+ __u16 rev;
+
+ struct ckpt_const constants; /* lengths of the task comm and uts strings (struct ckpt_const) */
+
+ __u64 time; /* when checkpoint taken */
+ __u64 uflags; /* uflags from checkpoint */
+
+ /*
+ * the header is followed by three strings:
+ * char release[const.uts_release_len];
+ * char version[const.uts_version_len];
+ * char machine[const.uts_machine_len];
+ */
+} __attribute__((aligned(8)));
+
+/* checkpoint image trailer */
+struct ckpt_hdr_tail {
+ struct ckpt_hdr h;
+ __u64 magic;
+} __attribute__((aligned(8)));
+
+/* container configuration section header: carries only the generic ckpt_hdr */
+struct ckpt_hdr_container {
+ struct ckpt_hdr h; /* type and len of this section */
+} __attribute__((aligned(8)));
+
+/* task data */
+struct ckpt_hdr_task {
+ struct ckpt_hdr h;
+ __u32 state;
+ __u32 exit_state;
+ __u32 exit_code;
+ __u32 exit_signal;
+
+ __u64 set_child_tid;
+ __u64 clear_child_tid;
+} __attribute__((aligned(8)));
+
+#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
new file mode 100644
index 0000000..13d6dd5
--- /dev/null
+++ b/include/linux/checkpoint_types.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_CHECKPOINT_TYPES_H_
+#define _LINUX_CHECKPOINT_TYPES_H_
+/*
+ * Generic checkpoint-restart
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+
+struct ckpt_ctx {
+ int crid; /* unique checkpoint id */
+
+ pid_t root_pid; /* container identifier */
+
+ unsigned long kflags; /* kernel flags (CKPT_CTX_* in linux/checkpoint.h) */
+ unsigned long uflags; /* user flags */
+ unsigned long oflags; /* restart: uflags from checkpoint */
+
+ struct file *file; /* input/output file */
+ struct file *logfile; /* status/debug log file */
+ loff_t total; /* total read/written */
+
+ struct task_struct *tsk;/* checkpoint: current target task */
+ char err_string[256]; /* checkpoint: error string */
+
+ int errno; /* errno that caused failure */
+
+#define CKPT_MSG_LEN 1024
+ char fmt[CKPT_MSG_LEN]; /* presumably scratch space for expanding the extended format - confirm */
+ char msg[CKPT_MSG_LEN]; /* message text built by the ckpt_msg/ckpt_err helpers */
+ int msglen; /* NOTE(review): checkpoint.h comments call this ctx->msg_len - confirm naming */
+ struct mutex msg_mutex; /* must be held around _ckpt_err(), per linux/checkpoint.h */
+};
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_TYPES_H_ */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index eb9800f..e04117a 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -58,4 +58,7 @@
#define DEVPTS_SUPER_MAGIC 0x1cd1
#define SOCKFS_MAGIC 0x534F434B
+#define CHECKPOINT_MAGIC_HEAD 0x00feed0cc0a2d200LL
+#define CHECKPOINT_MAGIC_TAIL 0x002d2a0cc0deef00LL
+
#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d1d1703..057929b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -834,10 +834,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
struct timespec __user *, const sigset_t __user *,
size_t);
-asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags,
- int logfd);
-asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags,
- int logfd);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
diff --git a/kernel/checkpoint/Makefile b/kernel/checkpoint/Makefile
index 8a32c6f..99364cc 100644
--- a/kernel/checkpoint/Makefile
+++ b/kernel/checkpoint/Makefile
@@ -2,4 +2,8 @@
# Makefile for linux checkpoint/restart.
#
-obj-$(CONFIG_CHECKPOINT) += sys.o
+obj-$(CONFIG_CHECKPOINT) += \
+ sys.o \
+ checkpoint.o \
+ restart.o \
+ process.o
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
new file mode 100644
index 0000000..75b43e6
--- /dev/null
+++ b/kernel/checkpoint/checkpoint.c
@@ -0,0 +1,213 @@
+/*
+ * Checkpoint logic and helpers
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/utsname.h>
+#include <linux/magic.h>
+#include <linux/checkpoint.h>
+
+/* unique checkpoint identifier (FIXME: should be per-container ?) */
+static atomic_t ctx_count = ATOMIC_INIT(0);
+
+/**
+ * ckpt_write_obj - write an object
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ *
+ * Writes @h->len bytes starting at @h, i.e. the header together with
+ * whatever follows it in the same buffer.
+ *
+ * Returns the result of ckpt_kwrite().
+ */
+int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+ _ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+ return ckpt_kwrite(ctx, h, h->len);
+}
+EXPORT_SYMBOL(ckpt_write_obj);
+
+/**
+ * ckpt_write_obj_type - write an object (from a pointer)
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ * @type: desired type
+ *
+ * If @ptr is NULL, then write only the header (payload to follow)
+ */
+int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr *hdr = ckpt_hdr_get(ctx, sizeof(*hdr));
+	int err;
+
+	if (!hdr)
+		return -ENOMEM;
+
+	/* the recorded length covers the header plus the payload */
+	hdr->type = type;
+	hdr->len = len + sizeof(*hdr);
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", hdr->type, hdr->len);
+
+	/* header first; the payload follows only if one was supplied */
+	err = ckpt_kwrite(ctx, hdr, sizeof(*hdr));
+	if (err >= 0 && ptr)
+		err = ckpt_kwrite(ctx, ptr, len);
+
+	_ckpt_hdr_put(ctx, hdr, sizeof(*hdr));
+	return err;
+}
+EXPORT_SYMBOL(ckpt_write_obj_type);
+
+/**
+ * ckpt_write_buffer - write an object of type buffer
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ */
+int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+ /* thin wrapper: ckpt_write_obj_type() with type CKPT_HDR_BUFFER */
+ return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(ckpt_write_buffer);
+
+/**
+ * ckpt_write_string - write an object of type string
+ * @ctx: checkpoint context
+ * @str: string pointer
+ * @len: string length
+ */
+int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
+{
+ /* thin wrapper: ckpt_write_obj_type() with type CKPT_HDR_STRING;
+ * exactly @len bytes of @str are written, no terminator is added here */
+ return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
+}
+EXPORT_SYMBOL(ckpt_write_string);
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/*
+ * Record in @h the sizes of the fixed-size kernel arrays (task comm and
+ * the uts release/version/machine strings) as compiled into this kernel.
+ */
+static void fill_kernel_const(struct ckpt_const *h)
+{
+ /* never assigned or dereferenced: used only as sizeof operands */
+ struct task_struct *tsk;
+ struct new_utsname *uts;
+
+ /* task */
+ h->task_comm_len = sizeof(tsk->comm);
+ /* uts */
+ h->uts_release_len = sizeof(uts->release);
+ h->uts_version_len = sizeof(uts->version);
+ h->uts_machine_len = sizeof(uts->machine);
+}
+
+/*
+ * write the checkpoint header
+ *
+ * Emits a CKPT_HDR_HEADER object (magic, kernel version, c/r revision,
+ * user flags, timestamp, kernel constants) followed by three buffer
+ * objects holding the uts release, version and machine strings.
+ *
+ * Returns a negative error on failure, otherwise the result of the last
+ * write.
+ */
+static int checkpoint_write_header(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_header *h;
+ struct new_utsname *uts;
+ struct timeval ktv;
+ int ret;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+ if (!h)
+ return -ENOMEM;
+
+ do_gettimeofday(&ktv);
+ uts = utsname();
+
+ h->magic = CHECKPOINT_MAGIC_HEAD;
+ /* split LINUX_VERSION_CODE into its three 8-bit components */
+ h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+ h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+ h->patch = (LINUX_VERSION_CODE) & 0xff;
+
+ h->rev = CHECKPOINT_VERSION;
+
+ h->uflags = ctx->uflags;
+ h->time = ktv.tv_sec;
+
+ fill_kernel_const(&h->constants);
+
+ ret = ckpt_write_obj(ctx, &h->h);
+ /* the header is released whether or not the write succeeded */
+ ckpt_hdr_put(ctx, h);
+ if (ret < 0)
+ return ret;
+
+ /* uts_sem is held for reading while the uts strings are written out */
+ down_read(&uts_sem);
+ ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
+ if (ret < 0)
+ goto up;
+ ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
+ if (ret < 0)
+ goto up;
+ ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
+ up:
+ up_read(&uts_sem);
+ return ret;
+}
+
+/*
+ * write the container configuration section
+ *
+ * At this point the section is just a zero-filled CKPT_HDR_CONTAINER
+ * object: no fields other than the header type/len are set here.
+ */
+static int checkpoint_container(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_container *h;
+ int ret;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+ if (!h)
+ return -ENOMEM;
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+/*
+ * write the checkpoint trailer
+ *
+ * Emits a CKPT_HDR_TAIL object carrying CHECKPOINT_MAGIC_TAIL, which
+ * restore_read_tail() checks for on restart.
+ */
+static int checkpoint_write_tail(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_tail *h;
+ int ret;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+ if (!h)
+ return -ENOMEM;
+
+ h->magic = CHECKPOINT_MAGIC_TAIL;
+
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/*
+ * do_checkpoint - write a complete checkpoint image to @ctx
+ *
+ * Writes, in order: header, container section, the state of 'current',
+ * and the trailer. @pid is not used here.
+ *
+ * Returns the new checkpoint identifier (also stored in ctx->crid) on
+ * success, or the negative error of the first step that failed.
+ */
+long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long err;
+
+	err = checkpoint_write_header(ctx);
+	if (err < 0)
+		return err;
+
+	err = checkpoint_container(ctx);
+	if (err < 0)
+		return err;
+
+	err = checkpoint_task(ctx, current);
+	if (err < 0)
+		return err;
+
+	err = checkpoint_write_tail(ctx);
+	if (err < 0)
+		return err;
+
+	/* on success, return (unique) checkpoint identifier */
+	ctx->crid = atomic_inc_return(&ctx_count);
+	return ctx->crid;
+}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
new file mode 100644
index 0000000..abd9025
--- /dev/null
+++ b/kernel/checkpoint/process.c
@@ -0,0 +1,101 @@
+/*
+ * Checkpoint task structure
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/*
+ * dump the task_struct of a given task
+ *
+ * Writes a CKPT_HDR_TASK object with the task's state, exit fields and
+ * child-tid pointers, followed by a string object holding t->comm
+ * (TASK_COMM_LEN bytes).
+ */
+static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+ struct ckpt_hdr_task *h;
+ int ret;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+ if (!h)
+ return -ENOMEM;
+
+ h->state = t->state;
+ h->exit_state = t->exit_state;
+ h->exit_code = t->exit_code;
+ h->exit_signal = t->exit_signal;
+
+ /* user-space pointers are stored as plain integers in the image */
+ h->set_child_tid = (unsigned long) t->set_child_tid;
+ h->clear_child_tid = (unsigned long) t->clear_child_tid;
+
+ /* FIXME: save remaining relevant task_struct fields */
+
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
+ if (ret < 0)
+ return ret;
+
+ return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
+}
+
+/*
+ * dump the entire state of a given task
+ *
+ * ctx->tsk points at @t for the duration of the dump (it is read by the
+ * %(T) message flag) and is reset to NULL before returning.
+ */
+int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+ int ret;
+
+ ctx->tsk = t;
+
+ ret = checkpoint_task_struct(ctx, t);
+ ckpt_debug("task %d\n", ret);
+
+ ctx->tsk = NULL;
+ return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+/*
+ * read the task_struct into the current task
+ *
+ * Reads a CKPT_HDR_TASK object and the comm string that follows it.
+ * Only comm and the two child-tid pointers are applied to 'current';
+ * the state/exit fields present in the header are not restored here.
+ */
+static int restore_task_struct(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_task *h;
+ struct task_struct *t = current;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ /* comm is cleared first, then overwritten straight from the image */
+ memset(t->comm, 0, TASK_COMM_LEN);
+ ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
+ if (ret < 0)
+ goto out;
+
+ t->set_child_tid = (int __user *) (unsigned long) h->set_child_tid;
+ t->clear_child_tid = (int __user *) (unsigned long) h->clear_child_tid;
+
+ /* FIXME: restore remaining relevant task_struct fields */
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/*
+ * read the entire state of the current task
+ *
+ * Currently only wraps restore_task_struct() and logs its result.
+ */
+int restore_task(struct ckpt_ctx *ctx)
+{
+ int ret;
+
+ ret = restore_task_struct(ctx);
+ ckpt_debug("task %d\n", ret);
+
+ return ret;
+}
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
new file mode 100644
index 0000000..cd9945c
--- /dev/null
+++ b/kernel/checkpoint/restart.c
@@ -0,0 +1,460 @@
+/*
+ * Restart logic and helpers
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/utsname.h>
+#include <linux/checkpoint.h>
+
+/*
+ * _ckpt_read_err - consume the payload of an error record in the image
+ * @ctx: checkpoint context
+ * @h: header of the error record (already read by the caller)
+ *
+ * Reads the payload that follows @h and logs it via ckpt_debug().
+ * The text is printed starting at the second payload byte; presumably the
+ * first byte is the leading '\0' that ckpt_msg_lock() places in ctx->msg.
+ *
+ * Always returns a negative value: -EINVAL for a bad length, -ENOMEM,
+ * the error from ckpt_kread(), or -EIO once the message has been logged.
+ */
+static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	char *ptr;
+	int len, ret;
+
+	len = h->len - sizeof(*h);
+	/* a header length that does not fit an int ends up negative here */
+	if (len < 0)
+		return -EINVAL;
+
+	/* one extra zeroed byte guarantees NUL termination */
+	ptr = kzalloc((size_t)len + 1, GFP_KERNEL);
+	if (!ptr) {
+		ckpt_debug("insufficient memory to report image error\n");
+		return -ENOMEM;
+	}
+
+	ret = ckpt_kread(ctx, ptr, len);
+	if (ret >= 0) {
+		/* with an empty payload the buffer is 1 byte: ptr[1] is invalid */
+		ckpt_debug("%s\n", len > 0 ? &ptr[1] : "");
+		ret = -EIO;
+	}
+
+	kfree(ptr);
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ * @ptr: desired buffer
+ * @len: desired object length (if 0, flexible)
+ * @max: maximum object length (if 0, flexible)
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
+ void *ptr, int len, int max)
+{
+ int ret;
+
+ again:
+ ret = ckpt_kread(ctx, h, sizeof(*h));
+ if (ret < 0)
+ return ret;
+ _ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+ h->type, h->len, len, max);
+ /* the recorded length includes the header itself */
+ if (h->len < sizeof(*h))
+ return -EINVAL;
+
+ /*
+ * An error record is logged by _ckpt_read_err(); the retry below is
+ * taken only if that helper returns a non-negative value.
+ */
+ if (h->type == CKPT_HDR_ERROR) {
+ ret = _ckpt_read_err(ctx, h);
+ if (ret < 0)
+ return ret;
+ goto again;
+ }
+
+ /* if len specified, enforce, else if maximum specified, enforce */
+ if ((len && h->len != len) || (!len && max && h->len > max))
+ return -EINVAL;
+
+ /* payload size is the recorded length minus the header */
+ if (ptr)
+ ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
+ return ret;
+}
+
+/**
+ * _ckpt_read_obj_type - read an object of some type
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ * @type: buffer type
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: actual _payload_ length
+ */
+int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+ struct ckpt_hdr h;
+ int ret;
+
+ /* callers pass a payload length; _ckpt_read_obj() checks total length */
+ if (len)
+ len += sizeof(struct ckpt_hdr);
+ ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
+ if (ret < 0)
+ return ret;
+ /* NOTE: the type is checked only after the payload (if any) was read */
+ if (h.type != type)
+ return -EINVAL;
+ return h.len - sizeof(h);
+}
+EXPORT_SYMBOL(_ckpt_read_obj_type);
+
+/**
+ * _ckpt_read_buffer - read an object of type buffer (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: _payload_ length.
+ */
+int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+ /* a zero @len is a caller bug: it would leave the length unchecked */
+ BUG_ON(!len);
+ return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(_ckpt_read_buffer);
+
+/**
+ * _ckpt_read_string - read an object of type string (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: string length (including '\0')
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+ int ret;
+
+ /* a zero @len is a caller bug: it would leave the length unchecked */
+ BUG_ON(!len);
+ ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
+ if (ret < 0)
+ return ret;
+ /* the last byte is forced to NUL regardless of what the image held */
+ if (ptr)
+ ((char *) ptr)[len - 1] = '\0'; /* always play it safe */
+ /* unlike _ckpt_read_buffer(), success is reported as 0, not a length */
+ return 0;
+}
+EXPORT_SYMBOL(_ckpt_read_string);
+
+/**
+ * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @len: desired total length (if 0, flexible)
+ * @max: maximum total length (checked only when @len is 0; if 0, no limit)
+ *
+ * Both @len and @max count the header as well as the payload.
+ *
+ * NOTE(review): unlike _ckpt_read_obj(), a CKPT_HDR_ERROR record is not
+ * given special treatment here.
+ *
+ * Return: new buffer allocated on success (release with ckpt_hdr_put()),
+ * error pointer otherwise
+ */
+static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+{
+ struct ckpt_hdr hh;
+ struct ckpt_hdr *h;
+ int ret;
+
+ /* read the header into a stack copy to learn the object's length */
+ ret = ckpt_kread(ctx, &hh, sizeof(hh));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ _ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+ hh.type, hh.len, len, max);
+ if (hh.len < sizeof(*h))
+ return ERR_PTR(-EINVAL);
+ /* if len specified, enforce, else if maximum specified, enforce */
+ if ((len && hh.len != len) || (!len && max && hh.len > max))
+ return ERR_PTR(-EINVAL);
+
+ h = ckpt_hdr_get(ctx, hh.len);
+ if (!h)
+ return ERR_PTR(-ENOMEM);
+
+ *h = hh; /* yay ! */
+
+ /* the payload lands right after the header copy in the new buffer */
+ ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
+ if (ret < 0) {
+ ckpt_hdr_put(ctx, h);
+ h = ERR_PTR(ret);
+ }
+
+ return h;
+}
+
+/**
+ * ckpt_read_obj_type - allocate and read an object of some type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
+{
+ struct ckpt_hdr *h;
+
+ /* an exact length is mandatory for this variant */
+ BUG_ON(!len);
+
+ /* @len is the total object length, header included */
+ h = ckpt_read_obj(ctx, len, len);
+ if (IS_ERR(h))
+ return h;
+
+ /* wrong type: drop the object and report -EINVAL */
+ if (h->type != type) {
+ ckpt_hdr_put(ctx, h);
+ h = ERR_PTR(-EINVAL);
+ }
+
+ return h;
+}
+EXPORT_SYMBOL(ckpt_read_obj_type);
+
+/**
+ * ckpt_read_buf_type - allocate and read an object of some type (flexible)
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @type: desired object type
+ *
+ * This differs from ckpt_read_obj_type() in that the length of the
+ * incoming object is flexible (up to the maximum specified by @max;
+ * unlimited if @max is 0), as determined by the ckpt_hdr data.
+ *
+ * NOTE: for symmetry with checkpoint, @max is the maximum _payload_
+ * size, excluding the header.
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type)
+{
+ struct ckpt_hdr *h;
+
+ /* ckpt_read_obj() compares total lengths, so account for the header */
+ if (max)
+ max += sizeof(struct ckpt_hdr);
+
+ h = ckpt_read_obj(ctx, 0, max);
+ if (IS_ERR(h))
+ return h;
+
+ /* wrong type: drop the object and report -EINVAL */
+ if (h->type != type) {
+ ckpt_hdr_put(ctx, h);
+ h = ERR_PTR(-EINVAL);
+ }
+
+ return h;
+}
+EXPORT_SYMBOL(ckpt_read_buf_type);
+
+/**
+ * ckpt_read_payload - allocate and read the payload of an object
+ * @ctx: checkpoint context
+ * @ptr: where to store the newly allocated buffer (caller must kfree)
+ * @max: maximum payload length
+ * @type: desired object type
+ *
+ * This can be used to read a variable-length _payload_ from the checkpoint
+ * stream. @max limits the size of the resulting buffer.
+ *
+ * On failure *@ptr is set to NULL, so the caller never sees a pointer to
+ * memory that has already been freed.
+ *
+ * Return: actual _payload_ length, or negative error
+ */
+int ckpt_read_payload(struct ckpt_ctx *ctx, void **ptr, int max, int type)
+{
+	void *buf;
+	int len, ret;
+
+	*ptr = NULL;
+
+	/* header only: learn the payload length, then size the buffer */
+	len = _ckpt_read_obj_type(ctx, NULL, 0, type);
+	if (len < 0)
+		return len;
+	else if (len > max)
+		return -EINVAL;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = ckpt_kread(ctx, buf, len);
+	if (ret < 0) {
+		kfree(buf);
+		return ret;
+	}
+
+	*ptr = buf;
+	return len;
+}
+EXPORT_SYMBOL(ckpt_read_payload);
+
+/**
+ * ckpt_read_string - allocate and read a string (variable length)
+ * @ctx: checkpoint context
+ * @max: maximum acceptable length
+ *
+ * A zero-length string object is rejected with -EINVAL: it has no room
+ * for even the terminating '\0'.
+ *
+ * Return: allocated string (caller must kfree) or error pointer
+ */
+char *ckpt_read_string(struct ckpt_ctx *ctx, int max)
+{
+	char *str;
+	int len;
+
+	len = ckpt_read_payload(ctx, (void **)&str, max, CKPT_HDR_STRING);
+	if (len < 0)
+		return ERR_PTR(len);
+	if (len == 0) {
+		/* str[len - 1] would write before a zero-size allocation */
+		kfree(str);
+		return ERR_PTR(-EINVAL);
+	}
+	str[len - 1] = '\0';	/* always play it safe */
+	return str;
+}
+EXPORT_SYMBOL(ckpt_read_string);
+
+/**
+ * ckpt_read_consume - consume the next object of expected type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * This can be used to skip an object in the input stream when the
+ * data is unnecessary for the restart. @len indicates the length of
+ * the object; if @len is zero the length is unconstrained.
+ *
+ * Returns 0 on success, -EINVAL if the object has a different type, or
+ * the error from ckpt_read_obj().
+ */
+int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type)
+{
+ struct ckpt_hdr *h;
+ int ret = 0;
+
+ /* max == 0: with @len == 0 no upper bound is placed on the object */
+ h = ckpt_read_obj(ctx, len, 0);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ if (h->type != type)
+ ret = -EINVAL;
+
+ /* the object is read in full and then discarded */
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+EXPORT_SYMBOL(ckpt_read_consume);
+
+/***********************************************************************
+ * Restart
+ */
+
+/*
+ * Verify that the array sizes recorded in the image match the sizes
+ * compiled into this kernel. Returns 0 if all match, -EINVAL otherwise.
+ */
+static int check_kernel_const(struct ckpt_const *h)
+{
+	/* used only as sizeof operands; never assigned or dereferenced */
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+	int mismatch;
+
+	mismatch = h->task_comm_len != sizeof(tsk->comm) ||
+		   h->uts_release_len != sizeof(uts->release) ||
+		   h->uts_version_len != sizeof(uts->version) ||
+		   h->uts_machine_len != sizeof(uts->machine);
+
+	return mismatch ? -EINVAL : 0;
+}
+
+/*
+ * read the checkpoint header
+ *
+ * Validates the magic, c/r revision and exact kernel version, refuses any
+ * non-zero user flags, checks the kernel constants, and then reads the
+ * three uts buffers (release, version, machine) into a scratch structure
+ * that is discarded.
+ */
+static int restore_read_header(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_header *h;
+ struct new_utsname *uts = NULL;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ret = -EINVAL;
+ if (h->magic != CHECKPOINT_MAGIC_HEAD ||
+ h->rev != CHECKPOINT_VERSION ||
+ h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+ h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+ h->patch != ((LINUX_VERSION_CODE) & 0xff)) {
+ ckpt_err(ctx, ret, "incompatible kernel version");
+ goto out;
+ }
+ if (h->uflags) {
+ ckpt_err(ctx, ret, "incompatible restart user flags");
+ goto out;
+ }
+
+ ret = check_kernel_const(&h->constants);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "incompatible kernel constants");
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ uts = kmalloc(sizeof(*uts), GFP_KERNEL);
+ if (!uts)
+ goto out;
+
+ ctx->oflags = h->uflags;
+
+ /* FIX: verify compatibility of release, version and machine */
+ ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
+ if (ret < 0)
+ goto out;
+ ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
+ if (ret < 0)
+ goto out;
+ ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+ out:
+ /* uts is still NULL on the early error paths; kfree(NULL) is a no-op */
+ kfree(uts);
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/*
+ * read the container configuration section
+ *
+ * For now the section is only read and type/length-checked; its content
+ * is discarded.
+ */
+static int restore_container(struct ckpt_ctx *ctx)
+{
+ int ret = 0;
+ struct ckpt_hdr_container *h;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+/*
+ * read the checkpoint trailer
+ *
+ * Returns 0 if a CKPT_HDR_TAIL object with CHECKPOINT_MAGIC_TAIL was
+ * read, -EINVAL on a magic mismatch, or the read error.
+ */
+static int restore_read_tail(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_tail *h;
+ int ret = 0;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ if (h->magic != CHECKPOINT_MAGIC_TAIL)
+ ret = -EINVAL;
+
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+/*
+ * do_restart - read a checkpoint image from @ctx into the current task
+ *
+ * Mirrors do_checkpoint(): header, container section, task state, then
+ * trailer. Stops at the first step that fails. @pid is not used here.
+ */
+long do_restart(struct ckpt_ctx *ctx, pid_t pid)
+{
+ long ret;
+
+ ret = restore_read_header(ctx);
+ if (ret < 0)
+ return ret;
+ ret = restore_container(ctx);
+ if (ret < 0)
+ return ret;
+ ret = restore_task(ctx);
+ if (ret < 0)
+ return ret;
+ ret = restore_read_tail(ctx);
+
+ /* on success, adjust the return value if needed [TODO] */
+ return ret;
+}
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
index a81750a..af8c1bf 100644
--- a/kernel/checkpoint/sys.c
+++ b/kernel/checkpoint/sys.c
@@ -8,12 +8,398 @@
* distribution for more details.
*/
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
#include <linux/sched.h>
+#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/checkpoint.h>
+
+/*
+ * Helpers to write(read) from(to) kernel space to(from) the checkpoint
+ * image file descriptor (similar to how a core-dump is performed).
+ *
+ * _ckpt_kwrite() - write a kernel-space buffer to a file
+ * _ckpt_kread() - read from a file to a kernel-space buffer
+ *
+ * ckpt_kread() - read from the checkpoint image to a kernel-space buffer
+ * ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
+ *
+ * The latter two succeed only if the entire read or write succeeds,
+ * in which case they return 0; otherwise they return a negative error.
+ */
+
+/*
+ * _ckpt_kwrite - write a kernel-space buffer at the file's current position
+ *
+ * Returns the byte count reported by kernel_write() and advances the file
+ * position by that amount, or returns a negative error and leaves the
+ * position untouched.
+ */
+static ssize_t _ckpt_kwrite(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	ssize_t ret;	/* same width as the return type: no truncation */
+
+	pos = file_pos_read(file);
+	ret = kernel_write(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/*
+ * ckpt_kwrite - write a kernel-space buffer to the checkpoint image
+ *
+ * Succeeds only if all @count bytes were written: returns 0 in that case,
+ * -EPIPE on a short write (mirroring ckpt_kread()), or the negative error
+ * from the underlying write. ctx->total is advanced only on success.
+ */
+int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	ssize_t ret;
+
+	ret = _ckpt_kwrite(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+	/* a partial write would leave a truncated object in the image */
+	if ((size_t)ret != count)
+		return -EPIPE;
+
+	ctx->total += count;
+	return 0;
+}
+
+/*
+ * _ckpt_kread - read into a kernel-space buffer from the file's current
+ * position
+ *
+ * Returns the byte count reported by kernel_read() and advances the file
+ * position by that amount, or returns a negative error and leaves the
+ * position untouched.
+ */
+static ssize_t _ckpt_kread(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	ssize_t ret;	/* same width as the return type: no truncation */
+
+	pos = file_pos_read(file);
+	ret = kernel_read(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/*
+ * ckpt_kread - read from the checkpoint image to a kernel-space buffer
+ *
+ * Succeeds only if all @count bytes were read: returns 0 in that case,
+ * -EPIPE on a short read, or the negative error from the underlying
+ * read. ctx->total is advanced only on success.
+ */
+int ckpt_kread(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	ssize_t ret;
+
+	ret = _ckpt_kread(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+	/* ret is non-negative here, so the cast cannot change its value */
+	if ((size_t)ret != count)
+		return -EPIPE;
+
+	ctx->total += count;
+	return 0;
+}
+
+/**
+ * ckpt_hdr_get - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: desired length
+ *
+ * Returns pointer to header
+ */
+void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
+{
+ /* zero-filled allocation; @ctx is currently unused; NULL on failure */
+ return kzalloc(len, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ckpt_hdr_get);
+
+/**
+ * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ * @len: header length
+ *
+ * (requiring 'ptr' makes it easily interchangeable with kmalloc/kfree)
+ */
+void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+ /* @ctx and @len are currently unused: this is a plain kfree() */
+ kfree(ptr);
+}
+EXPORT_SYMBOL(_ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ *
+ * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
+ */
+void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
+{
+ /* the length is taken from the header itself, so @ptr must not be NULL */
+ struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
+ _ckpt_hdr_put(ctx, ptr, h->len);
+}
+EXPORT_SYMBOL(ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_get_type - get a hdr of certain size and type
+ * @ctx: checkpoint context
+ * @len: number of bytes to allocate (total object size, header included)
+ * @type: value to store in the header's type field
+ *
+ * The buffer comes from ckpt_hdr_get() and has its ckpt_hdr type and len
+ * fields filled in.
+ *
+ * Returns pointer to the new header, or NULL if allocation failed
+ */
+void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
+{
+ struct ckpt_hdr *h;
+
+ h = ckpt_hdr_get(ctx, len);
+ if (!h)
+ return NULL;
+
+ h->type = type;
+ h->len = len;
+ return h;
+}
+EXPORT_SYMBOL(ckpt_hdr_get_type);
+
+/*
+ * Helpers to manage c/r contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+/* Drop the file references held by @ctx (if any) and free the context. */
+static void ckpt_ctx_free(struct ckpt_ctx *ctx)
+{
+ if (ctx->file)
+ fput(ctx->file);
+ if (ctx->logfile)
+ fput(ctx->logfile);
+ kfree(ctx);
+}
+
+/*
+ * ckpt_ctx_alloc - allocate and initialize a c/r context
+ * @fd: file descriptor of the checkpoint image
+ * @uflags: user flags, stored in ctx->uflags
+ * @kflags: kernel flags, stored in ctx->kflags
+ * @logfd: file descriptor for the log, or CHECKPOINT_FD_NONE for no log
+ *
+ * Returns the new context, ERR_PTR(-ENOMEM) if allocation fails, or
+ * ERR_PTR(-EBADF) if @fd or @logfd cannot be resolved by fget().
+ */
+static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
+ unsigned long kflags, int logfd)
+{
+ struct ckpt_ctx *ctx;
+ int err;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ ctx->uflags = uflags;
+ ctx->kflags = kflags;
+
+ mutex_init(&ctx->msg_mutex);
+
+ /* both fget() failures below are reported as -EBADF */
+ err = -EBADF;
+ ctx->file = fget(fd);
+ if (!ctx->file)
+ goto err;
+ if (logfd == CHECKPOINT_FD_NONE)
+ goto nolog;
+ ctx->logfile = fget(logfd);
+ if (!ctx->logfile)
+ goto err;
+ nolog:
+ return ctx;
+ err:
+ /* ckpt_ctx_free() releases whichever file references were taken */
+ ckpt_ctx_free(ctx);
+ return ERR_PTR(err);
+}
+
+/* Record @err in the context; it is read later by _ckpt_msg_complete(). */
+static void ckpt_set_error(struct ckpt_ctx *ctx, int err)
+{
+ ctx->errno = err;
+}
+
+/* helpers to handle log/dbg/err messages */
+/*
+ * Take ctx->msg_mutex and reset the message buffer. A NULL @ctx is a no-op.
+ */
+void ckpt_msg_lock(struct ckpt_ctx *ctx)
+{
+ if (!ctx)
+ return;
+ mutex_lock(&ctx->msg_mutex);
+ /* msg[0] is a leading '\0'; message text is appended from msg[1] on */
+ ctx->msg[0] = '\0';
+ ctx->msglen = 1;
+}
+
+/* Release ctx->msg_mutex taken by ckpt_msg_lock(). A NULL @ctx is a no-op. */
+void ckpt_msg_unlock(struct ckpt_ctx *ctx)
+{
+ if (!ctx)
+ return;
+ mutex_unlock(&ctx->msg_mutex);
+}
+
+/*
+ * Return 1 if @s starts with a "%(X)" sequence, where X is any single
+ * non-NUL character, and 0 otherwise.
+ */
+static inline int is_special_flag(char *s)
+{
+	/* && short-circuits, so nothing past a terminating NUL is read */
+	return s[0] == '%' && s[1] == '(' && s[2] != '\0' && s[3] == ')';
+}
+
+/*
+ * _ckpt_generate_fmt - handle the special flags in the enhanced format
+ * strings used by checkpoint/restart error messages.
+ * @ctx: checkpoint context
+ * @fmt: message format
+ *
+ * The special flags are surrounded by %() to help them visually stand
+ * out. For instance, %(O) means an objref. The following special
+ * flags are recognized:
+ * O: objref
+ * P: pointer
+ * T: task
+ * S: string
+ * V: variable
+ *
+ * %(O) will be expanded to "[obj %d]". Likewise P, S, and V, will
+ * also expand to format flags requiring an argument to the subsequent
+ * sprintf or printk. T will be expanded to a string with no flags,
+ * requiring no further arguments.
+ *
+ * These do not accept any extra flags (i.e. min field width, precision,
+ * etc).
+ *
+ * The caller of ckpt_err() and _ckpt_err() must provide
+ * the additional variables, in order, to match the @fmt (except for
+ * the T key), e.g.:
+ *
+ * ckpt_err(ctx, err, "%(T)FILE flags %d %(O)\n", flags, objref);
+ *
+ * May be called under spinlock.
+ * Must be called with ctx->msg_mutex held. The expanded format
+ * will be placed in ctx->fmt.
+ */
+static void _ckpt_generate_fmt(struct ckpt_ctx *ctx, char *fmt)
+{
+	char *s = ctx->fmt;
+	int len = 0;
+
+	for (; *fmt && len < CKPT_MSG_LEN; fmt++) {
+		/* ordinary characters are copied through unchanged */
+		if (!is_special_flag(fmt)) {
+			s[len++] = *fmt;
+			continue;
+		}
+		/*
+		 * snprintf() returns the length it wanted to write, so on
+		 * truncation 'len' may move past CKPT_MSG_LEN; the loop
+		 * condition then ends the scan.
+		 */
+		switch (fmt[2]) {
+		case 'O':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[obj %%d]");
+			break;
+		case 'P':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[ptr %%p]");
+			break;
+		case 'V':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[sym %%pS]");
+			break;
+		case 'S':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[str %%s]");
+			break;
+		case 'T':
+			/* expanded in place: consumes no caller argument */
+			if (ctx->tsk)
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+						"[pid %d tsk %s]",
+						task_pid_vnr(ctx->tsk),
+						ctx->tsk->comm);
+			else
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+						"[pid -1 tsk NULL]");
+			break;
+		default:
+			printk(KERN_ERR "c/r: bad format specifier %c\n",
+			       fmt[2]);
+			BUG();
+		}
+		/* skip the rest of "%(X)"; the loop's fmt++ skips the ')' */
+		fmt += 3;
+	}
+	/*
+	 * 'len' can exceed CKPT_MSG_LEN after a truncated snprintf(), so
+	 * test with >= : writing s[len] in that case would be out of bounds.
+	 */
+	if (len >= CKPT_MSG_LEN)
+		s[CKPT_MSG_LEN-1] = '\0';
+	else
+		s[len] = '\0';
+}
+
+/*
+ * _ckpt_msg_appendv - append "[err N]" (if @err), "[pos N]" and the
+ * formatted text to ctx->msg, starting at offset ctx->msglen
+ *
+ * If the buffer fills up, the message is truncated, NUL-terminated at the
+ * last byte, and ctx->msglen is set to CKPT_MSG_LEN.
+ */
+static void _ckpt_msg_appendv(struct ckpt_ctx *ctx, int err, char *fmt,
+			      va_list ap)
+{
+	int len = ctx->msglen;
+
+	/*
+	 * snprintf()/vsnprintf() return the length they wanted to write,
+	 * so 'len' may pass CKPT_MSG_LEN. Check after every step so the
+	 * next call is never handed a negative remaining size.
+	 */
+	if (len >= CKPT_MSG_LEN)
+		goto full;
+
+	if (err) {
+		len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[err %d]",
+				err);
+		if (len >= CKPT_MSG_LEN)
+			goto full;
+	}
+
+	len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[pos %lld]",
+			ctx->total);
+	if (len >= CKPT_MSG_LEN)
+		goto full;
+
+	len += vsnprintf(&ctx->msg[len], CKPT_MSG_LEN-len, fmt, ap);
+	if (len >= CKPT_MSG_LEN)
+		goto full;
+
+	ctx->msglen = len;
+	return;
+
+full:
+	ctx->msg[CKPT_MSG_LEN-1] = '\0';
+	ctx->msglen = CKPT_MSG_LEN;
+}
+
+/*
+ * Append formatted text to ctx->msg with no error code attached.
+ * @fmt is passed straight to vsnprintf(): the %(X) special flags are
+ * not expanded on this path.
+ */
+void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ _ckpt_msg_appendv(ctx, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * _ckpt_msg_complete - flush the message accumulated in ctx->msg
+ *
+ * Destinations, in order: the checkpoint image (only during checkpoint and
+ * only if ctx->errno is already set), the log file (if any), and the
+ * kernel log (only with CONFIG_CHECKPOINT_DEBUG). Resets ctx->msglen to 0.
+ */
+void _ckpt_msg_complete(struct ckpt_ctx *ctx)
+{
+ int ret;
+
+ /* Don't write an empty or uninitialized msg */
+ if (ctx->msglen <= 1)
+ return;
+
+ /* the image gets an error header followed by msg as a string object */
+ if (ctx->kflags & CKPT_CTX_CHECKPOINT && ctx->errno) {
+ ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
+ if (!ret)
+ ret = ckpt_write_string(ctx, ctx->msg, ctx->msglen);
+ if (ret < 0)
+ printk(KERN_NOTICE "c/r: error string unsaved (%d): %s\n",
+ ret, ctx->msg+1);
+ }
+
+ /* msg+1 / msglen-1: skip the leading '\0' that ckpt_msg_lock() set */
+ if (ctx->logfile) {
+ struct file *logfile = ctx->logfile;
+ loff_t pos = file_pos_read(logfile);
+ ret = kernel_write(logfile, pos, ctx->msg+1, ctx->msglen-1);
+ /* log write failures are ignored (best effort) */
+ if (ret > 0)
+ file_pos_write(logfile, pos + ret);
+ }
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+ printk(KERN_DEBUG "%s", ctx->msg+1);
+#endif
+
+ ctx->msglen = 0;
+}
+
+/*
+ * Expand the %(X) flags of @fmt into ctx->fmt, then append the formatted
+ * message to ctx->msg. Must be expanded inside a variadic function whose
+ * last named parameter is the one passed as @fmt, since it calls
+ * va_start(ap, fmt) itself.
+ */
+#define __do_ckpt_msg(ctx, err, fmt) do { \
+ va_list ap; \
+ _ckpt_generate_fmt(ctx, fmt); \
+ va_start(ap, fmt); \
+ _ckpt_msg_appendv(ctx, err, ctx->fmt, ap); \
+ va_end(ap); \
+} while (0)
+
+/*
+ * Append a message (with %(X) flag expansion) to ctx->msg. Takes no lock,
+ * does no NULL check on @ctx, and does not flush the message.
+ */
+void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+ __do_ckpt_msg(ctx, err, fmt);
+}
+
+/*
+ * do_ckpt_msg - format, flush and (if @err) record an error message
+ *
+ * A NULL @ctx is silently ignored. The whole format/flush sequence runs
+ * under the message lock.
+ */
+void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+ if (!ctx)
+ return;
+
+ ckpt_msg_lock(ctx);
+ __do_ckpt_msg(ctx, err, fmt);
+ _ckpt_msg_complete(ctx);
+ ckpt_msg_unlock(ctx);
+
+ /*
+ * NOTE(review): the error is recorded only after the message has been
+ * flushed, so _ckpt_msg_complete() above still sees the previous
+ * ctx->errno -- confirm this ordering is intended.
+ */
+ if (err)
+ ckpt_set_error(ctx, err);
+}
+EXPORT_SYMBOL(do_ckpt_msg);
+
+/* checkpoint/restart syscalls */
/**
- * sys_checkpoint - checkpoint a container
+ * do_sys_checkpoint - checkpoint a container
* @pid: pid of the container init(1) process
* @fd: file to which dump the checkpoint image
* @flags: checkpoint operation flags
@@ -22,14 +408,32 @@
* Returns positive identifier on success, 0 when returning from restart
* or negative value on error
*/
-SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
- unsigned long, flags, int, logfd)
+long do_sys_checkpoint(pid_t pid, int fd, unsigned long flags, int logfd)
{
- return -ENOSYS;
+ struct ckpt_ctx *ctx;
+ long ret;
+
+ /* no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ /* pid 0 means the caller itself */
+ if (pid == 0)
+ pid = task_pid_vnr(current);
+ ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT, logfd);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = do_checkpoint(ctx, pid);
+
+ /* NOTE(review): do_checkpoint() already returns ctx->crid on success */
+ if (!ret)
+ ret = ctx->crid;
+
+ ckpt_ctx_free(ctx);
+ return ret;
}
/**
- * sys_restart - restart a container
+ * do_sys_restart - restart a container
* @pid: pid of task root (in coordinator's namespace), or 0
* @fd: file from which read the checkpoint image
* @flags: restart operation flags
@@ -38,8 +442,49 @@ SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
* Returns negative value on error, or otherwise returns in the realm
* of the original checkpoint
*/
-SYSCALL_DEFINE4(restart, pid_t, pid, int, fd,
- unsigned long, flags, int, logfd)
+long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
+{
+ struct ckpt_ctx *ctx = NULL;
+ long ret;
+
+ /* no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART, logfd);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = do_restart(ctx, pid);
+
+ /* restart(2) isn't idempotent: can't restart syscall */
+ /* so every "restart the syscall" code is turned into -EINTR */
+ if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
+ ret = -EINTR;
+
+ ckpt_ctx_free(ctx);
+ return ret;
+}
+
+
+/* 'ckpt_debug_level' controls the verbosity level of c/r code */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/* FIX: allow to change during runtime */
+unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
+EXPORT_SYMBOL(ckpt_debug_level);
+
+/*
+ * Parse the "ckpt_debug=" boot option (decimal) into ckpt_debug_level.
+ * Returns 1 once the option has been consumed, as __setup() handlers are
+ * expected to, or the negative parse error.
+ */
+static __init int ckpt_debug_setup(char *s)
{
- return -ENOSYS;
+	unsigned long val;	/* strict_strtoul() takes unsigned long * */
+	int ret;
+
+	ret = strict_strtoul(s, 10, &val);
+	if (ret < 0)
+		return ret;
+	ckpt_debug_level = val;
+	return 1;
}
+
+__setup("ckpt_debug=", ckpt_debug_setup);
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 935248b..75d413e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1086,6 +1086,19 @@ config DMA_API_DEBUG
This option causes a performance degredation. Use only if you want
to debug device drivers. If unsure, say N.
+config CHECKPOINT_DEBUG
+ bool "Checkpoint/restart debugging (EXPERIMENTAL)"
+ depends on CHECKPOINT
+ default y
+ help
+ This option turns on the debugging output of checkpoint/restart.
+ The level of verbosity is controlled by 'ckpt_debug_level' and can
+ be set at boot time with "ckpt_debug=" option.
+
+ Turning this option off will reduce the size of the c/r code. If
+ turned on, it is unlikely to incur visible overhead if the debug
+ level is set to zero.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 036/100] c/r: introduce vfs_fcntl()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (2 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 022/100] c/r: basic infrastructure for checkpoint/restart Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 037/100] c/r: introduce new 'file_operations': ->checkpoint, ->collect() Oren Laadan
` (16 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
This patch introduces vfs_fcntl() so that it can be called from
restart (see patch adding restart of files).
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/fcntl.c | 21 +++++++++++++--------
include/linux/fs.h | 2 ++
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f..2079af0 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -418,6 +418,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
return err;
}
+/*
+ * vfs_fcntl - run an fcntl command on a file the caller already holds
+ *
+ * Consults security_file_fcntl() first and only calls do_fcntl() when the
+ * hook returns 0. Returns the hook's error, or the result of do_fcntl().
+ */
+int vfs_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp)
+{
+	int err = security_file_fcntl(filp, cmd, arg);
+
+	if (!err)
+		err = do_fcntl(fd, cmd, arg, filp);
+	return err;
+}
+
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
struct file *filp;
@@ -427,14 +439,7 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
if (!filp)
goto out;
- err = security_file_fcntl(filp, cmd, arg);
- if (err) {
- fput(filp);
- return err;
- }
-
- err = do_fcntl(fd, cmd, arg, filp);
-
+ err = vfs_fcntl(fd, cmd, arg, filp);
fput(filp);
out:
return err;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e8b171..65ffe9c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1096,6 +1096,8 @@ struct file_lock {
#include <linux/fcntl.h>
+extern int vfs_fcntl(int fd, unsigned cmd, unsigned long arg, struct file *fp);
+
extern void send_sigio(struct fown_struct *fown, int fd, int band);
#ifdef CONFIG_FILE_LOCKING
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 037/100] c/r: introduce new 'file_operations': ->checkpoint, ->collect()
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (3 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 036/100] c/r: introduce vfs_fcntl() Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 038/100] c/r: checkpoint and restart open file descriptors Oren Laadan
` (15 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
While we assume all normal files and directories can be checkpointed,
there are, as usual in the VFS, specialized places that will always
need an ability to override these defaults. Although we could do this
completely in the checkpoint code, that would bitrot quickly.
This adds a new 'file_operations' function for checkpointing a file.
It is assumed that there should be a dirt-simple way to make something
(un)checkpointable that fits in with current code.
As you can see in the ext[234] patches down the road, all that we have
to do to make something simple be supported is add a single "generic"
f_op entry.
Also adds a new 'file_operations' function for 'collecting' a file for
leak-detection during full-container checkpoint. This is useful for
those files that hold references to other "collectable" objects. Two
examples are pty files that point to corresponding tty objects, and
eventpoll files that refer to the files they are monitoring.
Finally, this patch introduces vfs_fcntl() so that it can be called
from restart (see patch adding restart of files).
Changelog[v21]
- Update Documentation/filesystems/vfs.txt
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v17]
- Introduce 'collect' method
Changelog[v17]
- Forward-declare 'ckpt_ctx' et-al, don't use checkpoint_types.h
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
Documentation/filesystems/vfs.txt | 13 ++++++++++++-
include/linux/fs.h | 5 +++++
2 files changed, 17 insertions(+), 1 deletions(-)
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3de2f32..a78355d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -711,7 +711,7 @@ struct file_operations
----------------------
This describes how the VFS can manipulate an open file. As of kernel
-2.6.22, the following members are defined:
+2.6.34, the following members are defined:
struct file_operations {
struct module *owner;
@@ -742,6 +742,10 @@ struct file_operations {
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
+#ifdef CONFIG_CHECKPOINT
+ int (*checkpoint)(struct ckpt_ctx *, struct file *);
+ int (*collect)(struct ckpt_ctx *, struct file *);
+#endif
};
Again, all methods are called without any locks being held, unless
@@ -813,6 +817,13 @@ otherwise noted.
splice_read: called by the VFS to splice data from file to a pipe. This
method is used by the splice(2) system call
+ checkpoint: called by the checkpoint(2) system call to checkpoint the
+ state of a file descriptor.
+
+ collect: called by the checkpoint(2) system call to track references to
+ file descriptors, to detect leaks in full-container checkpoint
+ (see Documentation/checkpoint/readme.txt).
+
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65ffe9c..c06c157 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -397,6 +397,7 @@ struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
+struct ckpt_ctx;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -1511,6 +1512,10 @@ struct file_operations {
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **);
+#ifdef CONFIG_CHECKPOINT
+ int (*checkpoint)(struct ckpt_ctx *, struct file *);
+ int (*collect)(struct ckpt_ctx *, struct file *);
+#endif
};
struct inode_operations {
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 038/100] c/r: checkpoint and restart open file descriptors
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (4 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 037/100] c/r: introduce new 'file_operations': ->checkpoint, ->collect() Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 039/100] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
` (14 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
Checkpoint: dump the file table with 'struct ckpt_hdr_file_table',
followed by all open file descriptors. Because the 'struct file'
corresponding to an fd can be shared, they are assigned an objref and
registered in the object hash. A reference to the 'file *' is kept for
as long as it lives in the hash (the hash is only cleaned up at the
end of the checkpoint).
Also provide generic_file_checkpoint() and generic_file_restore(),
which are good for normal files and directories. They do not yet
support unlinked files or directories.
Restart: for each fd read 'struct ckpt_hdr_file_desc' and lookup
objref in the hash table; if not found in the hash table (first
occurrence), read in 'struct ckpt_hdr_file', create a new file and
register in the hash. Otherwise attach the file pointer from the hash
as an FD.
Changelog[v21]:
- Do not include checkpoint_hdr.h explicitly
- Replace __initcall() with late_initcall()
- [Serge] Print out full path of file which crossed mnt_ns
- Reorganize code into fs/*
- Merge files dump/restore into a single patch
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v19]:
- Fix false negative of test for unlinked files at checkpoint
Changelog[v19-rc3]:
- [Serge Hallyn] Rename fs_mnt to root_fs_path
- [Dave Hansen] Error out on file locks and leases
- [Serge Hallyn] Refuse checkpoint of file with f_owner
Changelog[v19-rc1]:
- Fix lockdep complaint in restore_obj_files()
- [Matt Helsley] Add cpp definitions for enums
- Restore thread/cpu state early
- Ensure null-termination of file names read from image
- Fix compile warning in restore_open_fname()
Changelog[v18]:
- Add a few more ckpt_write_err()s
- [Dan Smith] Export fill_fname() as ckpt_fill_fname()
- Introduce ckpt_collect_file() that also uses file->collect method
- In collect_file_table() use retval from ckpt_obj_collect() to
test for first-time-object
- Invoke set_close_on_exec() unconditionally on restart
Changelog[v17]:
- Validate f_mode after restore against saved f_mode
- Fail if f_flags have O_CREAT|O_EXCL|O_NOCTTY|O_TRUNC
- Reorder patch (move earlier in series)
- Handle shared files_struct objects
- Only collect sub-objects of files_struct once
- Better file error debugging
- Use (new) d_unlinked()
Changelog[v16]:
- Fix compile warning in checkpoint_bad()
Changelog[v16]:
- Reorder patch (move earlier in series)
- Handle shared files_struct objects
Changelog[v14]:
- File objects are dumped/restored prior to the first reference
- Introduce a per file-type restore() callback
- Use struct file_operations->checkpoint()
- Put code for generic file descriptors in a separate function
- Use one CKPT_FILE_GENERIC for both regular files and dirs
- Revert change to pr_debug(), back to ckpt_debug()
- Use only unsigned fields in checkpoint headers
- Rename: ckpt_write_files() => checkpoint_fd_table()
- Rename: ckpt_write_fd_data() => checkpoint_file()
- Rename: ckpt_read_fd_data() => restore_file()
- Rename: restore_files() => restore_fd_table()
- Check whether calls to ckpt_hbuf_get() fail
- Discard field 'h->parent'
Changelog[v12]:
- Replace obsolete ckpt_debug() with pr_debug()
Changelog[v11]:
- Discard handling of opened symlinks (there is no such thing)
- ckpt_scan_fds() retries from scratch if hits size limits
Changelog[v9]:
- Fix a couple of leaks in ckpt_write_files()
- Drop useless kfree from ckpt_scan_fds()
Changelog[v8]:
- initialize 'coe' to workaround gcc false warning
Changelog[v6]:
- Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
(even though it's not really needed)
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/Makefile | 1 +
fs/checkpoint.c | 816 ++++++++++++++++++++++++++++++++++++++
fs/locks.c | 35 ++
include/linux/checkpoint.h | 22 +
include/linux/checkpoint_hdr.h | 59 +++
include/linux/checkpoint_types.h | 5 +
include/linux/fs.h | 10 +
kernel/checkpoint/checkpoint.c | 11 +
kernel/checkpoint/process.c | 53 +++-
kernel/checkpoint/sys.c | 9 +
10 files changed, 1020 insertions(+), 1 deletions(-)
create mode 100644 fs/checkpoint.c
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f..aa25755 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
obj-y += $(nfsd-y) $(nfsd-m)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
new file mode 100644
index 0000000..95a51e9
--- /dev/null
+++ b/fs/checkpoint.c
@@ -0,0 +1,816 @@
+/*
+ * Checkpoint file descriptors
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DFILE
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify.h>
+#include <linux/syscalls.h>
+#include <linux/deferqueue.h>
+#include <linux/checkpoint.h>
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+/**
+ * ckpt_fill_fname - return pathname of a given file
+ * @path: path name
+ * @root: relative root
+ * @buf: buffer for pathname
+ * @len: buffer length (in) and pathname length (out)
+ *
+ * Resolves @path relative to @root by calling __d_path() with
+ * dcache_lock held. A private copy of @root is handed to __d_path(),
+ * so the caller's @root is never modified.
+ *
+ * Returns the pathname on success, the error pointer from __d_path()
+ * if it fails, or ERR_PTR(-EBADF) if __d_path() altered the copy of
+ * @root. Once __d_path() succeeds, *@len is set to
+ * (@buf + original *@len - returned pointer); presumably this counts
+ * the terminating NUL too -- TODO confirm against __d_path().
+ */
+char *ckpt_fill_fname(struct path *path, struct path *root, char *buf, int *len)
+{
+ struct path tmp = *root; /* scratch copy that __d_path() may update */
+ char *fname;
+
+ BUG_ON(!buf);
+ spin_lock(&dcache_lock);
+ fname = __d_path(path, &tmp, buf, *len);
+ spin_unlock(&dcache_lock);
+ if (IS_ERR(fname))
+ return fname;
+ *len = (buf + (*len) - fname); /* bytes from fname to end of buffer */
+ /*
+ * FIX: if __d_path() changed these, it must have stepped out of
+ * init's namespace. Since currently we require a unified namespace
+ * within the container: simply fail.
+ */
+ if (tmp.mnt != root->mnt || tmp.dentry != root->dentry) {
+ ckpt_debug("file %s was opened in an alien mnt_ns\n", fname);
+ fname = ERR_PTR(-EBADF);
+ }
+
+ return fname;
+}
+
+/**
+ * checkpoint_fname - write a file name
+ * @ctx: checkpoint context
+ * @path: path name
+ * @root: relative root
+ *
+ * Builds the pathname of @path relative to @root in a temporary
+ * PATH_MAX buffer and writes it as a CKPT_HDR_FILE_NAME object.
+ * Returns -ENOMEM if the buffer cannot be allocated, the error from
+ * ckpt_fill_fname() if the name cannot be obtained, otherwise the
+ * result of ckpt_write_obj_type().
+ */
+int checkpoint_fname(struct ckpt_ctx *ctx, struct path *path, struct path *root)
+{
+	int namelen = PATH_MAX;
+	char *pathbuf, *name;
+	int err;
+
+	/*
+	 * FIXME: we can optimize and save memory (and storage) if we
+	 * share strings (through objhash) and reference them instead
+	 */
+
+	pathbuf = kmalloc(namelen, GFP_KERNEL);
+	if (!pathbuf)
+		return -ENOMEM;
+
+	name = ckpt_fill_fname(path, root, pathbuf, &namelen);
+	if (IS_ERR(name)) {
+		err = PTR_ERR(name);
+		ckpt_err(ctx, err, "%(T)%(S)Obtain filename\n",
+			 path->dentry->d_name.name);
+	} else {
+		err = ckpt_write_obj_type(ctx, name, namelen,
+					  CKPT_HDR_FILE_NAME);
+	}
+
+	kfree(pathbuf);
+	return err;
+}
+
+#define CKPT_DEFAULT_FDTABLE 256 /* an initial guess */
+
+/**
+ * scan_fds - scan file table and construct array of open fds
+ * @files: files_struct pointer
+ * @fdtable: (output) array of open fds
+ *
+ * Returns the number of open fds found, and also the file table
+ * array via *fdtable. The caller should free the array. If memory
+ * runs out, returns -ENOMEM, frees any partially built array and
+ * leaves *fdtable untouched.
+ *
+ * The caller must validate the file descriptors collected in the
+ * array before using them, e.g. by using fcheck_files(), in case
+ * the task's fdtable changes in the meantime.
+ */
+static int scan_fds(struct files_struct *files, int **fdtable)
+{
+	struct fdtable *fdt;
+	int *fds = NULL;
+	int *grown;
+	int i = 0, n = 0;
+	int tot = CKPT_DEFAULT_FDTABLE;
+
+	/*
+	 * We assume that all tasks possibly sharing the file table are
+	 * frozen (or we are a single process and we checkpoint ourselves).
+	 * Therefore, we can safely proceed after krealloc() from where we
+	 * left off. Otherwise the file table may be modified by another
+	 * task after we scan it. The behavior in this case is undefined,
+	 * and either checkpoint or restart will likely fail.
+	 */
+ retry:
+	/*
+	 * krealloc() leaves the old array allocated when it fails, so go
+	 * through a temporary and free the old array instead of leaking it.
+	 */
+	grown = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL);
+	if (!grown) {
+		kfree(fds);
+		return -ENOMEM;
+	}
+	fds = grown;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	/* 'i' and 'n' survive a retry, so the scan resumes where it stopped */
+	for (/**/; i < fdt->max_fds; i++) {
+		if (!fcheck_files(files, i))
+			continue;
+		if (n == tot) {
+			/* array full: leave the RCU section before reallocating */
+			rcu_read_unlock();
+			tot *= 2;	/* won't overflow: kmalloc will fail */
+			goto retry;
+		}
+		fds[n++] = i;
+	}
+	rcu_read_unlock();
+
+	*fdtable = fds;
+	return n;
+}
+
+/*
+ * checkpoint_file_common - fill the fields shared by all file headers
+ *
+ * Copies f_flags, f_mode, f_pos and f_version from @file into @h.
+ * Always returns 0.
+ */
+int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
+			   struct ckpt_hdr_file *h)
+{
+	ckpt_debug("file %s", file->f_dentry->d_name.name);
+
+	h->f_version = file->f_version;
+	h->f_pos = file->f_pos;
+	h->f_mode = file->f_mode;
+	h->f_flags = file->f_flags;
+
+	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
+
+	return 0;
+}
+
+int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct ckpt_hdr_file_generic *h;
+ int ret;
+
+ /*
+ * Generic checkpoint of a file: write a CKPT_HDR_FILE header
+ * tagged CKPT_FILE_GENERIC, followed by the file's pathname
+ * relative to ctx->root_fs_path. Returns -EBADF for an unlinked
+ * file, -ENOMEM if no header can be obtained, otherwise the
+ * result of the last step that ran.
+ *
+ * FIXME: when we'll add support for unlinked files/dirs, we'll
+ * need to distinguish between unlinked files and unlinked dirs.
+ */
+ if (d_unlinked(file->f_dentry)) {
+ ckpt_err(ctx, -EBADF, "%(T)%(P)Unlinked files unsupported\n",
+ file);
+ return -EBADF;
+ }
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ return -ENOMEM;
+
+ h->common.f_type = CKPT_FILE_GENERIC;
+
+ ret = checkpoint_file_common(ctx, file, &h->common);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, &h->common.h); /* header goes out first */
+ if (ret < 0)
+ goto out;
+ ret = checkpoint_fname(ctx, &file->f_path, &ctx->root_fs_path); /* then the name */
+ out:
+ ckpt_hdr_put(ctx, h); /* balances ckpt_hdr_get_type() on every path */
+ return ret;
+}
+EXPORT_SYMBOL(generic_file_checkpoint);
+
+/*
+ * checkpoint callback for file pointer
+ *
+ * Delegates to the file's ->checkpoint() method. Returns -EBADF when
+ * the file has no f_op or no ->checkpoint() method, otherwise the
+ * method's result (logged if negative).
+ */
+static int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct file *filp = ptr;
+	int err;
+
+	if (filp->f_op && filp->f_op->checkpoint) {
+		err = filp->f_op->checkpoint(ctx, filp);
+		if (err < 0)
+			ckpt_err(ctx, err,
+				 "%(T)%(P)file checkpoint failed\n", filp);
+		return err;
+	}
+
+	ckpt_err(ctx, -EBADF, "%(T)%(P)%(V)f_op lacks checkpoint\n",
+		 filp, filp->f_op);
+	return -EBADF;
+}
+
+/**
+ * checkpoint_file_desc - dump the state of a given file descriptor
+ * @ctx: checkpoint context
+ * @files: files_struct pointer
+ * @fd: file descriptor
+ *
+ * Saves the state of the file descriptor: its number, its
+ * close-on-exec flag and the objref of the file it refers to. The
+ * objref comes from checkpoint_obj(), which also dumps the file the
+ * first time it is seen.
+ *
+ * Returns the result of ckpt_write_obj() on success, -ENOMEM if no
+ * header can be obtained, -EBADF if the descriptor is gone or the file
+ * has a lock or lease, -EBUSY if the file has an owner, or the error
+ * from checkpoint_obj().
+ */
+static int checkpoint_file_desc(struct ckpt_ctx *ctx,
+				struct files_struct *files, int fd)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file = NULL;
+	struct fdtable *fdt;
+	int objref, ret;
+	int coe = 0;	/* avoid gcc warning */
+	pid_t pid;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (!h)
+		return -ENOMEM;
+
+	/* sample the fd under RCU and pin the file for the rest of the call */
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	file = fcheck_files(files, fd);
+	if (file) {
+		coe = FD_ISSET(fd, fdt->close_on_exec);
+		get_file(file);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * sanity check (although this shouldn't happen); it must come
+	 * before find_locks_with_owner(), which dereferences 'file'
+	 */
+	ret = -EBADF;
+	if (!file) {
+		ckpt_err(ctx, ret, "%(T)fd %d gone?\n", fd);
+		goto out;
+	}
+
+	/*
+	 * find_locks_with_owner() returns an error when there
+	 * are no locks found, so we *want* it to return an error
+	 * code. Its success means we have to fail the checkpoint.
+	 */
+	if (!find_locks_with_owner(file, files)) {
+		ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
+		goto out;
+	}
+
+	/*
+	 * TODO: Implement c/r of fowner and f_sigio. Should be
+	 * trivial, but for now we just refuse its checkpoint
+	 */
+	pid = f_getown(file);
+	if (pid) {
+		ret = -EBUSY;
+		ckpt_err(ctx, ret, "%(T)fd %d has an owner (%d)\n", fd, pid);
+		goto out;
+	}
+
+	/*
+	 * if seen first time, this will add 'file' to the objhash, keep
+	 * a reference to it, dump its state while at it.
+	 */
+	objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
+	ckpt_debug("fd %d objref %d file %p coe %d)\n", fd, objref, file, coe);
+	if (objref < 0) {
+		ret = objref;
+		goto out;
+	}
+
+	h->fd_objref = objref;
+	h->fd_descriptor = fd;
+	h->fd_close_on_exec = coe;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+out:
+	ckpt_hdr_put(ctx, h);
+	if (file)
+		fput(file);
+	return ret;
+}
+
+/*
+ * checkpoint callback for file table
+ *
+ * Writes a CKPT_HDR_FILE_TABLE header holding the number of open
+ * descriptors, then one record per descriptor via
+ * checkpoint_file_desc(), and finally runs ctx->files_deferq.
+ * Returns 0 on success or a negative error.
+ */
+static int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct files_struct *files = ptr;
+	struct ckpt_hdr_file_table *h;
+	int *fdtable = NULL;
+	int nfds, n, ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+	if (!h)
+		return -ENOMEM;
+
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0) {
+		/* the 'out' path does not put the header, so do it here */
+		ckpt_hdr_put(ctx, h);
+		ret = nfds;
+		goto out;
+	}
+
+	h->fdt_nfds = nfds;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ckpt_debug("nfds %d\n", nfds);
+	for (n = 0; n < nfds; n++) {
+		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* a positive result is the number of entries run, i.e. success */
+	ret = deferqueue_run(ctx->files_deferq);
+	ckpt_debug("files_deferq ran %d entries\n", ret);
+	if (ret > 0)
+		ret = 0;
+ out:
+	kfree(fdtable);
+	return ret;
+}
+
+/*
+ * checkpoint wrapper for file table
+ *
+ * Takes a temporary reference on @t's files_struct around the call to
+ * checkpoint_obj(). Returns what checkpoint_obj() returns, or -EBUSY
+ * if the task has no files_struct.
+ */
+int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct files_struct *table = get_files_struct(t);
+	int ref;
+
+	if (!table)
+		return -EBUSY;
+
+	ref = checkpoint_obj(ctx, table, CKPT_OBJ_FILE_TABLE);
+	put_files_struct(table);
+	return ref;
+}
+
+/***********************************************************************
+ * Collect
+ */
+
+/*
+ * ckpt_collect_file - collect a file (and what it refers to)
+ *
+ * Registers @file through ckpt_obj_collect(). When that returns a
+ * positive value (first time this file is seen), the file's ->collect()
+ * method is also invoked if it has one. Returns the value of the last
+ * step that ran; negative values are errors.
+ */
+int ckpt_collect_file(struct ckpt_ctx *ctx, struct file *file)
+{
+	int ret;
+
+	ret = ckpt_obj_collect(ctx, file, CKPT_OBJ_FILE);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * if first time for this file (ret > 0), invoke ->collect();
+	 * guard against a missing f_op the same way checkpoint_file() does
+	 */
+	if (file->f_op && file->f_op->collect)
+		ret = file->f_op->collect(ctx, file);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "%(T)%(P)File collect\n", file);
+	return ret;
+}
+
+/*
+ * collect_file_desc - collect the file behind one descriptor
+ *
+ * Looks up @fd in @files under RCU, pins the file, and hands it to
+ * ckpt_collect_file(). Returns -EBUSY if the descriptor is no longer
+ * open, otherwise the result of ckpt_collect_file().
+ */
+static int collect_file_desc(struct ckpt_ctx *ctx,
+			     struct files_struct *files, int fd)
+{
+	struct file *file;
+	int ret;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file)
+		get_file(file);
+	rcu_read_unlock();
+
+	if (!file) {
+		ckpt_err(ctx, -EBUSY, "%(T)%(P)File removed\n", file);
+		return -EBUSY;
+	}
+
+	ret = ckpt_collect_file(ctx, file);
+	fput(file);
+
+	return ret;
+}
+
+/*
+ * collect_file_table - collect a file table and the files it holds
+ *
+ * Registers @files through ckpt_obj_collect(); when that returns a
+ * positive value (first time seen) every open descriptor is collected
+ * too, stopping at the first negative result.
+ */
+static int collect_file_table(struct ckpt_ctx *ctx, struct files_struct *files)
+{
+	int *fds;
+	int count, idx;
+	int ret;
+
+	/* zero means already collected, negative is an error: both end here */
+	ret = ckpt_obj_collect(ctx, files, CKPT_OBJ_FILE_TABLE);
+	if (ret <= 0)
+		return ret;
+
+	/* first time for this file table: walk its descriptors */
+	count = scan_fds(files, &fds);
+	if (count < 0)
+		return count;
+
+	for (idx = 0; idx < count && ret >= 0; idx++)
+		ret = collect_file_desc(ctx, files, fds[idx]);
+
+	kfree(fds);
+	return ret;
+}
+
+/*
+ * ckpt_collect_file_table - collect the file table of task @t
+ *
+ * Holds a temporary reference on the task's files_struct while
+ * collect_file_table() runs. Returns -EBUSY if the task has none.
+ */
+int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct files_struct *table = get_files_struct(t);
+	int err;
+
+	if (table) {
+		err = collect_file_table(ctx, table);
+		put_files_struct(table);
+		return err;
+	}
+
+	ckpt_err(ctx, -EBUSY, "%(T)files_struct missing\n");
+	return -EBUSY;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
+/**
+ * restore_open_fname - read a file name and open a file
+ * @ctx: checkpoint context
+ * @flags: file flags
+ *
+ * Reads a CKPT_HDR_FILE_NAME payload of at most PATH_MAX bytes from the
+ * image and opens the named file with @flags.
+ *
+ * Returns the opened file, or an ERR_PTR: -EINVAL if @flags contains
+ * O_CREAT, O_EXCL, O_NOCTTY or O_TRUNC or if the name is empty, the
+ * error from ckpt_read_payload(), or the error from filp_open().
+ */
+struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
+{
+	struct file *file;
+	char *fname;
+	int len;
+
+	/* prevent bad input from doing bad things */
+	if (flags & (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC))
+		return ERR_PTR(-EINVAL);
+
+	len = ckpt_read_payload(ctx, (void **) &fname,
+				PATH_MAX, CKPT_HDR_FILE_NAME);
+	if (len < 0)
+		return ERR_PTR(len);
+	/*
+	 * An empty payload would make the termination below write to
+	 * fname[-1]. NOTE(review): assumes ckpt_read_payload() hands back
+	 * a kfree()-able pointer whenever it does not fail -- confirm.
+	 */
+	if (len == 0) {
+		kfree(fname);
+		return ERR_PTR(-EINVAL);
+	}
+	fname[len - 1] = '\0';	/* always play it safe */
+	ckpt_debug("fname '%s' flags %#x\n", fname, flags);
+
+	file = filp_open(fname, flags, 0);
+	kfree(fname);
+
+	return file;
+}
+
+/*
+ * close_all_fds - close every descriptor found open in @files
+ *
+ * Descriptors are closed through sys_close(), highest array slot first.
+ * Returns 0, or the error from scan_fds().
+ */
+static int close_all_fds(struct files_struct *files)
+{
+	int *fds;
+	int idx;
+
+	idx = scan_fds(files, &fds);
+	if (idx < 0)
+		return idx;
+
+	for (idx--; idx >= 0; idx--)
+		sys_close(fds[idx]);
+
+	kfree(fds);
+	return 0;
+}
+
+/**
+ * attach_file - attach a lonely file ptr to a file descriptor
+ * @file: lonely file pointer
+ *
+ * Takes an extra reference on @file and installs it in a newly
+ * allocated descriptor. Returns that descriptor, or the negative
+ * value from get_unused_fd_flags() if none could be allocated.
+ */
+static int attach_file(struct file *file)
+{
+	int fd;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	get_file(file);
+	fsnotify_open(file->f_path.dentry);
+	fd_install(fd, file);
+	return fd;
+}
+
+#define CKPT_SETFL_MASK \
+ (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+
+/*
+ * restore_file_common - restore the state shared by all file types
+ *
+ * Applies the saved status flags (limited to CKPT_SETFL_MASK) with
+ * F_SETFL, checks that the mode of the reopened @file is compatible
+ * with the saved mode, and seeks to the saved position if the file is
+ * seekable.
+ *
+ * Returns 0 on success, -EINVAL on a mode mismatch, or the negative
+ * error from vfs_fcntl() or vfs_llseek().
+ */
+int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
+			struct ckpt_hdr_file *h)
+{
+	fmode_t new_mode = file->f_mode;
+	fmode_t saved_mode = (__force fmode_t) h->f_mode;
+	loff_t pos;
+	int ret;
+
+	/* FIX: need to restore uid, gid, owner etc */
+
+	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
+	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Normally f_mode is set by open, and modified only via
+	 * fcntl(), so its value now should match that at checkpoint.
+	 * However, a file may be downgraded from (read-)write to
+	 * read-only, e.g:
+	 *  - mark_files_ro() unsets FMODE_WRITE
+	 *  - nfs4_file_downgrade() too, and also set FMODE_READ
+	 * Validate the new f_mode against saved f_mode, allowing:
+	 *  - new with FMODE_WRITE, saved without FMODE_WRITE
+	 *  - new without FMODE_READ, saved with FMODE_READ
+	 */
+	if ((new_mode & FMODE_WRITE) && !(saved_mode & FMODE_WRITE)) {
+		new_mode &= ~FMODE_WRITE;
+		if (!(new_mode & FMODE_READ) && (saved_mode & FMODE_READ))
+			new_mode |= FMODE_READ;
+	}
+	/* finally, at this point new mode should match saved mode */
+	if (new_mode ^ saved_mode)
+		return -EINVAL;
+
+	/*
+	 * vfs_llseek() returns the new offset as a loff_t. Keep it in a
+	 * loff_t so that an offset beyond INT_MAX is not truncated into
+	 * an apparent error, and report plain success as 0.
+	 */
+	if (file->f_mode & FMODE_LSEEK) {
+		pos = vfs_llseek(file, h->f_pos, SEEK_SET);
+		if (pos < 0)
+			return pos;
+	}
+
+	return 0;
+}
+
+/*
+ * generic_file_restore - restore a CKPT_FILE_GENERIC file
+ *
+ * Validates the header, reopens the file by the name that follows it
+ * in the image, then applies the common state. Returns the file or an
+ * ERR_PTR; the file is released again if restoring its state fails.
+ */
+static struct file *generic_file_restore(struct ckpt_ctx *ctx,
+					 struct ckpt_hdr_file *ptr)
+{
+	struct file *filp;
+	int err;
+
+	if (ptr->h.type != CKPT_HDR_FILE)
+		return ERR_PTR(-EINVAL);
+	if (ptr->h.len != sizeof(*ptr))
+		return ERR_PTR(-EINVAL);
+	if (ptr->f_type != CKPT_FILE_GENERIC)
+		return ERR_PTR(-EINVAL);
+
+	filp = restore_open_fname(ctx, ptr->f_flags);
+	if (IS_ERR(filp))
+		return filp;
+
+	err = restore_file_common(ctx, filp, ptr);
+	if (err >= 0)
+		return filp;
+
+	fput(filp);
+	return ERR_PTR(err);
+}
+
+struct restore_file_ops {
+ char *file_name; /* human-readable name of the file type */
+ enum file_type file_type; /* CKPT_FILE_... value this entry serves */
+ struct file * (*restore) (struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr); /* NULL: restore_file() returns ERR_PTR(-EINVAL) */
+};
+
+static struct restore_file_ops restore_file_ops[] = { /* indexed by f_type in restore_file() */
+ /* ignored file: no restore method */
+ {
+ .file_name = "IGNORE",
+ .file_type = CKPT_FILE_IGNORE,
+ .restore = NULL,
+ },
+ /* regular file/directory */
+ {
+ .file_name = "GENERIC",
+ .file_type = CKPT_FILE_GENERIC,
+ .restore = generic_file_restore,
+ },
+};
+
+/*
+ * restore callback for file pointer
+ *
+ * Reads a CKPT_HDR_FILE header and dispatches on its f_type to the
+ * matching entry of restore_file_ops[]. Returns the restored file, the
+ * error pointer from reading the header, or ERR_PTR(-EINVAL) if the
+ * type is out of range or its entry has no restore method.
+ */
+static void *restore_file(struct ckpt_ctx *ctx)
+{
+	struct restore_file_ops *ops;
+	struct ckpt_hdr_file *h;
+	struct file *file = ERR_PTR(-EINVAL);
+
+	/*
+	 * All 'struct ckpt_hdr_file_...' begin with ckpt_hdr_file,
+	 * but the actual object depends on the file type. The length
+	 * should never be more than page.
+	 */
+	h = ckpt_read_buf_type(ctx, PAGE_SIZE, CKPT_HDR_FILE);
+	if (IS_ERR(h))
+		return (void *)h;
+	ckpt_debug("flags %#x mode %#x type %d\n",
+		   h->f_flags, h->f_mode, h->f_type);
+
+	/*
+	 * f_type comes from the image: bound it by the real size of the
+	 * table as well, so a type without a table entry cannot index
+	 * past the end of restore_file_ops[].
+	 */
+	if (h->f_type >= CKPT_FILE_MAX ||
+	    h->f_type >= ARRAY_SIZE(restore_file_ops))
+		goto out;
+
+	ops = &restore_file_ops[h->f_type];
+	BUG_ON(ops->file_type != h->f_type);
+
+	if (ops->restore)
+		file = ops->restore(ctx, h);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return (void *)file;
+}
+
+/**
+ * restore_file_desc - restore the state of a given file descriptor
+ * @ctx: checkpoint context
+ *
+ * Restores the state of a file descriptor: fetches the file that the
+ * header's objref refers to with ckpt_obj_fetch(), attaches it to a
+ * descriptor, moves it to the saved descriptor number if needed, and
+ * sets the close-on-exec flag.
+ *
+ * Returns 0 on success, -EINVAL for an invalid objref or descriptor,
+ * or the error from reading the header, ckpt_obj_fetch(),
+ * attach_file() or sys_dup2().
+ */
+static int restore_file_desc(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file;
+	int newfd, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_debug("ref %d fd %d c.o.e %d\n",
+		   h->fd_objref, h->fd_descriptor, h->fd_close_on_exec);
+
+	ret = -EINVAL;
+	if (h->fd_objref <= 0 || h->fd_descriptor < 0)
+		goto out;
+
+	file = ckpt_obj_fetch(ctx, h->fd_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto out;
+	}
+
+	newfd = attach_file(file);
+	if (newfd < 0) {
+		ret = newfd;
+		goto out;
+	}
+
+	ckpt_debug("newfd got %d wanted %d\n", newfd, h->fd_descriptor);
+
+	/* reposition if newfd isn't desired fd */
+	if (newfd != h->fd_descriptor) {
+		ret = sys_dup2(newfd, h->fd_descriptor);
+		/*
+		 * newfd was only a stepping stone: close it whether or
+		 * not dup2 worked, so a failure does not leave it open
+		 */
+		sys_close(newfd);
+		if (ret < 0)
+			goto out;
+	}
+
+	set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
+	ret = 0;
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/*
+ * restore callback for file table
+ *
+ * Reads a CKPT_HDR_FILE_TABLE header, closes every descriptor open in
+ * current->files, restores h->fdt_nfds descriptors one by one with
+ * restore_file_desc(), and finally runs ctx->files_deferq.
+ *
+ * Returns current->files with its count incremented on success, or an
+ * ERR_PTR: -EMFILE if fdt_nfds is negative or above sysctl_nr_open,
+ * otherwise the error of the step that failed.
+ */
+static void *restore_file_table(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_file_table *h;
+ struct files_struct *files;
+ int i, ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+ if (IS_ERR(h))
+ return (void *)h;
+
+ ckpt_debug("nfds %d\n", h->fdt_nfds);
+
+ ret = -EMFILE;
+ if (h->fdt_nfds < 0 || h->fdt_nfds > sysctl_nr_open)
+ goto out;
+
+ /*
+ * We assume that restarting tasks, as created in user-space,
+ * have distinct files_struct objects each. If not, we need to
+ * call dup_fd() to make sure we don't overwrite an already
+ * restored one.
+ */
+
+ /* point of no return -- close all file descriptors */
+ ret = close_all_fds(current->files);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < h->fdt_nfds; i++) {
+ ret = restore_file_desc(ctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = deferqueue_run(ctx->files_deferq);
+ ckpt_debug("files_deferq ran %d entries\n", ret);
+ if (ret > 0) /* a positive count of entries run means success */
+ ret = 0;
+ out:
+ ckpt_hdr_put(ctx, h);
+ if (!ret) {
+ files = current->files;
+ atomic_inc(&files->count); /* reference for the pointer handed back */
+ } else {
+ files = ERR_PTR(ret);
+ }
+ return (void *)files;
+}
+
+int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
+{
+ struct files_struct *files; /* table registered under files_objref */
+
+ files = ckpt_obj_fetch(ctx, files_objref, CKPT_OBJ_FILE_TABLE);
+ if (IS_ERR(files))
+ return PTR_ERR(files);
+
+ if (files != current->files) { /* nothing to switch if it is already ours */
+ struct files_struct *prev;
+
+ task_lock(current); /* current->files is swapped with the task locked */
+ prev = current->files;
+ current->files = files;
+ atomic_inc(&files->count); /* reference now held by current->files */
+ task_unlock(current);
+
+ put_files_struct(prev); /* drop the old table only after unlocking */
+ }
+
+ return 0;
+}
+
+/*
+ * fs-related checkpoint objects
+ */
+/* take a reference on a files_struct held in the object hash */
+static int obj_file_table_grab(void *ptr)
+{
+	struct files_struct *files = ptr;
+
+	atomic_inc(&files->count);
+	return 0;
+}
+
+/* release a files_struct reference; @lastref is not used */
+static void obj_file_table_drop(void *ptr, int lastref)
+{
+	struct files_struct *files = ptr;
+
+	put_files_struct(files);
+}
+
+/* current value of the files_struct reference count */
+static int obj_file_table_users(void *ptr)
+{
+	struct files_struct *files = ptr;
+
+	return atomic_read(&files->count);
+}
+
+/* take a reference on a file held in the object hash */
+static int obj_file_grab(void *ptr)
+{
+	struct file *file = ptr;
+
+	get_file(file);
+	return 0;
+}
+
+/* release a file reference; @lastref is not used */
+static void obj_file_drop(void *ptr, int lastref)
+{
+	struct file *file = ptr;
+
+	fput(file);
+}
+
+/* current value of the file's f_count */
+static int obj_file_users(void *ptr)
+{
+	struct file *file = ptr;
+
+	return atomic_long_read(&file->f_count);
+}
+
+/*
+ * files_struct object: reference counting goes through
+ * files_struct->count, checkpoint/restore through the file table
+ * callbacks
+ */
+static const struct ckpt_obj_ops ckpt_obj_files_struct_ops = {
+ .obj_name = "FILE_TABLE",
+ .obj_type = CKPT_OBJ_FILE_TABLE,
+ .ref_drop = obj_file_table_drop,
+ .ref_grab = obj_file_table_grab,
+ .ref_users = obj_file_table_users,
+ .checkpoint = checkpoint_file_table,
+ .restore = restore_file_table,
+};
+
+/*
+ * file object: reference counting goes through file->f_count,
+ * checkpoint/restore through the per-file callbacks
+ */
+static const struct ckpt_obj_ops ckpt_obj_file_ops = {
+ .obj_name = "FILE",
+ .obj_type = CKPT_OBJ_FILE,
+ .ref_drop = obj_file_drop,
+ .ref_grab = obj_file_grab,
+ .ref_users = obj_file_users,
+ .checkpoint = checkpoint_file,
+ .restore = restore_file,
+};
+
+/*
+ * Register the file-table and file object types with the checkpoint
+ * core. Stops at the first registration that fails and returns its
+ * error; returns 0 when both succeed.
+ */
+static __init int checkpoint_register_fs(void)
+{
+	int err = register_checkpoint_obj(&ckpt_obj_files_struct_ops);
+
+	if (err >= 0)
+		err = register_checkpoint_obj(&ckpt_obj_file_ops);
+
+	return err < 0 ? err : 0;
+}
+late_initcall(checkpoint_register_fs);
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49..9cd859e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1995,6 +1995,41 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
EXPORT_SYMBOL(locks_remove_posix);
+/**
+ * find_locks_with_owner - look for a lock on @filp's inode tied to @filp/@owner
+ * @filp: open file whose inode's lock list is scanned
+ * @owner: owner compared against fl_owner of POSIX locks
+ *
+ * POSIX locks match on @owner; flock locks and leases match on @filp.
+ * Any other lock type triggers a WARN and is skipped.
+ *
+ * Returns 0 as soon as a matching lock is found, or -ENOENT when there is
+ * none -- the same "not found" value the inline stub in include/linux/fs.h
+ * returns, so callers see one error code regardless of configuration.
+ */
+int find_locks_with_owner(struct file *filp, fl_owner_t owner)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct file_lock **inode_fl;
+	int ret = -ENOENT;
+
+	lock_kernel();
+	for_each_lock(inode, inode_fl) {
+		struct file_lock *fl = *inode_fl;
+		/*
+		 * We could use posix_same_owner() along with a 'fake'
+		 * file_lock.  But, the fake file will never have the
+		 * same fl_lmops as the fl that we are looking for and
+		 * posix_same_owner() would just fall back to this
+		 * check anyway.
+		 */
+		if (IS_POSIX(fl)) {
+			if (fl->fl_owner == owner) {
+				ret = 0;
+				break;
+			}
+		} else if (IS_FLOCK(fl) || IS_LEASE(fl)) {
+			if (fl->fl_file == filp) {
+				ret = 0;
+				break;
+			}
+		} else {
+			WARN(1, "unknown file lock type, fl_flags: %x",
+			     fl->fl_flags);
+		}
+	}
+	unlock_kernel();
+	return ret;
+}
+
/*
* This function is called on the last close of an open file.
*/
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d438d08..f07209c 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -80,6 +80,9 @@ extern int ckpt_read_payload(struct ckpt_ctx *ctx,
extern char *ckpt_read_string(struct ckpt_ctx *ctx, int max);
extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
+extern char *ckpt_fill_fname(struct path *path, struct path *root,
+ char *buf, int *len);
+
/* ckpt kflags */
#define ckpt_set_ctx_kflag(__ctx, __kflag) \
set_bit(__kflag##_BIT, &(__ctx)->kflags)
@@ -158,6 +161,24 @@ extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
struct task_struct *t);
extern int restore_restart_block(struct ckpt_ctx *ctx);
+/* file table */
+extern int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
+ struct task_struct *t);
+extern int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref);
+
+/* files */
+extern int checkpoint_fname(struct ckpt_ctx *ctx,
+ struct path *path, struct path *root);
+extern struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags);
+
+extern int ckpt_collect_file(struct ckpt_ctx *ctx, struct file *file);
+
+extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
+ struct ckpt_hdr_file *h);
+extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
+ struct ckpt_hdr_file *h);
+
static inline int ckpt_validate_errno(int errno)
{
return (errno >= 0) && (errno < MAX_ERRNO);
@@ -168,6 +189,7 @@ static inline int ckpt_validate_errno(int errno)
#define CKPT_DSYS 0x2 /* generic (system) */
#define CKPT_DRW 0x4 /* image read/write */
#define CKPT_DOBJ 0x8 /* shared objects */
+#define CKPT_DFILE 0x10 /* files and filesystem */
#define CKPT_DDEFAULT 0xffff /* default debug level */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 5545ef6..ec3257a 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,8 @@ enum {
#define CKPT_HDR_TREE CKPT_HDR_TREE
CKPT_HDR_TASK,
#define CKPT_HDR_TASK CKPT_HDR_TASK
+ CKPT_HDR_TASK_OBJS,
+#define CKPT_HDR_TASK_OBJS CKPT_HDR_TASK_OBJS
CKPT_HDR_RESTART_BLOCK,
#define CKPT_HDR_RESTART_BLOCK CKPT_HDR_RESTART_BLOCK
CKPT_HDR_THREAD,
@@ -85,6 +87,15 @@ enum {
/* 201-299: reserved for arch-dependent */
+ CKPT_HDR_FILE_TABLE = 301,
+#define CKPT_HDR_FILE_TABLE CKPT_HDR_FILE_TABLE
+ CKPT_HDR_FILE_DESC,
+#define CKPT_HDR_FILE_DESC CKPT_HDR_FILE_DESC
+ CKPT_HDR_FILE_NAME,
+#define CKPT_HDR_FILE_NAME CKPT_HDR_FILE_NAME
+ CKPT_HDR_FILE,
+#define CKPT_HDR_FILE CKPT_HDR_FILE
+
CKPT_HDR_TAIL = 9001,
#define CKPT_HDR_TAIL CKPT_HDR_TAIL
@@ -111,6 +122,10 @@ struct ckpt_hdr_objref {
enum obj_type {
CKPT_OBJ_IGNORE = 0,
#define CKPT_OBJ_IGNORE CKPT_OBJ_IGNORE
+ CKPT_OBJ_FILE_TABLE,
+#define CKPT_OBJ_FILE_TABLE CKPT_OBJ_FILE_TABLE
+ CKPT_OBJ_FILE,
+#define CKPT_OBJ_FILE CKPT_OBJ_FILE
CKPT_OBJ_MAX
#define CKPT_OBJ_MAX CKPT_OBJ_MAX
};
@@ -193,6 +208,12 @@ struct ckpt_hdr_task {
__u64 clear_child_tid;
} __attribute__((aligned(8)));
+/*
+ * task's shared resources
+ *
+ * Written by checkpoint_task_objs() and read back by restore_task_objs().
+ * @files_objref is the value returned by checkpoint_obj_file_table() and
+ * is handed to restore_obj_file_table() on restart.
+ */
+struct ckpt_hdr_task_objs {
+ struct ckpt_hdr h;
+ __s32 files_objref;
+} __attribute__((aligned(8)));
+
/* restart blocks */
struct ckpt_hdr_restart_block {
struct ckpt_hdr h;
@@ -225,4 +246,42 @@ enum restart_block_type {
#define CKPT_RESTART_BLOCK_FUTEX CKPT_RESTART_BLOCK_FUTEX
};
+/*
+ * file system
+ *
+ * Image header for a file table (record type CKPT_HDR_FILE_TABLE).
+ * NOTE(review): fdt_nfds is presumably the number of file-descriptor
+ * records that follow -- confirm against checkpoint_file_table().
+ */
+struct ckpt_hdr_file_table {
+ struct ckpt_hdr h;
+ __s32 fdt_nfds;
+} __attribute__((aligned(8)));
+
+/*
+ * file descriptors
+ *
+ * Image header for one file descriptor (record type CKPT_HDR_FILE_DESC).
+ * NOTE(review): presumably fd_objref refers to the CKPT_OBJ_FILE object
+ * behind the descriptor, fd_descriptor is the fd number and
+ * fd_close_on_exec its close-on-exec flag -- confirm against the code
+ * that fills this header.
+ */
+struct ckpt_hdr_file_desc {
+ struct ckpt_hdr h;
+ __s32 fd_objref;
+ __s32 fd_descriptor;
+ __u32 fd_close_on_exec;
+} __attribute__((aligned(8)));
+
+/*
+ * File sub-types.  Each enumerator is also #define'd to itself so that
+ * its presence can be tested from the preprocessor.
+ * CKPT_FILE_MAX is the number of defined types, not a type itself.
+ */
+enum file_type {
+ CKPT_FILE_IGNORE = 0,
+#define CKPT_FILE_IGNORE CKPT_FILE_IGNORE
+ CKPT_FILE_GENERIC,
+#define CKPT_FILE_GENERIC CKPT_FILE_GENERIC
+ CKPT_FILE_MAX
+#define CKPT_FILE_MAX CKPT_FILE_MAX
+};
+
+/*
+ * file objects
+ *
+ * Common image header for an open file; it is embedded as the first
+ * member of per-type headers such as struct ckpt_hdr_file_generic.
+ * NOTE(review): f_type presumably holds an enum file_type value and the
+ * remaining f_* fields presumably mirror the same-named struct file
+ * fields -- confirm against checkpoint_file_common().
+ * _padding keeps the following __u64 members 8-byte aligned.
+ */
+struct ckpt_hdr_file {
+ struct ckpt_hdr h;
+ __u32 f_type;
+ __u32 f_mode;
+ __u32 f_flags;
+ __u32 _padding;
+ __u64 f_pos;
+ __u64 f_version;
+} __attribute__((aligned(8)));
+
+/* generic file: carries only the common file header, no extra payload */
+struct ckpt_hdr_file_generic {
+ struct ckpt_hdr_file common;
+} __attribute__((aligned(8)));
+
#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index ce68f54..ce46a59 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -14,6 +14,8 @@
#include <linux/sched.h>
#include <linux/nsproxy.h>
+#include <linux/list.h>
+#include <linux/path.h>
#include <linux/fs.h>
#include <linux/ktime.h>
#include <linux/wait.h>
@@ -40,6 +42,9 @@ struct ckpt_ctx {
atomic_t refcount;
struct ckpt_obj_hash *obj_hash; /* repository for shared objects */
+ struct deferqueue_head *files_deferq; /* deferred file-table work */
+
+ struct path root_fs_path; /* container root (FIXME) */
struct task_struct *tsk;/* checkpoint: current target task */
char err_string[256]; /* checkpoint: error string */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c06c157..c0a59ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1123,6 +1123,7 @@ extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_flock(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
+extern int find_locks_with_owner(struct file *filp, fl_owner_t owner);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int posix_lock_file_wait(struct file *, struct file_lock *);
extern int posix_unblock_lock(struct file *, struct file_lock *);
@@ -1191,6 +1192,11 @@ static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
return;
}
+/*
+ * Stub variant of find_locks_with_owner(): unconditionally reports that
+ * no matching lock exists (-ENOENT).
+ * NOTE(review): presumably this sits in the file-locking-disabled branch
+ * alongside the other empty lock stubs here -- confirm the enclosing #if.
+ */
+static inline int find_locks_with_owner(struct file *filp, fl_owner_t owner)
+{
+ return -ENOENT;
+}
+
static inline void locks_remove_flock(struct file *filp)
{
return;
@@ -2323,6 +2329,10 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes);
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
+#ifdef CONFIG_CHECKPOINT
+extern int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+#endif
+
extern int vfs_readdir(struct file *, filldir_t, void *);
extern int vfs_stat(char __user *, struct kstat *);
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
index 4640e80..a4f6854 100644
--- a/kernel/checkpoint/checkpoint.c
+++ b/kernel/checkpoint/checkpoint.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/fs_struct.h>
#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/utsname.h>
@@ -491,6 +492,7 @@ static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
{
struct task_struct *task;
struct nsproxy *nsproxy;
+ struct fs_struct *fs;
/*
* No need for explicit cleanup here, because if an error
@@ -532,6 +534,15 @@ static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
return -EINVAL; /* cleanup by ckpt_ctx_free() */
}
+ /* root vfs (FIX: WILL CHANGE with mnt-ns etc */
+ task_lock(ctx->root_task);
+ fs = ctx->root_task->fs;
+ read_lock(&fs->lock);
+ ctx->root_fs_path = fs->root;
+ path_get(&ctx->root_fs_path);
+ read_unlock(&fs->lock);
+ task_unlock(ctx->root_task);
+
return 0;
}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
index f17f040..3880fa6 100644
--- a/kernel/checkpoint/process.c
+++ b/kernel/checkpoint/process.c
@@ -103,6 +103,29 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
}
+/*
+ * Checkpoint the shared objects of task @t (currently only its file
+ * table) and write a CKPT_HDR_TASK_OBJS record holding their objrefs.
+ * Returns the result of ckpt_write_obj(), or a negative error code if
+ * the file table could not be checkpointed or the header not allocated.
+ */
+static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task_objs *hdr;
+	int objref;
+	int err;
+
+	objref = checkpoint_obj_file_table(ctx, t);
+	ckpt_debug("files: objref %d\n", objref);
+	if (objref < 0) {
+		ckpt_err(ctx, objref, "%(T)files_struct\n");
+		return objref;
+	}
+
+	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_TASK_OBJS);
+	if (hdr == NULL)
+		return -ENOMEM;
+
+	hdr->files_objref = objref;
+	err = ckpt_write_obj(ctx, &hdr->h);
+	ckpt_hdr_put(ctx, hdr);
+
+	return err;
+}
+
/* dump the task_struct of a given task */
int checkpoint_restart_block(struct ckpt_ctx *ctx, struct task_struct *t)
{
@@ -239,6 +262,10 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
goto out;
ret = checkpoint_cpu(ctx, t);
ckpt_debug("cpu %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = checkpoint_task_objs(ctx, t);
+ ckpt_debug("objs %d\n", ret);
out:
ctx->tsk = NULL;
return ret;
@@ -246,7 +273,11 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
{
- return 0;
+ int ret;
+
+ ret = ckpt_collect_file_table(ctx, t);
+
+ return ret;
}
/***********************************************************************
@@ -316,6 +347,22 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
return ret;
}
+/*
+ * Read the CKPT_HDR_TASK_OBJS record and restore the shared objects it
+ * references (currently only the file table) for the current task.
+ * Returns the result of restore_obj_file_table(), or the error from
+ * reading the header.
+ */
+static int restore_task_objs(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_objs *hdr;
+	int err;
+
+	hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_TASK_OBJS);
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
+
+	err = restore_obj_file_table(ctx, hdr->files_objref);
+	ckpt_debug("file_table: ret %d (%p)\n", err, current->files);
+	ckpt_hdr_put(ctx, hdr);
+
+	return err;
+}
+
int restore_restart_block(struct ckpt_ctx *ctx)
{
struct ckpt_hdr_restart_block *h;
@@ -445,6 +492,10 @@ int restore_task(struct ckpt_ctx *ctx)
goto out;
ret = restore_cpu(ctx);
ckpt_debug("cpu %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = restore_task_objs(ctx);
+ ckpt_debug("objs %d\n", ret);
out:
return ret;
}
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
index 5e84915..0f4a74f 100644
--- a/kernel/checkpoint/sys.c
+++ b/kernel/checkpoint/sys.c
@@ -22,6 +22,7 @@
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
+#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
/*
@@ -198,12 +199,16 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
if (ctx->kflags & CKPT_CTX_RESTART)
restore_debug_free(ctx);
+ if (ctx->files_deferq)
+ deferqueue_destroy(ctx->files_deferq);
+
if (ctx->file)
fput(ctx->file);
if (ctx->logfile)
fput(ctx->logfile);
ckpt_obj_hash_free(ctx);
+ path_put(&ctx->root_fs_path);
if (ctx->tasks_arr)
task_arr_free(ctx);
@@ -262,6 +267,10 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
if (ckpt_obj_hash_alloc(ctx) < 0)
goto err;
+ ctx->files_deferq = deferqueue_create();
+ if (!ctx->files_deferq)
+ goto err;
+
atomic_inc(&ctx->refcount);
return ctx;
err:
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 039/100] c/r: introduce method '->checkpoint()' in struct vm_operations_struct
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (5 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 038/100] c/r: checkpoint and restart open file descriptors Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 041/100] c/r: dump memory address space (private memory) Oren Laadan
` (13 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel, linux-mm
Changelog[v17]
- Forward-declare 'ckpt_ctx' et al., don't use checkpoint_types.h
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
include/linux/mm.h | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 462acaf..4dfaf69 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -20,6 +20,7 @@ struct file_ra_state;
struct user_struct;
struct writeback_control;
struct rlimit;
+struct ckpt_ctx;
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
@@ -221,6 +222,9 @@ struct vm_operations_struct {
int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
const nodemask_t *to, unsigned long flags);
#endif
+#ifdef CONFIG_CHECKPOINT
+ int (*checkpoint)(struct ckpt_ctx *ctx, struct vm_area_struct *vma);
+#endif
};
struct mmu_gather;
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 041/100] c/r: dump memory address space (private memory)
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (6 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 039/100] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 042/100] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
` (12 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-mm, linux-fsdevel
For each vma, there is a 'struct ckpt_vma'; Then comes the actual
contents, in one or more chunks: each chunk begins with a header that
specifies how many pages it holds, then the virtual addresses of all
the dumped pages in that chunk, followed by the actual contents of all
dumped pages. A header with zero number of pages marks the end of the
contents. Then comes the next vma and so on.
To checkpoint a vma, call the ops->checkpoint() method of that vma.
Normally the per-vma function will invoke generic_vma_checkpoint()
which first writes the vma description, followed by the specific
logic to dump the contents of the pages.
Currently for private mapped memory we save the pathname of the file
that is mapped (restart will use it to re-open it and then map it).
Later we change that to reference a file object.
Restoring the memory address space begins with nuking the existing one
of the current process, and then reading the vma state and contents.
Call do_mmap_pgoff() for each vma and then read in the data.
Changelog[v21]:
- Do not include checkpoint_hdr.h explicitly
- Replace __initcall() with late_initcall()
- Merge mm dump/restore into a single patch
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v20]:
- Only use arch_setup_additional_pages() if supported by arch
Changelog[v19]:
- [Serge Hallyn] Checkpoint saved_auxv as u64s
- [Serge Hallyn] do_munmap(): remove unused local vars
Changelog[v19-rc3]:
- Separate __get_dirty_page() into its own patch
- Export filemap_checkpoint()
- [Serge Hallyn] Disallow checkpoint of tasks with aio requests
- Fix compilation failure when !CONFIG_CHECKPOINT (regression)
- [Serge Hallyn] move destroy_mm into mmap.c and remove size check
- [Serge Hallyn] fill vdso (syscall32_setup_pages) for TIF_IA32/x86_64
- Do not hold mmap_sem when reading memory pages on restart
Changelog[v19-rc2]:
- Expose page write functions
- Take mmap_sem() around vma_fill_pgarr() (fix regression)
- Move consider_private_page() to mm/memory.c:__get_dirty_page()
- Expose page write functions
- [Serge Hallyn] Fix return value of read_pages_contents()
Changelog[v19-rc1]:
- [Matt Helsley] Add cpp definitions for enums
- Do not hold mmap_sem while checkpointing vma's
Changelog[v18]:
- Tighten checks on supported vma to checkpoint or restart
- Add a few more ckpt_write_err()s
- [Serge Hallyn] Export filemap_checkpoint() (used later for ext4)
- Use ckpt_collect_file() instead of ckpt_obj_collect() for files
- In collect_mm() use retval from ckpt_obj_collect() to test for
first-time-object
- Tighten checks on supported vma to checkpoint or restart
Changelog[v17]:
- Only collect sub-objects of mm_struct once
- Save mm->{flags,def_flags,saved_auxv}
- Restore mm->{flags,def_flags,saved_auxv}
- Fix bogus warning in do_restore_mm()
Changelog[v16]:
- Precede vaddrs/pages with a buffer header
- Checkpoint mm->exe_file
- Handle shared task->mm
- Restore mm->exe_file
Changelog[v14]:
- Modify the ops->checkpoint method to be much more powerful
- Improve support for VDSO (with special_mapping checkpoint callback)
- Save new field 'vdso' in mm_context
- Revert change to pr_debug(), back to ckpt_debug()
- Check whether calls to ckpt_hbuf_get() fail
- Discard field 'h->parent'
- Introduce per vma-type restore() function
- Merge restart code into same file as checkpoint (memory.c)
- Compare saved 'vdso' field of mm_context with current value
Changelog[v13]:
- pgprot_t is an abstract type; use the proper accessor (fix for
64-bit powerpc, Nathan Lynch <ntl@pobox.com>)
- Avoid access to hh->vma_type after the header is freed (restart)
- Test for no vma's in exit_mmap() before calling unmap_vma() (or
it may crash if restart fails after having removed all vma's)
Changelog[v12]:
- Hide pgarr management inside ckpt_private_vma_fill_pgarr()
- Fix management of pgarr chain reset and alloc/expand: keep empty
pgarr in a pool chain
- Replace obsolete ckpt_debug() with pr_debug()
Changelog[v11]:
- Copy contents of 'init->fs->root' instead of pointing to them.
- Add missing test for VM_MAYSHARE when dumping memory
Changelog[v10]:
- Acquire dcache_lock around call to __d_path() in ckpt_fill_name()
Changelog[v9]:
- Introduce ckpt_ctx_checkpoint() for checkpoint-specific ctx setup
- Test if __d_path() changes mnt/dentry (when crossing filesystem
namespace boundary). for now ckpt_fill_fname() fails the checkpoint.
Changelog[v7]:
- Fix argument given to kunmap_atomic() in memory dump/restore
Changelog[v6]:
- Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
(even though it's not really needed)
Changelog[v5]:
- Improve memory dump code (following Dave Hansen's comments)
- Change dump format (and code) to allow chunks of <vaddrs, pages>
instead of one long list of each
- Fix use of follow_page() to avoid faulting in non-present pages
- Memory restore now maps user pages explicitly to copy data into them,
instead of reading directly to user space; got rid of mprotect_fixup()
Changelog[v4]:
- Use standard list_... for ckpt_pgarr
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
arch/x86/include/asm/checkpoint_hdr.h | 9 +
arch/x86/include/asm/ldt.h | 7 +
arch/x86/kernel/checkpoint.c | 95 +++
fs/aio.c | 17 +
fs/exec.c | 2 +-
include/linux/aio.h | 2 +
include/linux/checkpoint.h | 34 +
include/linux/checkpoint_hdr.h | 62 ++
include/linux/checkpoint_types.h | 7 +
include/linux/mm.h | 19 +
kernel/checkpoint/checkpoint.c | 2 +
kernel/checkpoint/process.c | 15 +
kernel/checkpoint/restart.c | 3 +
kernel/checkpoint/sys.c | 9 +
mm/Makefile | 1 +
mm/checkpoint.c | 1226 +++++++++++++++++++++++++++++++++
mm/filemap.c | 45 ++
mm/mmap.c | 101 +++-
18 files changed, 1650 insertions(+), 6 deletions(-)
create mode 100644 mm/checkpoint.c
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index 6f600dd..292bf50 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -48,6 +48,8 @@
enum {
CKPT_HDR_CPU_FPU = 201,
#define CKPT_HDR_CPU_FPU CKPT_HDR_CPU_FPU
+ CKPT_HDR_MM_CONTEXT_LDT,
+#define CKPT_HDR_MM_CONTEXT_LDT CKPT_HDR_MM_CONTEXT_LDT
};
struct ckpt_hdr_header_arch {
@@ -115,4 +117,11 @@ struct ckpt_hdr_cpu {
#define CKPT_X86_SEG_TLS 0x4000 /* 0100 0000 0000 00xx */
#define CKPT_X86_SEG_LDT 0x8000 /* 100x xxxx xxxx xxxx */
+/*
+ * x86 mm->context header (record type CKPT_HDR_MM_CONTEXT), followed in
+ * the image by a CKPT_HDR_MM_CONTEXT_LDT record with the LDT contents.
+ * As filled by checkpoint_mm_context():
+ *   vdso            - mm->context.vdso
+ *   ldt_entry_size  - LDT_ENTRY_SIZE at checkpoint time
+ *   nldt            - mm->context.size (number of LDT entries)
+ */
+struct ckpt_hdr_mm_context {
+ struct ckpt_hdr h;
+ __u64 vdso;
+ __u32 ldt_entry_size;
+ __u32 nldt;
+} __attribute__((aligned(8)));
+
#endif /* __ASM_X86_CKPT_HDR__H */
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
index 46727eb..f2845f9 100644
--- a/arch/x86/include/asm/ldt.h
+++ b/arch/x86/include/asm/ldt.h
@@ -37,4 +37,11 @@ struct user_desc {
#define MODIFY_LDT_CONTENTS_CODE 2
#endif /* !__ASSEMBLY__ */
+
+#ifdef __KERNEL__
+#include <linux/linkage.h>
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+ unsigned long bytecount);
+#endif
+
#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/kernel/checkpoint.c b/arch/x86/kernel/checkpoint.c
index 3976318..0258245 100644
--- a/arch/x86/kernel/checkpoint.c
+++ b/arch/x86/kernel/checkpoint.c
@@ -13,6 +13,7 @@
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/elf.h>
#include <linux/checkpoint.h>
@@ -206,6 +207,37 @@ int checkpoint_write_header_arch(struct ckpt_ctx *ctx)
return ret;
}
+/*
+ * Dump the x86 mm->context state: a CKPT_HDR_MM_CONTEXT header (vdso
+ * address, LDT entry size, number of LDT entries) followed by the raw
+ * LDT in a CKPT_HDR_MM_CONTEXT_LDT record.  mm->context.lock is held
+ * from the moment the header is filled until the LDT has been written.
+ */
+int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *hdr;
+	int err;
+
+	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_MM_CONTEXT);
+	if (hdr == NULL)
+		return -ENOMEM;
+
+	mutex_lock(&mm->context.lock);
+
+	hdr->vdso = (unsigned long) mm->context.vdso;
+	hdr->ldt_entry_size = LDT_ENTRY_SIZE;
+	hdr->nldt = mm->context.size;
+
+	ckpt_debug("nldt %d vdso %#llx\n", hdr->nldt, hdr->vdso);
+
+	err = ckpt_write_obj(ctx, &hdr->h);
+	ckpt_hdr_put(ctx, hdr);
+
+	/* only emit the LDT payload once the header made it out */
+	if (err >= 0)
+		err = ckpt_write_obj_type(ctx, mm->context.ldt,
+					  mm->context.size * LDT_ENTRY_SIZE,
+					  CKPT_HDR_MM_CONTEXT_LDT);
+
+	mutex_unlock(&mm->context.lock);
+	return err;
+}
+
/**************************************************************************
* Restart
*/
@@ -432,3 +464,66 @@ int restore_read_header_arch(struct ckpt_ctx *ctx)
ckpt_hdr_put(ctx, h);
return ret;
}
+
+/*
+ * restore_mm_context - restore the x86 mm->context (LDT) from the image
+ * @ctx: checkpoint/restart context to read from
+ * @mm:  address space being restored
+ *
+ * Reads the CKPT_HDR_MM_CONTEXT header, validates it against the running
+ * kernel (same vdso address, same LDT entry size, sane entry count) and
+ * then installs the saved LDT entries one by one through modify_ldt().
+ *
+ * Returns a non-negative value on success, -EINVAL if the header does not
+ * validate, or the error from reading the image / installing an entry.
+ */
+int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *h;
+	unsigned int n;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("nldt %d vdso %#lx (%p)\n",
+		   h->nldt, (unsigned long) h->vdso, mm->context.vdso);
+
+	ret = -EINVAL;
+	if (h->vdso != (unsigned long) mm->context.vdso)
+		goto out;
+	if (h->ldt_entry_size != LDT_ENTRY_SIZE)
+		goto out;
+	/*
+	 * nldt comes straight from the image: bound it before it is
+	 * multiplied into a byte count so the product cannot wrap.
+	 */
+	if (h->nldt > LDT_ENTRIES)
+		goto out;
+
+	ret = _ckpt_read_obj_type(ctx, NULL,
+				  h->nldt * LDT_ENTRY_SIZE,
+				  CKPT_HDR_MM_CONTEXT_LDT);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * to utilize the syscall modify_ldt() we first convert the data
+	 * in the checkpoint image from 'struct desc_struct' to 'struct
+	 * user_desc' with reverse logic of include/asm/desc.h:fill_ldt()
+	 */
+	for (n = 0; n < h->nldt; n++) {
+		/* zero-initialise so no field is handed over uninitialised */
+		struct user_desc info = { .entry_number = n };
+		struct desc_struct desc;
+		mm_segment_t old_fs;
+
+		ret = ckpt_kread(ctx, &desc, LDT_ENTRY_SIZE);
+		if (ret < 0)
+			break;
+
+		/* base is split in three: bits 15:0, 23:16 and 31:24 */
+		info.base_addr = desc.base0 | (desc.base1 << 16) |
+				 ((unsigned int) desc.base2 << 24);
+		/* limit is split in two: bits 15:0 and 19:16 */
+		info.limit = desc.limit0 | (desc.limit << 16);
+		info.seg_32bit = desc.d;
+		info.contents = desc.type >> 2;
+		info.read_exec_only = (desc.type >> 1) ^ 1;
+		info.limit_in_pages = desc.g;
+		info.seg_not_present = desc.p ^ 1;
+		info.useable = desc.avl;
+
+		/* 'info' lives in kernel memory: lift the user-pointer check */
+		old_fs = get_fs();
+		set_fs(get_ds());
+		ret = sys_modify_ldt(1, (struct user_desc __user *) &info,
+				     sizeof(info));
+		set_fs(old_fs);
+
+		if (ret < 0)
+			break;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3..b3e1532 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1806,3 +1806,20 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
return ret;
}
+
+/*
+ * Walk mm->ioctx_list under rcu_read_lock() and report whether any
+ * kioctx on it is not marked dead.
+ * Returns -EBUSY if such a context exists, 0 otherwise.
+ */
+int check_for_outstanding_aio(struct mm_struct *mm)
+{
+	struct hlist_node *pos;
+	struct kioctx *ioctx;
+	int err = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ioctx, pos, &mm->ioctx_list, list) {
+		if (ioctx->dead)
+			continue;
+		err = -EBUSY;
+		break;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/fs/exec.c b/fs/exec.c
index 7bacb6a..06f93d8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -769,7 +769,7 @@ ssize_t kernel_write(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_write);
-static int exec_mmap(struct mm_struct *mm)
+int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct * old_mm, *active_mm;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 811dbb3..e0b1808 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -212,6 +212,7 @@ extern void kick_iocb(struct kiocb *iocb);
extern int aio_complete(struct kiocb *iocb, long res, long res2);
struct mm_struct;
extern void exit_aio(struct mm_struct *mm);
+extern int check_for_outstanding_aio(struct mm_struct *mm);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
static inline int aio_put_req(struct kiocb *iocb) { return 0; }
@@ -219,6 +220,7 @@ static inline void kick_iocb(struct kiocb *iocb) { }
static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
struct mm_struct;
static inline void exit_aio(struct mm_struct *mm) { }
+static inline int check_for_outstanding_aio(struct mm_struct *mm) { return 0; }
#endif /* CONFIG_AIO */
static inline struct kiocb *list_kiocb(struct list_head *h)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index f07209c..c9efeb3 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -83,6 +83,9 @@ extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
extern char *ckpt_fill_fname(struct path *path, struct path *root,
char *buf, int *len);
+extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
+extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
+
/* ckpt kflags */
#define ckpt_set_ctx_kflag(__ctx, __kflag) \
set_bit(__kflag##_BIT, &(__ctx)->kflags)
@@ -152,10 +155,12 @@ extern int restore_task(struct ckpt_ctx *ctx);
extern int checkpoint_write_header_arch(struct ckpt_ctx *ctx);
extern int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t);
extern int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int restore_read_header_arch(struct ckpt_ctx *ctx);
extern int restore_thread(struct ckpt_ctx *ctx);
extern int restore_cpu(struct ckpt_ctx *ctx);
+extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
struct task_struct *t);
@@ -179,6 +184,33 @@ extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
struct ckpt_hdr_file *h);
+/* memory */
+extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
+
+extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int vma_objref);
+extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int vma_objref);
+
+extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
+
+extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+
+extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct file *file, struct ckpt_hdr_vma *h);
+
+
+/*
+ * Mask of vm_flags bits that checkpoint/restart treats as unsupported.
+ * Each flag is listed exactly once.
+ */
+#define CKPT_VMA_NOT_SUPPORTED						\
+	(VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB |			\
+	 VM_NONLINEAR | VM_PFNMAP | VM_RESERVED | VM_NORESERVE |	\
+	 VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+
static inline int ckpt_validate_errno(int errno)
{
return (errno >= 0) && (errno < MAX_ERRNO);
@@ -190,6 +222,8 @@ static inline int ckpt_validate_errno(int errno)
#define CKPT_DRW 0x4 /* image read/write */
#define CKPT_DOBJ 0x8 /* shared objects */
#define CKPT_DFILE 0x10 /* files and filesystem */
+#define CKPT_DMEM 0x20 /* memory state */
+#define CKPT_DPAGE 0x40 /* memory pages */
#define CKPT_DDEFAULT 0xffff /* default debug level */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index ec3257a..f2c67ee 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -96,6 +96,15 @@ enum {
CKPT_HDR_FILE,
#define CKPT_HDR_FILE CKPT_HDR_FILE
+ CKPT_HDR_MM = 401,
+#define CKPT_HDR_MM CKPT_HDR_MM
+ CKPT_HDR_VMA,
+#define CKPT_HDR_VMA CKPT_HDR_VMA
+ CKPT_HDR_PGARR,
+#define CKPT_HDR_PGARR CKPT_HDR_PGARR
+ CKPT_HDR_MM_CONTEXT,
+#define CKPT_HDR_MM_CONTEXT CKPT_HDR_MM_CONTEXT
+
CKPT_HDR_TAIL = 9001,
#define CKPT_HDR_TAIL CKPT_HDR_TAIL
@@ -126,6 +135,8 @@ enum obj_type {
#define CKPT_OBJ_FILE_TABLE CKPT_OBJ_FILE_TABLE
CKPT_OBJ_FILE,
#define CKPT_OBJ_FILE CKPT_OBJ_FILE
+ CKPT_OBJ_MM,
+#define CKPT_OBJ_MM CKPT_OBJ_MM
CKPT_OBJ_MAX
#define CKPT_OBJ_MAX CKPT_OBJ_MAX
};
@@ -134,6 +145,8 @@ enum obj_type {
struct ckpt_const {
/* task */
__u16 task_comm_len;
+ /* mm */
+ __u16 at_vector_size;
/* uts */
__u16 uts_release_len;
__u16 uts_version_len;
@@ -212,6 +225,7 @@ struct ckpt_hdr_task {
struct ckpt_hdr_task_objs {
struct ckpt_hdr h;
__s32 files_objref;
+ __s32 mm_objref;
} __attribute__((aligned(8)));
/* restart blocks */
@@ -284,4 +298,52 @@ struct ckpt_hdr_file_generic {
struct ckpt_hdr_file common;
} __attribute__((aligned(8)));
+/*
+ * memory layout
+ *
+ * Image header for an address space (record type CKPT_HDR_MM).
+ * NOTE(review): the fields presumably mirror the same-named members of
+ * struct mm_struct, map_count presumably gives the number of vma records
+ * that follow, and exe_objref presumably references mm->exe_file --
+ * confirm against mm/checkpoint.c.
+ */
+struct ckpt_hdr_mm {
+ struct ckpt_hdr h;
+ __u32 map_count;
+ __s32 exe_objref;
+
+ __u64 def_flags;
+ __u64 flags;
+
+ __u64 start_code, end_code, start_data, end_data;
+ __u64 start_brk, brk, start_stack;
+ __u64 arg_start, arg_end, env_start, env_end;
+} __attribute__((aligned(8)));
+
+/*
+ * vma subtypes - index into restore_vma_dispatch[]
+ *
+ * Each enumerator is also #define'd to itself so that its presence can
+ * be tested from the preprocessor.  CKPT_VMA_MAX is the number of
+ * defined subtypes, not a subtype itself.
+ */
+enum vma_type {
+ CKPT_VMA_IGNORE = 0,
+#define CKPT_VMA_IGNORE CKPT_VMA_IGNORE
+ CKPT_VMA_VDSO, /* special vdso vma */
+#define CKPT_VMA_VDSO CKPT_VMA_VDSO
+ CKPT_VMA_ANON, /* private anonymous */
+#define CKPT_VMA_ANON CKPT_VMA_ANON
+ CKPT_VMA_FILE, /* private mapped file */
+#define CKPT_VMA_FILE CKPT_VMA_FILE
+ CKPT_VMA_MAX
+#define CKPT_VMA_MAX CKPT_VMA_MAX
+};
+
+/*
+ * vma descriptor
+ *
+ * Image header for one vma (record type CKPT_HDR_VMA); the page contents
+ * follow it in CKPT_HDR_PGARR chunks.
+ * NOTE(review): vma_type presumably holds an enum vma_type value and the
+ * vm_* fields presumably mirror the same-named struct vm_area_struct
+ * members -- confirm against generic_vma_checkpoint().
+ */
+struct ckpt_hdr_vma {
+ struct ckpt_hdr h;
+ __u32 vma_type;
+ __s32 vma_objref; /* objref of backing file */
+
+ __u64 vm_start;
+ __u64 vm_end;
+ __u64 vm_page_prot;
+ __u64 vm_flags;
+ __u64 vm_pgoff;
+} __attribute__((aligned(8)));
+
+/*
+ * page array
+ *
+ * Header of one chunk of dumped pages: it is followed by the virtual
+ * addresses of the pages in the chunk and then by their contents.
+ * A header with nr_pages == 0 marks the end of a vma's contents.
+ */
+struct ckpt_hdr_pgarr {
+ struct ckpt_hdr h;
+ __u64 nr_pages; /* number of pages saved in this chunk */
+} __attribute__((aligned(8)));
+
+
#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index ce46a59..e150182 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -15,6 +15,8 @@
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
#include <linux/path.h>
#include <linux/fs.h>
#include <linux/ktime.h>
@@ -52,6 +54,11 @@ struct ckpt_ctx {
int errno; /* errno that caused failure */
struct completion errno_sync; /* protect errno setting */
+ struct list_head pgarr_list; /* page array to dump VMA contents */
+ struct list_head pgarr_pool; /* pool of empty page arrays chain */
+
+ void *scratch_page; /* scratch buffer for page I/O */
+
/* [multi-process checkpoint] */
struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
int nr_tasks; /* size of tasks array */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5e9ad0d..913c7fe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1287,9 +1287,13 @@ out:
}
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+extern int destroy_mm(struct mm_struct *);
extern unsigned long do_brk(unsigned long, unsigned long);
+/* fs/exec.c */
+extern int exec_mmap(struct mm_struct *mm);
+
/* filemap.c */
extern unsigned long page_unuse(struct page *);
extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -1299,10 +1303,25 @@ extern void truncate_inode_pages_range(struct address_space *,
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
+#ifdef CONFIG_CHECKPOINT
+/* generic vm_area_ops exported for mapped files checkpoint */
+extern int filemap_checkpoint(struct ckpt_ctx *, struct vm_area_struct *);
+#endif
+
/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
void task_dirty_inc(struct task_struct *tsk);
+
+/* checkpoint/restart */
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_hdr_vma;
+extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct ckpt_hdr_vma *hh);
+extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct ckpt_hdr_vma *hh);
+#endif
+
/* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
index a4f6854..0f83cca 100644
--- a/kernel/checkpoint/checkpoint.c
+++ b/kernel/checkpoint/checkpoint.c
@@ -111,6 +111,8 @@ static void fill_kernel_const(struct ckpt_const *h)
/* task */
h->task_comm_len = sizeof(tsk->comm);
+ /* mm->saved_auxv size */
+ h->at_vector_size = AT_VECTOR_SIZE;
/* uts */
h->uts_release_len = sizeof(uts->release);
h->uts_version_len = sizeof(uts->version);
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
index 3880fa6..1ec5c6a 100644
--- a/kernel/checkpoint/process.c
+++ b/kernel/checkpoint/process.c
@@ -107,6 +107,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
{
struct ckpt_hdr_task_objs *h;
int files_objref;
+ int mm_objref;
int ret;
files_objref = checkpoint_obj_file_table(ctx, t);
@@ -116,10 +117,18 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
return files_objref;
}
+ mm_objref = checkpoint_obj_mm(ctx, t);
+ ckpt_debug("mm: objref %d\n", mm_objref);
+ if (mm_objref < 0) {
+ ckpt_err(ctx, mm_objref, "%(T)mm_struct\n");
+ return mm_objref;
+ }
+
h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
if (!h)
return -ENOMEM;
h->files_objref = files_objref;
+ h->mm_objref = mm_objref;
ret = ckpt_write_obj(ctx, &h->h);
ckpt_hdr_put(ctx, h);
@@ -276,6 +285,9 @@ int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
int ret;
ret = ckpt_collect_file_table(ctx, t);
+ if (ret < 0)
+ return ret;
+ ret = ckpt_collect_mm(ctx, t);
return ret;
}
@@ -359,6 +371,9 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
ret = restore_obj_file_table(ctx, h->files_objref);
ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
+ ret = restore_obj_mm(ctx, h->mm_objref);
+ ckpt_debug("mm: ret %d (%p)\n", ret, current->mm);
+
ckpt_hdr_put(ctx, h);
return ret;
}
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
index 437de4f..c0fe147 100644
--- a/kernel/checkpoint/restart.c
+++ b/kernel/checkpoint/restart.c
@@ -565,6 +565,9 @@ static int check_kernel_const(struct ckpt_const *h)
/* task */
if (h->task_comm_len != sizeof(tsk->comm))
return -EINVAL;
+ /* mm->saved_auxv size */
+ if (h->at_vector_size != AT_VECTOR_SIZE)
+ return -EINVAL;
/* uts */
if (h->uts_release_len != sizeof(uts->release))
return -EINVAL;
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
index 0f4a74f..b258874 100644
--- a/kernel/checkpoint/sys.c
+++ b/kernel/checkpoint/sys.c
@@ -209,6 +209,7 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
ckpt_obj_hash_free(ctx);
path_put(&ctx->root_fs_path);
+ ckpt_pgarr_free(ctx);
if (ctx->tasks_arr)
task_arr_free(ctx);
@@ -220,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
if (ctx->root_freezer)
put_task_struct(ctx->root_freezer);
+ free_page((unsigned long) ctx->scratch_page);
+
kfree(ctx->pids_arr);
kfree(ctx);
@@ -240,6 +243,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
ctx->ktime_begin = ktime_get();
atomic_set(&ctx->refcount, 0);
+ INIT_LIST_HEAD(&ctx->pgarr_list);
+ INIT_LIST_HEAD(&ctx->pgarr_pool);
init_waitqueue_head(&ctx->waitq);
init_completion(&ctx->complete);
@@ -271,6 +276,10 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
if (!ctx->files_deferq)
goto err;
+ ctx->scratch_page = (void *) __get_free_page(GFP_KERNEL);
+ if (!ctx->scratch_page)
+ goto err;
+
atomic_inc(&ctx->refcount);
return ctx;
err:
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a..e779b69 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,6 +38,7 @@ obj-y += percpu.o
else
obj-y += percpu_up.o
endif
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/checkpoint.c b/mm/checkpoint.c
new file mode 100644
index 0000000..d53025b
--- /dev/null
+++ b/mm/checkpoint.c
@@ -0,0 +1,1226 @@
+/*
+ * Checkpoint/restart memory contents
+ *
+ * Copyright (C) 2008-2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DMEM
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/aio.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/mm_types.h>
+#include <linux/proc_fs.h>
+#include <linux/checkpoint.h>
+
+/*
+ * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
+ * tuples (where vaddr is the virtual address of a page in a particular mm).
+ * Specifically, we use separate arrays so that all vaddrs can be written
+ * and read at once.
+ */
+
+/* one page-array: up to CKPT_PGARR_TOTAL <page, vaddr> tuples */
+struct ckpt_pgarr {
+ unsigned long *vaddrs; /* virtual addresses, CKPT_PGARR_TOTAL slots */
+ struct page **pages; /* matching pages; allocated for checkpoint only, else NULL */
+ unsigned int nr_used; /* number of slots currently in use */
+ struct list_head list; /* link in ctx->pgarr_list or ctx->pgarr_pool */
+};
+
+#define CKPT_PGARR_TOTAL (PAGE_SIZE / sizeof(void *))
+#define CKPT_PGARR_BATCH (16 * CKPT_PGARR_TOTAL)
+
+/* non-zero once every slot of @pgarr is in use */
+static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
+{
+	unsigned int used = pgarr->nr_used;
+
+	return used == CKPT_PGARR_TOTAL;
+}
+
+/* number of slots still available in @pgarr */
+static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
+{
+	unsigned int used = pgarr->nr_used;
+
+	return CKPT_PGARR_TOTAL - used;
+}
+
+/*
+ * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
+ * (common to ckpt_mem.c and rstr_mem.c).
+ *
+ * The checkpoint context structure has two members for page-arrays:
+ * ctx->pgarr_list: list head of populated page-array chain
+ * ctx->pgarr_pool: list head of empty page-array pool chain
+ *
+ * During checkpoint (and restart) the chain tracks the dirty pages (page
+ * pointer and virtual address) of each MM. For a particular MM, these are
+ * always added to the head of the page-array chain (ctx->pgarr_list).
+ * Before the next chunk of pages, the chain is reset (by dereferencing
+ * all pages) but not freed; instead, empty descriptors are kept in pool.
+ *
+ * The head of the chain page-array ("current") advances as necessary. When
+ * it gets full, a new page-array descriptor is pushed in front of it. The
+ * new descriptor is taken from first empty descriptor (if one exists, for
+ * instance, after a chain reset), or allocated on-demand.
+ *
+ * When dumping the data, the chain is traversed in reverse order.
+ */
+
+/* head of the populated chain (the "current" page-array), NULL if empty */
+static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
+{
+	struct list_head *chain = &ctx->pgarr_list;
+
+	return list_empty(chain) ? NULL :
+		list_first_entry(chain, struct ckpt_pgarr, list);
+}
+
+/* unlink and return the first spare page-array of the pool, NULL if none */
+static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *spare = NULL;
+
+	if (!list_empty(&ctx->pgarr_pool)) {
+		spare = list_first_entry(&ctx->pgarr_pool,
+					 struct ckpt_pgarr, list);
+		list_del(&spare->list);
+	}
+	return spare;
+}
+
+/* drop the page references held by a page-array and mark it empty */
+static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
+{
+	int idx;
+
+	ckpt_debug("total pages %d\n", pgarr->nr_used);
+
+	/*
+	 * 'nr_used' is maintained by both checkpoint and restart, but
+	 * pages are only collected (and ->pages only allocated) during
+	 * checkpoint; on restart ->pages is NULL and nothing is released.
+	 */
+	if (pgarr->pages != NULL) {
+		for (idx = pgarr->nr_used; idx-- > 0; )
+			page_cache_release(pgarr->pages[idx]);
+	}
+
+	pgarr->nr_used = 0;
+}
+
+/*
+ * free a single page-array object: drop any page references it still
+ * holds, then free both arrays and the descriptor itself.
+ *
+ * Also used on the error path of pgarr_alloc_one(), where ->vaddrs and/or
+ * ->pages may still be NULL (the descriptor comes from kzalloc()).
+ */
+static void pgarr_free_one(struct ckpt_pgarr *pgarr)
+{
+ pgarr_release_pages(pgarr);
+ kfree(pgarr->pages);
+ kfree(pgarr->vaddrs);
+ kfree(pgarr);
+}
+
+/* unlink and free every page-array linked on @chain */
+static void pgarr_free_chain(struct list_head *chain)
+{
+	struct ckpt_pgarr *cur, *next;
+
+	list_for_each_entry_safe(cur, next, chain, list) {
+		list_del(&cur->list);
+		pgarr_free_one(cur);
+	}
+}
+
+/* release all page-arrays of @ctx: the populated chain, then the pool */
+void ckpt_pgarr_free(struct ckpt_ctx *ctx)
+{
+	pgarr_free_chain(&ctx->pgarr_list);
+	pgarr_free_chain(&ctx->pgarr_pool);
+}
+
+/*
+ * allocate a single page-array object
+ * @flags: context kflags; the ->pages array is allocated only when
+ *	CKPT_CTX_CHECKPOINT is set
+ *
+ * Returns the new (empty) page-array, or NULL if out of memory.
+ */
+static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
+{
+	struct ckpt_pgarr *pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+
+	if (pgarr == NULL)
+		return NULL;
+
+	pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
+				GFP_KERNEL);
+	if (pgarr->vaddrs == NULL)
+		goto fail;
+
+	/* restart has no use for the page pointers */
+	if (!(flags & CKPT_CTX_CHECKPOINT))
+		return pgarr;
+
+	pgarr->pages = kmalloc(CKPT_PGARR_TOTAL * sizeof(struct page *),
+			       GFP_KERNEL);
+	if (pgarr->pages != NULL)
+		return pgarr;
+ fail:
+	pgarr_free_one(pgarr);
+	return NULL;
+}
+
+/*
+ * pgarr_current - get a page-array that has at least one free slot
+ * @ctx: checkpoint context
+ *
+ * If the head of ctx->pgarr_list exists and is not full, return it.
+ * Otherwise take a descriptor from ctx->pgarr_pool, or allocate a new one
+ * if the pool is empty, and push it onto the head of ctx->pgarr_list.
+ * Returns NULL if the allocation fails.
+ */
+static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *head = pgarr_first(ctx);
+	struct ckpt_pgarr *fresh;
+
+	if (head != NULL && !pgarr_is_full(head))
+		return head;
+
+	fresh = pgarr_from_pool(ctx);
+	if (fresh == NULL) {
+		fresh = pgarr_alloc_one(ctx->kflags);
+		if (fresh == NULL)
+			return NULL;
+	}
+
+	list_add(&fresh->list, &ctx->pgarr_list);
+	return fresh;
+}
+
+/*
+ * empty the populated chain: release the pages of every page-array on
+ * ctx->pgarr_list, then move all of them to ctx->pgarr_pool for reuse
+ */
+static void pgarr_reset_all(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *cur;
+
+	list_for_each_entry(cur, &ctx->pgarr_list, list) {
+		pgarr_release_pages(cur);
+	}
+
+	list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
+}
+
+/**************************************************************************
+ * Checkpoint
+ *
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * consider_private_page - return page pointer for dirty pages
+ * @vma - target vma
+ * @addr - page address
+ *
+ * Looks up the page that corresponds to the address in the vma, and
+ * returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or an ERR_PTR on error).
+ *
+ * Thin wrapper: the lookup itself is done by __get_dirty_page().
+ */
+static struct page *consider_private_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return __get_dirty_page(vma, addr);
+}
+
+/**
+ * vma_fill_pgarr - fill a page-array with addr/page tuples
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ * @start - start address (updated)
+ *
+ * Scans @vma from *@start towards vma->vm_end, one page at a time, and
+ * records every page returned by consider_private_page() in the current
+ * page-array of @ctx. Stops at the end of the vma or once at least
+ * CKPT_PGARR_BATCH pages were collected. On return *@start holds the
+ * address at which the scan stopped.
+ *
+ * Returns the number of pages collected, or a negative error value.
+ * Pages already recorded when an error occurs stay on ctx->pgarr_list.
+ */
+static int vma_fill_pgarr(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ unsigned long *start)
+{
+ unsigned long end = vma->vm_end;
+ unsigned long addr = *start;
+ struct ckpt_pgarr *pgarr;
+ int nr_used;
+ int cnt = 0;
+
+ BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+ /*
+ * NOTE(review): @vma was already dereferenced above, so the NULL
+ * tests guarding the lock/unlock below can never be false here.
+ */
+ if (vma)
+ down_read(&vma->vm_mm->mmap_sem);
+ do {
+ pgarr = pgarr_current(ctx);
+ if (!pgarr) {
+ cnt = -ENOMEM;
+ goto out;
+ }
+
+ /* remember the fill level to count what this round adds */
+ nr_used = pgarr->nr_used;
+
+ while (addr < end) {
+ struct page *page;
+
+ page = consider_private_page(vma, addr);
+ if (IS_ERR(page)) {
+ cnt = PTR_ERR(page);
+ goto out;
+ }
+
+ if (page) {
+ _ckpt_debug(CKPT_DPAGE,
+ "got page %#lx\n", addr);
+ pgarr->pages[pgarr->nr_used] = page;
+ pgarr->vaddrs[pgarr->nr_used] = addr;
+ pgarr->nr_used++;
+ }
+
+ addr += PAGE_SIZE;
+
+ /* this page-array is exhausted: fetch another one */
+ if (pgarr_is_full(pgarr))
+ break;
+ }
+
+ cnt += pgarr->nr_used - nr_used;
+
+ } while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
+ out:
+ if (vma)
+ up_read(&vma->vm_mm->mmap_sem);
+ *start = addr;
+ return cnt;
+}
+
+/*
+ * dump the contents of one page: copy it into ctx->scratch_page under a
+ * short kmap_atomic() section (avoids a TLB flush), then write the
+ * scratch copy to the image once the atomic mapping has been dropped
+ */
+int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
+{
+	void *scratch = ctx->scratch_page;
+	void *mapped;
+
+	mapped = kmap_atomic(page, KM_USER1);
+	memcpy(scratch, mapped, PAGE_SIZE);
+	kunmap_atomic(mapped, KM_USER1);
+
+	return ckpt_kwrite(ctx, scratch, PAGE_SIZE);
+}
+
+/**
+ * vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * Writes a CKPT_HDR_BUFFER header sized for @total addresses plus @total
+ * pages, then all virtual addresses, then the contents of all pages. The
+ * chain is walked tail to head, i.e. in the order it was filled.
+ *
+ * Returns the result of the last write, or a negative error value.
+ */
+static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
+{
+	struct ckpt_pgarr *pgarr;
+	int len, idx;
+	int ret = 0;
+
+	if (total == 0)
+		return 0;
+
+	len = total * (sizeof(unsigned long) + PAGE_SIZE);
+	ret = ckpt_write_obj_type(ctx, NULL, len, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	/* pass 1: the addresses */
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		ret = ckpt_kwrite(ctx, pgarr->vaddrs,
+				  pgarr->nr_used * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+	}
+
+	/* pass 2: the data, in the same order */
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		for (idx = 0; idx < pgarr->nr_used; idx++) {
+			ret = checkpoint_dump_page(ctx, pgarr->pages[idx]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/* write one CKPT_HDR_PGARR header announcing @nr_pages pages */
+static int checkpoint_pgarr_hdr(struct ckpt_ctx *ctx, int nr_pages)
+{
+	struct ckpt_hdr_pgarr *hdr;
+	int err;
+
+	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_PGARR);
+	if (!hdr)
+		return -ENOMEM;
+
+	hdr->nr_pages = nr_pages;
+	err = ckpt_write_obj(ctx, &hdr->h);
+	ckpt_hdr_put(ctx, hdr);
+
+	return err;
+}
+
+/**
+ * checkpoint_memory_contents - dump contents of a VMA with private memory
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ *
+ * Collects the pages that need to be dumped, and their virtual
+ * addresses, into the ctx->pgarr_list page-array chain. Then dumps the
+ * addresses, followed by the page contents.
+ */
+static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+				      struct vm_area_struct *vma)
+{
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	int cnt, ret;
+
+	/*
+	 * Work in rounds, each collecting and dumping a batch of roughly
+	 * CKPT_PGARR_BATCH pages. A round has two steps:
+	 *
+	 * (1) scan: walk the PTEs of the vma to collect the pages to dump
+	 * (later we'll also make them COW), keeping the pages and their
+	 * addresses on ctx->pgarr_list.
+	 *
+	 * (2) dump: write a header that says how many pages follow, then
+	 * the addresses of all pages on ctx->pgarr_list, then the actual
+	 * contents of those pages. Finally drop the page references and
+	 * reset the page-array chain.
+	 *
+	 * (Counting first keeps the logic simple. More importantly, the
+	 * split allows a future optimization that reduces application
+	 * downtime by deferring the write-out of the data until after the
+	 * application has been allowed to resume execution).
+	 *
+	 * Once everything was dumped, a header announcing 0 pages marks
+	 * the end of the contents.
+	 */
+
+	while (addr < end) {
+		cnt = vma_fill_pgarr(ctx, vma, &addr);
+		if (cnt < 0)
+			return cnt;
+		if (cnt == 0)
+			break;
+
+		ckpt_debug("collected %d pages\n", cnt);
+
+		ret = checkpoint_pgarr_hdr(ctx, cnt);
+		if (ret < 0)
+			return ret;
+
+		ret = vma_dump_pages(ctx, cnt);
+		if (ret < 0)
+			return ret;
+
+		pgarr_reset_all(ctx);
+	}
+
+	/* terminate the contents with a "0 pages" header */
+	return checkpoint_pgarr_hdr(ctx, 0);
+}
+
+/**
+ * generic_vma_checkpoint - dump metadata of vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ *
+ * Writes a single CKPT_HDR_VMA record describing @vma; no contents.
+ * Returns the result of the write, or -ENOMEM.
+ */
+int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref)
+{
+	struct ckpt_hdr_vma *hdr;
+	int err;
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
+		   vma->vm_start, vma->vm_end, vma->vm_flags, type);
+
+	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_VMA);
+	if (hdr == NULL)
+		return -ENOMEM;
+
+	/* classification */
+	hdr->vma_type = type;
+	hdr->vma_objref = vma_objref;
+
+	/* geometry and attributes, copied from the vma as they are */
+	hdr->vm_start = vma->vm_start;
+	hdr->vm_end = vma->vm_end;
+	hdr->vm_pgoff = vma->vm_pgoff;
+	hdr->vm_flags = vma->vm_flags;
+	hdr->vm_page_prot = pgprot_val(vma->vm_page_prot);
+
+	err = ckpt_write_obj(ctx, &hdr->h);
+	ckpt_hdr_put(ctx, hdr);
+
+	return err;
+}
+
+/**
+ * private_vma_checkpoint - dump contents of private (anon, file) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ *
+ * Writes the vma descriptor and, if that succeeded, the vma contents.
+ * The vma must not be shared (enforced with BUG_ON).
+ */
+int private_vma_checkpoint(struct ckpt_ctx *ctx,
+			   struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref)
+{
+	int err;
+
+	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+	err = generic_vma_checkpoint(ctx, vma, type, vma_objref);
+	if (err >= 0)
+		err = checkpoint_memory_contents(ctx, vma);
+
+	return err;
+}
+
+/**
+ * anonymous_checkpoint - dump contents of private-anonymous vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ *
+ * Called by checkpoint_vmas() for vmas that have no vm_ops. The vma is
+ * saved as type CKPT_VMA_ANON with objref 0 (no backing file).
+ */
+static int anonymous_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma)
+{
+ /* should be private anonymous ... verify that this is the case */
+ BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ BUG_ON(vma->vm_file);
+
+ return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
+}
+
+/*
+ * checkpoint_vmas - checkpoint all vmas of an mm, in address order
+ * @ctx: checkpoint context
+ * @mm: address space to walk
+ *
+ * Returns the number of vmas visited, or a negative error value.
+ */
+static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma, *next;
+ int map_count = 0;
+ int ret = 0;
+
+ /* scratch copy of the vma being processed; zeroed, so vm_end == 0 */
+ vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+ if (!vma)
+ return -ENOMEM;
+
+ /*
+ * Must not hold mm->mmap_sem when writing to image file, so
+ * can't simply traverse the vma list. Instead, use find_vma()
+ * to get the @next and make a local "copy" of it.
+ */
+ while (1) {
+ down_read(&mm->mmap_sem);
+ /* resume the walk at the end of the previously copied vma */
+ next = find_vma(mm, vma->vm_end);
+ if (!next) {
+ up_read(&mm->mmap_sem);
+ break;
+ }
+ /* drop the file reference held on behalf of the previous copy */
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ *vma = *next;
+ /* pin the file so it stays valid after mmap_sem is released */
+ if (vma->vm_file)
+ get_file(vma->vm_file);
+ up_read(&mm->mmap_sem);
+
+ map_count++;
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx\n",
+ vma->vm_start, vma->vm_end, vma->vm_flags);
+
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ ckpt_err(ctx, -ENOSYS, "%(T)vma: bad flags (%#lx)\n",
+ vma->vm_flags);
+ ret = -ENOSYS;
+ break;
+ }
+
+ /* no vm_ops: anonymous; otherwise the vma type must opt in */
+ if (!vma->vm_ops)
+ ret = anonymous_checkpoint(ctx, vma);
+ else if (vma->vm_ops->checkpoint)
+ ret = (*vma->vm_ops->checkpoint)(ctx, vma);
+ else
+ ret = -ENOSYS;
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "%(T)vma: failed\n");
+ break;
+ }
+ /*
+ * The file was collected, but not always checkpointed;
+ * be safe and mark as visited to appease leak detection
+ */
+ if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
+ ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ /* release the reference taken for the last vma that was copied */
+ if (vma->vm_file)
+ fput(vma->vm_file);
+
+ kfree(vma);
+
+ return ret < 0 ? ret : map_count;
+}
+
+#define CKPT_AT_SZ (AT_VECTOR_SIZE * sizeof(u64))
+/*
+ * We always write saved_auxv out as an array of u64s, though it is
+ * an array of u32s on 32-bit arch.
+ */
+/*
+ * write mm->saved_auxv to the image as a buffer of AT_VECTOR_SIZE u64
+ * values, widening each entry as needed
+ */
+static int ckpt_write_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	u64 *wide;
+	int n, err;
+
+	wide = kzalloc(CKPT_AT_SZ, GFP_KERNEL);
+	if (wide == NULL)
+		return -ENOMEM;
+
+	for (n = 0; n < AT_VECTOR_SIZE; n++)
+		wide[n] = mm->saved_auxv[n];
+
+	err = ckpt_write_buffer(ctx, wide, CKPT_AT_SZ);
+	kfree(wide);
+
+	return err;
+}
+
+/*
+ * checkpoint_mm - write one mm_struct to the checkpoint image
+ * @ctx: checkpoint context
+ * @ptr: the mm_struct to save (void * because of the object-ops signature)
+ *
+ * Writes, in order: the ckpt_hdr_mm header, the saved auxv, one record
+ * per vma (with contents for private vmas) and the arch-specific mm
+ * context. Fails with -EBUSY if the mm has outstanding aio, or if the
+ * number of vmas visited differs from the map_count sampled earlier.
+ *
+ * Returns the result of checkpoint_mm_context(), or a negative error.
+ */
+static int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct mm_struct *mm = ptr;
+	struct ckpt_hdr_mm *h;
+	struct file *exe_file = NULL;
+	int ret;
+
+	if (check_for_outstanding_aio(mm)) {
+		ckpt_err(ctx, -EBUSY, "%(T)Outstanding aio\n");
+		return -EBUSY;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
+	if (!h)
+		return -ENOMEM;
+
+	down_read(&mm->mmap_sem);
+
+	h->flags = mm->flags;
+	h->def_flags = mm->def_flags;
+
+	h->start_code = mm->start_code;
+	h->end_code = mm->end_code;
+	h->start_data = mm->start_data;
+	h->end_data = mm->end_data;
+	h->start_brk = mm->start_brk;
+	h->brk = mm->brk;
+	h->start_stack = mm->start_stack;
+	h->arg_start = mm->arg_start;
+	h->arg_end = mm->arg_end;
+	h->env_start = mm->env_start;
+	h->env_end = mm->env_end;
+
+	h->map_count = mm->map_count;
+
+	if (mm->exe_file) {  /* checkpoint the ->exe_file */
+		exe_file = mm->exe_file;
+		get_file(exe_file);
+	}
+
+	/*
+	 * Drop mm->mmap_sem before writing data to checkpoint image
+	 * to avoid reverse locking order (inode must come before mm).
+	 */
+	up_read(&mm->mmap_sem);
+
+	if (exe_file) {
+		h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
+		if (h->exe_objref < 0) {
+			ret = h->exe_objref;
+			goto out;
+		}
+	}
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	/* on failure leave through 'out' so @h and @exe_file are released */
+	ret = ckpt_write_auxv(ctx, mm);
+	if (ret < 0)
+		goto out;
+
+	ret = checkpoint_vmas(ctx, mm);
+	if (ret != h->map_count && ret >= 0)
+		ret = -EBUSY;	/* checkpoint mm leak */
+	if (ret < 0)
+		goto out;
+
+	ret = checkpoint_mm_context(ctx, mm);
+ out:
+	if (exe_file)
+		fput(exe_file);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/*
+ * checkpoint_obj_mm - checkpoint the mm of task @t as a shared object
+ * @ctx: checkpoint context
+ * @t: task whose address space is saved
+ *
+ * Returns the objref assigned to the mm, or a negative error value.
+ */
+int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct mm_struct *mm;
+	int objref;
+
+	/*
+	 * get_task_mm() yields NULL for a task without a user address
+	 * space; neither checkpoint_obj() nor mmput() may see a NULL mm.
+	 */
+	mm = get_task_mm(t);
+	if (!mm) {
+		ckpt_err(ctx, -EBUSY, "%(T)mm_struct missing\n");
+		return -EBUSY;
+	}
+
+	objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
+	mmput(mm);
+
+	return objref;
+}
+
+/***********************************************************************
+ * Collect
+ */
+
+/*
+ * collect_mm - collect an mm and the files it references
+ * @ctx: checkpoint context
+ * @mm: address space to collect
+ *
+ * The first time an mm is seen, its exe_file and the file behind every
+ * file-backed vma are collected too (under mmap_sem held for reading).
+ * Returns a negative error value on failure; otherwise the non-negative
+ * result of the last collect call made.
+ */
+static int collect_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	ret = ckpt_obj_collect(ctx, mm, CKPT_OBJ_MM);
+	if (ret <= 0)
+		return ret;	/* error, or mm already collected */
+
+	/* ret > 0: first encounter with this mm, look inside */
+	down_read(&mm->mmap_sem);
+
+	if (mm->exe_file) {
+		ret = ckpt_collect_file(ctx, mm->exe_file);
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "%(T)mm: collect exe_file\n");
+			goto unlock;
+		}
+	}
+
+	vma = mm->mmap;
+	while (vma) {
+		if (vma->vm_file) {
+			ret = ckpt_collect_file(ctx, vma->vm_file);
+			if (ret < 0) {
+				ckpt_err(ctx, ret,
+					 "%(T)mm: collect vm_file\n");
+				break;
+			}
+		}
+		vma = vma->vm_next;
+	}
+ unlock:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+}
+
+/*
+ * ckpt_collect_mm - collect the mm of task @t (and the files it uses)
+ * @ctx: checkpoint context
+ * @t: task whose address space is collected
+ *
+ * Returns the result of collect_mm(), or -EBUSY if @t has no mm.
+ */
+int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	/*
+	 * get_task_mm() yields NULL for a task without a user address
+	 * space; neither collect_mm() nor mmput() may see a NULL mm.
+	 */
+	mm = get_task_mm(t);
+	if (!mm) {
+		ckpt_err(ctx, -EBUSY, "%(T)mm_struct missing\n");
+		return -EBUSY;
+	}
+
+	ret = collect_mm(ctx, mm);
+	mmput(mm);
+
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ *
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read into the address space of the current process.
+ */
+
+/**
+ * read_pages_vaddrs - read addresses of pages to page-array chain
+ * @ctx - restart context
+ * @nr_pages - number of addresses to read
+ *
+ * Fills page-arrays one after the other, each with as many addresses as
+ * it has free slots, until @nr_pages addresses were read from the image.
+ * Returns 0 on success, or a negative error value.
+ */
+static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
+{
+	struct ckpt_pgarr *pgarr;
+	int chunk, ret;
+
+	while (nr_pages != 0) {
+		pgarr = pgarr_current(ctx);
+		if (pgarr == NULL)
+			return -ENOMEM;
+
+		/* take whatever fits, but no more than what is left */
+		chunk = pgarr_nr_free(pgarr);
+		if (chunk > nr_pages)
+			chunk = nr_pages;
+
+		ret = ckpt_kread(ctx, &pgarr->vaddrs[pgarr->nr_used],
+				 chunk * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+
+		pgarr->nr_used += chunk;
+		nr_pages -= chunk;
+	}
+	return 0;
+}
+
+/*
+ * read one page of data from the image into @page: the data is first
+ * read into ctx->scratch_page and then copied to the page under a short
+ * kmap_atomic() section. Returns 0, or the negative error of the read.
+ */
+int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
+{
+	void *scratch = ctx->scratch_page;
+	void *mapped;
+	int err;
+
+	err = ckpt_kread(ctx, scratch, PAGE_SIZE);
+	if (err < 0)
+		return err;
+
+	mapped = kmap_atomic(page, KM_USER1);
+	memcpy(mapped, scratch, PAGE_SIZE);
+	kunmap_atomic(mapped, KM_USER1);
+
+	return 0;
+}
+
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx - restart context
+ *
+ * For every address recorded on ctx->pgarr_list (walked tail to head,
+ * i.e. in the order the addresses were read), pin the matching page of
+ * the current mm for writing and fill it with the next page of data from
+ * the image.
+ *
+ * Returns 0 on success, or a negative error value.
+ */
+static int read_pages_contents(struct ckpt_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	struct ckpt_pgarr *pgarr;
+	unsigned long *vaddrs;
+	int i, ret = 0;
+
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		vaddrs = pgarr->vaddrs;
+		for (i = 0; i < pgarr->nr_used; i++) {
+			struct page *page;
+
+			/* TODO: do in chunks to reduce mmap_sem overhead */
+			_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
+			down_read(&mm->mmap_sem);
+			ret = get_user_pages(current, mm, vaddrs[i],
+					     1, 1, 1, &page, NULL);
+			up_read(&mm->mmap_sem);
+			if (ret < 0)
+				return ret;
+			/* nothing pinned: @page is not valid, do not use it */
+			if (ret != 1)
+				return -EFAULT;
+
+			ret = restore_read_page(ctx, page);
+			page_cache_release(page);
+
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+/**
+ * restore_memory_contents - restore contents of a VMA with private memory
+ * @ctx - restart context
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
+ *
+ * Returns 0 (or the non-negative result of the last step) on success,
+ * or a negative error value.
+ */
+static int restore_memory_contents(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_pgarr *h;
+	unsigned long nr_pages;
+	int len, ret = 0;
+
+	while (1) {
+		h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+		if (IS_ERR(h)) {
+			/* report the failure instead of a stale 'ret' */
+			ret = PTR_ERR(h);
+			break;
+		}
+
+		ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
+
+		nr_pages = h->nr_pages;
+		ckpt_hdr_put(ctx, h);
+
+		if (!nr_pages)
+			break;
+
+		/*
+		 * nr_pages comes from the image: make sure the buffer
+		 * length computed below still fits in an int.
+		 */
+		if (nr_pages > INT_MAX / (sizeof(unsigned long) + PAGE_SIZE)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
+		ret = _ckpt_read_buffer(ctx, NULL, len);
+		if (ret < 0)
+			break;
+
+		ret = read_pages_vaddrs(ctx, nr_pages);
+		if (ret < 0)
+			break;
+		ret = read_pages_contents(ctx);
+		if (ret < 0)
+			break;
+		pgarr_reset_all(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ *
+ * Returns the PROT_* mask to pass to do_mmap_pgoff() for a vma that was
+ * saved with @orig_vm_flags.
+ */
+static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+ unsigned long vm_prot = 0;
+
+ if (orig_vm_flags & VM_READ)
+ vm_prot |= PROT_READ;
+ if (orig_vm_flags & VM_WRITE)
+ vm_prot |= PROT_WRITE;
+ if (orig_vm_flags & VM_EXEC)
+ vm_prot |= PROT_EXEC;
+ /*
+ * NOTE(review): PROT_SEM is an mmap protection bit, yet it is tested
+ * here against a VM_* flags word - confirm that this is intended.
+ */
+ if (orig_vm_flags & PROT_SEM) /* only (?) with IPC-SHM */
+ vm_prot |= PROT_SEM;
+
+ return vm_prot;
+}
+
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ *
+ * The result always carries MAP_FIXED (the vma goes back to its original
+ * address) and exactly one of MAP_SHARED / MAP_PRIVATE.
+ */
+static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+	unsigned long map_flags = MAP_FIXED;
+
+	map_flags |= (orig_vm_flags & VM_MAYSHARE) ? MAP_SHARED : MAP_PRIVATE;
+
+	if (orig_vm_flags & VM_GROWSDOWN)
+		map_flags |= MAP_GROWSDOWN;
+	if (orig_vm_flags & VM_DENYWRITE)
+		map_flags |= MAP_DENYWRITE;
+	if (orig_vm_flags & VM_EXECUTABLE)
+		map_flags |= MAP_EXECUTABLE;
+
+	return map_flags;
+}
+
+/**
+ * generic_vma_restore - restore a vma
+ * @mm - address space
+ * @file - file to map (NULL for anonymous)
+ * @h - vma header data
+ *
+ * Maps the region described by @h at its original address, taking
+ * mm->mmap_sem for writing around the call to do_mmap_pgoff().
+ *
+ * Returns the mapped address, or an error value encoded in the unsigned
+ * long (callers test it with IS_ERR()).
+ */
+static unsigned long generic_vma_restore(struct mm_struct *mm,
+					 struct file *file,
+					 struct ckpt_hdr_vma *h)
+{
+	unsigned long start, size, pgoff, prot, flags;
+	unsigned long addr;
+
+	if (h->vm_end < h->vm_start || h->vma_objref < 0)
+		return -EINVAL;
+
+	start = h->vm_start;
+	size = h->vm_end - h->vm_start;
+	pgoff = h->vm_pgoff;
+	prot = calc_map_prot_bits(h->vm_flags);
+	flags = calc_map_flags_bits(h->vm_flags);
+
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, start, size, prot, flags, pgoff);
+	up_write(&mm->mmap_sem);
+
+	ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
+		   size, prot, flags, pgoff, addr);
+
+	return addr;
+}
+
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h - vma header data
+ *
+ * Shared vmas are refused with -EINVAL. Otherwise the vma is mapped and
+ * its saved contents are read in from the image.
+ */
+int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			struct file *file, struct ckpt_hdr_vma *h)
+{
+	const unsigned long shared = VM_SHARED | VM_MAYSHARE;
+	unsigned long where;
+
+	if (h->vm_flags & shared)
+		return -EINVAL;
+
+	where = generic_vma_restore(mm, file, h);
+	if (!IS_ERR((void *) where))
+		return restore_memory_contents(ctx);
+
+	return PTR_ERR((void *) where);
+}
+
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h - vma header data (h->vm_pgoff is overwritten with 0)
+ *
+ * restore_vma_ops[] handler for CKPT_VMA_ANON: restores a private
+ * anonymous vma, i.e. a private vma with no backing file.
+ */
+static int anon_private_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ /*
+ * vm_pgoff for anonymous mapping is the "global" page
+ * offset (namely from addr 0x0), so we force a zero
+ */
+ h->vm_pgoff = 0;
+
+ return private_vma_restore(ctx, mm, NULL, h);
+}
+
+/* callbacks to restore vma per its type: */
+struct restore_vma_ops {
+ char *vma_name; /* name printed in debug output */
+ enum vma_type vma_type; /* must equal the entry's index in the table */
+ int (*restore) (struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *ptr); /* NULL: vma is ignored at restart */
+};
+
+/*
+ * Dispatch table, indexed by the vma_type read from the image. The
+ * entries must stay in enum vma_type order: restore_vma() does a BUG_ON()
+ * if an entry's vma_type does not match its index.
+ */
+static struct restore_vma_ops restore_vma_ops[] = {
+ /* ignored vma */
+ {
+ .vma_name = "IGNORE",
+ .vma_type = CKPT_VMA_IGNORE,
+ .restore = NULL,
+ },
+ /* special mapping (vdso) */
+ {
+ .vma_name = "VDSO",
+ .vma_type = CKPT_VMA_VDSO,
+ .restore = special_mapping_restore,
+ },
+ /* anonymous private */
+ {
+ .vma_name = "ANON PRIVATE",
+ .vma_type = CKPT_VMA_ANON,
+ .restore = anon_private_restore,
+ },
+ /* file-mapped private */
+ {
+ .vma_name = "FILE PRIVATE",
+ .vma_type = CKPT_VMA_FILE,
+ .restore = filemap_restore,
+ },
+};
+
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ *
+ * Reads one CKPT_HDR_VMA record, validates it and hands it to the
+ * restore callback registered for its type in restore_vma_ops[]; types
+ * without a callback are skipped.
+ *
+ * Returns the callback's result (0 for a skipped vma), -EINVAL for a
+ * malformed header, -ENOSYS for unsupported vm_flags, or the error from
+ * reading the header.
+ */
+static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_vma *h;
+	struct restore_vma_ops *ops;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d\n",
+		   (unsigned long) h->vm_start, (unsigned long) h->vm_end,
+		   (unsigned long) h->vm_flags, (int) h->vma_type,
+		   (int) h->vma_objref);
+
+	ret = -EINVAL;
+	if (h->vm_end < h->vm_start)
+		goto out;
+	if (h->vma_objref < 0)
+		goto out;
+	if (h->vma_type >= CKPT_VMA_MAX)
+		goto out;
+	if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+		/* leave through 'out' so that @h is released */
+		ret = -ENOSYS;
+		goto out;
+	}
+
+	ops = &restore_vma_ops[h->vma_type];
+
+	/* make sure we don't change this accidentally */
+	BUG_ON(ops->vma_type != h->vma_type);
+
+	if (ops->restore) {
+		ckpt_debug("vma type %s\n", ops->vma_name);
+		ret = ops->restore(ctx, mm, h);
+	} else {
+		ckpt_debug("vma ignored\n");
+		ret = 0;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/*
+ * read the saved auxv (AT_VECTOR_SIZE u64 values) from the image into
+ * mm->saved_auxv. Fails with -E2BIG, leaving saved_auxv untouched, if
+ * any value does not fit in an unsigned long.
+ */
+static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	u64 *wide;
+	int n, err;
+
+	wide = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
+	if (wide == NULL)
+		return -ENOMEM;
+
+	err = _ckpt_read_buffer(ctx, wide, CKPT_AT_SZ);
+	if (err < 0)
+		goto done;
+
+	/* validate everything before touching the mm */
+	for (n = 0; n < AT_VECTOR_SIZE; n++) {
+		if (wide[n] > (u64) ULONG_MAX) {
+			err = -E2BIG;
+			goto done;
+		}
+	}
+
+	for (n = 0; n < AT_VECTOR_SIZE - 1; n++)
+		mm->saved_auxv[n] = wide[n];
+	/* sanitize the input: the last entry is always AT_NULL */
+	mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+
+	err = 0;
+ done:
+	kfree(wide);
+	return err;
+}
+
+/*
+ * restore_mm - rebuild the current task's mm from the checkpoint image
+ * @ctx: restart context
+ *
+ * Reads a ckpt_hdr_mm header and validates it, destroys the contents of
+ * current->mm, and then restores the mm fields, exe_file, auxv, all vmas
+ * and the arch-specific mm context from the image.
+ *
+ * Returns the mm (with an extra mm_users reference) cast to void *, or
+ * an ERR_PTR() on failure.
+ */
+static void *restore_mm(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_mm *h;
+ struct mm_struct *mm = NULL;
+ struct file *file;
+ unsigned int nr;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
+ if (IS_ERR(h))
+ return (void *) h;
+
+ ckpt_debug("map_count %d\n", h->map_count);
+
+ /* XXX need more sanity checks */
+
+ /* all checks below fail with -EINVAL, before the mm is touched */
+ ret = -EINVAL;
+ if ((h->start_code > h->end_code) ||
+ (h->start_data > h->end_data))
+ goto out;
+ if (h->exe_objref < 0)
+ goto out;
+ if (h->def_flags & ~VM_LOCKED)
+ goto out;
+ if (h->flags & ~(MMF_DUMP_FILTER_MASK |
+ ((1 << MMF_DUMP_FILTER_BITS) - 1)))
+ goto out;
+
+ mm = current->mm;
+
+ /* point of no return -- destruct current mm */
+ down_write(&mm->mmap_sem);
+ ret = destroy_mm(mm);
+ if (ret < 0) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ mm->flags = h->flags;
+ mm->def_flags = h->def_flags;
+
+ mm->start_code = h->start_code;
+ mm->end_code = h->end_code;
+ mm->start_data = h->start_data;
+ mm->end_data = h->end_data;
+ mm->start_brk = h->start_brk;
+ mm->brk = h->brk;
+ mm->start_stack = h->start_stack;
+ mm->arg_start = h->arg_start;
+ mm->arg_end = h->arg_end;
+ mm->env_start = h->env_start;
+ mm->env_end = h->env_end;
+
+ /* restore the ->exe_file */
+ if (h->exe_objref) {
+ file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(file)) {
+ up_write(&mm->mmap_sem);
+ ret = PTR_ERR(file);
+ goto out;
+ }
+ set_mm_exe_file(mm, file);
+ }
+ up_write(&mm->mmap_sem);
+
+ ret = ckpt_read_auxv(ctx, mm);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "Error restoring auxv\n");
+ goto out;
+ }
+
+ /* the header is followed by exactly map_count vma records */
+ for (nr = h->map_count; nr; nr--) {
+ ret = restore_vma(ctx, mm);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = restore_mm_context(ctx, mm);
+ out:
+ ckpt_hdr_put(ctx, h);
+ /* on early failure @mm may still be NULL; it is not used below */
+ if (ret < 0)
+ return ERR_PTR(ret);
+ /* restore_obj() expects an extra reference */
+ atomic_inc(&mm->mm_users);
+ return (void *)mm;
+}
+
+/*
+ * restore_obj_mm - make the mm identified by @mm_objref the current mm
+ * @ctx: restart context
+ * @mm_objref: objref of the mm, as recorded in the task-objs header
+ *
+ * Returns 0 on success (including when the fetched mm already is
+ * current->mm), or a negative error value.
+ */
+int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ /* nothing to switch if the fetched mm is already ours */
+ if (mm == current->mm)
+ return 0;
+
+ /* switch the current task over to the fetched mm */
+ ret = exec_mmap(mm);
+ if (ret < 0)
+ return ret;
+
+ /* NOTE(review): presumably the reference for this task's use of the mm - confirm */
+ atomic_inc(&mm->mm_users);
+ return 0;
+}
+
+/*
+ * mm-related checkpoint objects
+ */
+
+/* ref_grab callback: take a user reference on the mm; always returns 0 */
+static int obj_mm_grab(void *ptr)
+{
+	struct mm_struct *mm = ptr;
+
+	atomic_inc(&mm->mm_users);
+	return 0;
+}
+
+/* ref_drop callback: release a user reference; @lastref is not used */
+static void obj_mm_drop(void *ptr, int lastref)
+{
+	struct mm_struct *mm = ptr;
+
+	mmput(mm);
+}
+
+/* ref_users callback: report the current mm_users count of the mm */
+static int obj_mm_users(void *ptr)
+{
+	struct mm_struct *mm = ptr;
+
+	return atomic_read(&mm->mm_users);
+}
+
+/*
+ * mm object: operations for objects of type CKPT_OBJ_MM. References are
+ * counted through mm->mm_users; checkpoint_mm() and restore_mm() save
+ * and rebuild the object.
+ */
+static const struct ckpt_obj_ops ckpt_obj_mm_ops = {
+ .obj_name = "MM",
+ .obj_type = CKPT_OBJ_MM,
+ .ref_drop = obj_mm_drop,
+ .ref_grab = obj_mm_grab,
+ .ref_users = obj_mm_users,
+ .checkpoint = checkpoint_mm,
+ .restore = restore_mm,
+};
+
+/* init-time hook: register the mm object operations with the c/r core */
+static int __init checkpoint_register_mm(void)
+{
+ return register_checkpoint_obj(&ckpt_obj_mm_ops);
+}
+late_initcall(checkpoint_register_mm);
diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda..24d4c54 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/checkpoint.h>
#include "internal.h"
/*
@@ -1590,8 +1591,52 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+#ifdef CONFIG_CHECKPOINT
+int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ int vma_objref;
+
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+ return -ENOSYS;
+ }
+
+ BUG_ON(!file);
+
+ vma_objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
+ if (vma_objref < 0)
+ return vma_objref;
+
+ return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
+}
+EXPORT_SYMBOL(filemap_checkpoint);
+
+int filemap_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ struct file *file;
+ int ret;
+
+ if (h->vma_type == CKPT_VMA_FILE &&
+ (h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+ return -EINVAL;
+
+ file = ckpt_obj_fetch(ctx, h->vma_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = private_vma_restore(ctx, mm, file, h);
+ return ret;
+}
+#endif
+
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = filemap_checkpoint,
+#endif
};
/* This is used for a general mmap of a disk file */
diff --git a/mm/mmap.c b/mm/mmap.c
index f90ea92..9d4891f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/checkpoint.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2009,14 +2010,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap_nocheck(struct mm_struct *mm, unsigned long start, size_t len)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
-
if ((len = PAGE_ALIGN(len)) == 0)
return -EINVAL;
@@ -2090,8 +2088,39 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+ if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ return do_munmap_nocheck(mm, start, len);
+}
+
EXPORT_SYMBOL(do_munmap);
+/*
+ * called with mm->mmap_sem held
+ * only called from checkpoint/memory.c:restore_mm()
+ */
+int destroy_mm(struct mm_struct *mm)
+{
+ struct vm_area_struct *vmnext = mm->mmap;
+ struct vm_area_struct *vma;
+ int ret;
+
+ while (vmnext) {
+ vma = vmnext;
+ vmnext = vmnext->vm_next;
+ ret = do_munmap_nocheck(mm, vma->vm_start,
+ vma->vm_end-vma->vm_start);
+ if (ret < 0) {
+ pr_warning("%s: failed munmap (%d)\n", __func__, ret);
+ return ret;
+ }
+ }
+ return 0;
+}
+
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
int ret;
@@ -2248,7 +2277,7 @@ void exit_mmap(struct mm_struct *mm)
tlb = tlb_gather_mmu(mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ end = vma ? unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL) : 0;
vm_unacct_memory(nr_accounted);
free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
@@ -2414,9 +2443,71 @@ static void special_mapping_close(struct vm_area_struct *vma)
{
}
+#ifdef CONFIG_CHECKPOINT
+/*
+ * FIX:
+ * - checkpoint vdso pages (once per distinct vdso is enough)
+ * - check for compatibility between saved and current vdso
+ * - accommodate for dynamic kernel data in vdso page
+ *
+ * Currently, we require COMPAT_VDSO, which somewhat mitigates the issue
+ */
+static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma)
+{
+ const char *name;
+
+ /*
+ * FIX:
+ * Currently, we only handle the VDSO/vsyscall special mapping.
+ * Even that is very basic - we just skip the contents and
+ * hope for the best in terms of compatibility upon restart.
+ */
+
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED)
+ return -ENOSYS;
+
+ name = arch_vma_name(vma);
+ if (!name || strcmp(name, "[vdso]"))
+ return -ENOSYS;
+
+ return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
+}
+
+int special_mapping_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ int ret = 0;
+
+ /*
+ * FIX:
+ * Currently, we only handle the VDSO/vsyscall special mapping.
+ * Even that is very basic - call arch_setup_additional_pages
+ * requiring the same mapping (start address) as before.
+ */
+
+ BUG_ON(h->vma_type != CKPT_VMA_VDSO);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+ if (test_thread_flag(TIF_IA32))
+ ret = syscall32_setup_pages(NULL, h->vm_start, 0);
+ else
+#endif
+ ret = arch_setup_additional_pages(NULL, h->vm_start, 0);
+#endif
+
+ return ret;
+}
+#endif
+
static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = special_mapping_checkpoint,
+#endif
};
/*
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 042/100] c/r: add generic '->checkpoint' f_op to ext fses
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (7 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 041/100] c/r: dump memory address space (private memory) Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 043/100] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
` (11 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Dave Hansen, linux-ext4, linux-fsdevel,
Oren Laadan
From: Dave Hansen <dave@linux.vnet.ibm.com>
This marks ext[234] as being checkpointable. There are many
more filesystems to do this for, but this is a start.
Changelog[ckpt-v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[ckpt-v19-rc3]:
- Rebase to kernel 2.6.33 (ext2)
Changelog[v1]:
- [Serge Hallyn] Use filemap_checkpoint() in ext4_file_vm_ops
Cc: linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/ext2/dir.c | 3 +++
fs/ext2/file.c | 6 ++++++
fs/ext3/dir.c | 3 +++
fs/ext3/file.c | 3 +++
fs/ext4/dir.c | 3 +++
fs/ext4/file.c | 6 ++++++
6 files changed, 24 insertions(+), 0 deletions(-)
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957..cdcb065 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -722,4 +722,7 @@ const struct file_operations ext2_dir_operations = {
.compat_ioctl = ext2_compat_ioctl,
#endif
.fsync = ext2_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0..975ea9c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -76,6 +76,9 @@ const struct file_operations ext2_file_operations = {
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif /* CONFIG_CHECKPOINT */
};
#ifdef CONFIG_EXT2_FS_XIP
@@ -91,6 +94,9 @@ const struct file_operations ext2_xip_file_operations = {
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif /* CONFIG_CHECKPOINT */
};
#endif
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90..a4ef201 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -48,6 +48,9 @@ const struct file_operations ext3_dir_operations = {
#endif
.fsync = ext3_sync_file, /* BKL held */
.release = ext3_release_dir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index f55df0e..2cf4ef2 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -68,6 +68,9 @@ const struct file_operations ext3_file_operations = {
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d8..9681c2f 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -48,6 +48,9 @@ const struct file_operations ext4_dir_operations = {
#endif
.fsync = ext4_sync_file,
.release = ext4_release_dir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e4..7c7e120 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -85,6 +85,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ext4_page_mkwrite,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = filemap_checkpoint,
+#endif
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -145,6 +148,9 @@ const struct file_operations ext4_file_operations = {
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations ext4_file_inode_operations = {
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 043/100] c/r: add generic '->checkpoint()' f_op to simple devices
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (8 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 042/100] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 044/100] c/r: add checkpoint operation for opened files of generic filesystems Oren Laadan
` (10 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
* /dev/null
* /dev/zero
* /dev/random
* /dev/urandom
Changelog [v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
drivers/char/mem.c | 6 ++++++
drivers/char/random.c | 6 ++++++
2 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f54dab8..4e48384 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -767,6 +767,9 @@ static const struct file_operations null_fops = {
.read = read_null,
.write = write_null,
.splice_write = splice_write_null,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
#ifdef CONFIG_DEVPORT
@@ -783,6 +786,9 @@ static const struct file_operations zero_fops = {
.read = read_zero,
.write = write_zero,
.mmap = mmap_zero,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/*
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2fd3d39..47a224c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1169,6 +1169,9 @@ const struct file_operations random_fops = {
.poll = random_poll,
.unlocked_ioctl = random_ioctl,
.fasync = random_fasync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct file_operations urandom_fops = {
@@ -1176,6 +1179,9 @@ const struct file_operations urandom_fops = {
.write = random_write,
.unlocked_ioctl = random_ioctl,
.fasync = random_fasync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/***************************************************************
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 044/100] c/r: add checkpoint operation for opened files of generic filesystems
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (9 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 043/100] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 046/100] c/r: dump anonymous- and file-mapped- shared memory Oren Laadan
` (9 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, linux-fsdevel
From: Matt Helsley <matthltc@us.ibm.com>
These patches extend the use of the generic file checkpoint operation to
non-extX filesystems which have lseek operations that ensure we can save
and restore the files for later use. Note that this does not include
things like FUSE, network filesystems, or pseudo-filesystem kernel
interfaces.
Only compile and boot tested (on x86-32).
[Oren Laadan] Folded patch series into a single patch; original post
included 36 separate patches for individual filesystems:
[PATCH 01/36] Add the checkpoint operation for affs files and directories.
[PATCH 02/36] Add the checkpoint operation for befs directories.
[PATCH 03/36] Add the checkpoint operation for bfs files and directories.
[PATCH 04/36] Add the checkpoint operation for btrfs files and directories.
[PATCH 05/36] Add the checkpoint operation for cramfs directories.
[PATCH 06/36] Add the checkpoint operation for ecryptfs files and directories.
[PATCH 07/36] Add the checkpoint operation for fat files and directories.
[PATCH 08/36] Add the checkpoint operation for freevxfs directories.
[PATCH 09/36] Add the checkpoint operation for hfs files and directories.
[PATCH 10/36] Add the checkpoint operation for hfsplus files and directories.
[PATCH 11/36] Add the checkpoint operation for hpfs files and directories.
[PATCH 12/36] Add the checkpoint operation for hppfs files and directories.
[PATCH 13/36] Add the checkpoint operation for iso directories.
[PATCH 14/36] Add the checkpoint operation for jffs2 files and directories.
[PATCH 15/36] Add the checkpoint operation for jfs files and directories.
[PATCH 16/36] Add the checkpoint operation for regular nfs files and directories. Skip the various /proc files for now.
[PATCH 17/36] Add the checkpoint operation for ntfs directories.
[PATCH 18/36] Add the checkpoint operation for openromfs directories. Explicitly skip the properties for now.
[PATCH 19/36] Add the checkpoint operation for qnx4 files and directories.
[PATCH 20/36] Add the checkpoint operation for reiserfs files and directories.
[PATCH 21/36] Add the checkpoint operation for romfs directories.
[PATCH 22/36] Add the checkpoint operation for squashfs directories.
[PATCH 23/36] Add the checkpoint operation for sysv filesystem files and directories.
[PATCH 24/36] Add the checkpoint operation for ubifs files and directories.
[PATCH 25/36] Add the checkpoint operation for udf filesystem files and directories.
[PATCH 26/36] Add the checkpoint operation for xfs files and directories.
[PATCH 27/36] Add checkpoint operation for efs directories.
[PATCH 28/36] Add the checkpoint operation for generic, read-only files. At present, some/all files of the following filesystems use this generic definition:
[PATCH 29/36] Add checkpoint operation for minix filesystem files and directories.
[PATCH 30/36] Add checkpoint operations for omfs files and directories.
[PATCH 31/36] Add checkpoint operations for ufs files and directories.
[PATCH 32/36] Add checkpoint operations for ramfs files. NOTE: since simple_dir_operations are shared between multiple filesystems including ramfs, it's not currently possible to checkpoint open ramfs directories.
[PATCH 33/36] Add the checkpoint operation for adfs files and directories.
[PATCH 34/36] Add the checkpoint operation to exofs files and directories.
[PATCH 35/36] Add the checkpoint operation to nilfs2 files and directories.
[PATCH 36/36] Add checkpoint operations for UML host filesystem files and directories.
Changelog[v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v19-rc3]:
- [Suka] Enable C/R while executing over NFS
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/adfs/dir.c | 3 +++
fs/adfs/file.c | 3 +++
fs/affs/dir.c | 3 +++
fs/affs/file.c | 3 +++
fs/befs/linuxvfs.c | 3 +++
fs/bfs/dir.c | 3 +++
fs/bfs/file.c | 3 +++
fs/btrfs/file.c | 3 +++
fs/btrfs/inode.c | 3 +++
fs/btrfs/super.c | 3 +++
fs/cramfs/inode.c | 3 +++
fs/ecryptfs/file.c | 6 ++++++
fs/ecryptfs/miscdev.c | 3 +++
fs/efs/dir.c | 3 +++
fs/exofs/dir.c | 3 +++
fs/exofs/file.c | 3 +++
fs/fat/dir.c | 3 +++
fs/fat/file.c | 3 +++
fs/freevxfs/vxfs_lookup.c | 3 +++
fs/hfs/dir.c | 3 +++
fs/hfs/inode.c | 3 +++
fs/hfsplus/dir.c | 3 +++
fs/hfsplus/inode.c | 3 +++
fs/hostfs/hostfs_kern.c | 6 ++++++
fs/hpfs/dir.c | 3 +++
fs/hpfs/file.c | 3 +++
fs/hppfs/hppfs.c | 6 ++++++
fs/isofs/dir.c | 3 +++
fs/jffs2/dir.c | 3 +++
fs/jffs2/file.c | 3 +++
fs/jfs/file.c | 3 +++
fs/jfs/namei.c | 3 +++
fs/minix/dir.c | 3 +++
fs/minix/file.c | 3 +++
fs/nfs/dir.c | 3 +++
fs/nfs/file.c | 6 ++++++
fs/nilfs2/dir.c | 4 +++-
fs/nilfs2/file.c | 3 +++
fs/ntfs/dir.c | 3 +++
fs/ntfs/file.c | 5 ++++-
fs/omfs/dir.c | 3 +++
fs/omfs/file.c | 3 +++
fs/openpromfs/inode.c | 6 ++++++
fs/qnx4/dir.c | 3 +++
fs/ramfs/file-mmu.c | 3 +++
fs/ramfs/file-nommu.c | 3 +++
fs/read_write.c | 3 +++
fs/reiserfs/dir.c | 3 +++
fs/reiserfs/file.c | 3 +++
fs/romfs/mmap-nommu.c | 3 +++
fs/romfs/super.c | 3 +++
fs/squashfs/dir.c | 5 ++++-
fs/sysv/dir.c | 3 +++
fs/sysv/file.c | 3 +++
fs/ubifs/debug.c | 3 +++
fs/ubifs/dir.c | 3 +++
fs/ubifs/file.c | 3 +++
fs/udf/dir.c | 3 +++
fs/udf/file.c | 3 +++
fs/ufs/dir.c | 3 +++
fs/ufs/file.c | 3 +++
fs/xfs/linux-2.6/xfs_file.c | 6 ++++++
62 files changed, 206 insertions(+), 3 deletions(-)
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f..c205b40 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -198,6 +198,9 @@ const struct file_operations adfs_dir_operations = {
.llseek = generic_file_llseek,
.readdir = adfs_readdir,
.fsync = simple_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34..09ce6c7 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -30,6 +30,9 @@ const struct file_operations adfs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations adfs_file_inode_operations = {
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 8ca8f3a..02511bf 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -22,6 +22,9 @@ const struct file_operations affs_dir_operations = {
.llseek = generic_file_llseek,
.readdir = affs_readdir,
.fsync = affs_file_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c..22577fa 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -36,6 +36,9 @@ const struct file_operations affs_file_operations = {
.release = affs_file_release,
.fsync = affs_file_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations affs_file_inode_operations = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda8..71488ba 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -67,6 +67,9 @@ static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
.readdir = befs_readdir,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct inode_operations befs_dir_inode_operations = {
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aad..18bea30 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -80,6 +80,9 @@ const struct file_operations bfs_dir_operations = {
.readdir = bfs_readdir,
.fsync = simple_fsync,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
extern void dump_imap(const char *, struct super_block *);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3f..844ff41 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -29,6 +29,9 @@ const struct file_operations bfs_file_operations = {
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int bfs_move_block(unsigned long from, unsigned long to,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749..be325e1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1170,4 +1170,7 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc64..7ee1a98 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5954,6 +5954,9 @@ static const struct file_operations btrfs_dir_file_operations = {
#endif
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static struct extent_io_ops btrfs_extent_io_ops = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff..2e31e14 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -829,6 +829,9 @@ static const struct file_operations btrfs_ctl_fops = {
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static struct miscdevice btrfs_misc = {
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index dd3634e..01ee36d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -532,6 +532,9 @@ static const struct file_operations cramfs_directory_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = cramfs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6..e34e59a 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -306,6 +306,9 @@ const struct file_operations ecryptfs_dir_fops = {
.fsync = ecryptfs_fsync,
.fasync = ecryptfs_fasync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct file_operations ecryptfs_main_fops = {
@@ -323,6 +326,9 @@ const struct file_operations ecryptfs_main_fops = {
.fsync = ecryptfs_fsync,
.fasync = ecryptfs_fasync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f61..9aa5d8b 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,9 @@ static const struct file_operations ecryptfs_miscdev_fops = {
.read = ecryptfs_miscdev_read,
.write = ecryptfs_miscdev_write,
.release = ecryptfs_miscdev_release,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static struct miscdevice ecryptfs_miscdev = {
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 7ee6f7e..ff08a3b 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -13,6 +13,9 @@ const struct file_operations efs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = efs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations efs_dir_inode_operations = {
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1c..62347ff 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -667,4 +667,7 @@ const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = exofs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc..ec7e15a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -84,6 +84,9 @@ const struct file_operations exofs_file_operations = {
.flush = exofs_flush,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca..4a4c7bb 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,6 +841,9 @@ const struct file_operations fat_dir_operations = {
.compat_ioctl = fat_compat_dir_ioctl,
#endif
.fsync = fat_file_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159d..38132c2 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -162,6 +162,9 @@ const struct file_operations fat_file_operations = {
.ioctl = fat_generic_ioctl,
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int fat_cont_expand(struct inode *inode, loff_t size)
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049c..41dfea9 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -58,6 +58,9 @@ const struct inode_operations vxfs_dir_inode_ops = {
const struct file_operations vxfs_dir_operations = {
.readdir = vxfs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b861..f4dafc5 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -329,6 +329,9 @@ const struct file_operations hfs_dir_operations = {
.readdir = hfs_readdir,
.llseek = generic_file_llseek,
.release = hfs_dir_release,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations hfs_dir_inode_operations = {
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1..3b10d84 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -607,6 +607,9 @@ static const struct file_operations hfs_file_operations = {
.fsync = file_fsync,
.open = hfs_file_open,
.release = hfs_file_release,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct inode_operations hfs_file_inode_operations = {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f40236..7656143 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -497,4 +497,7 @@ const struct file_operations hfsplus_dir_operations = {
.ioctl = hfsplus_ioctl,
.llseek = generic_file_llseek,
.release = hfsplus_dir_release,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597..43a6da2 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -286,6 +286,9 @@ static const struct file_operations hfsplus_file_operations = {
.open = hfsplus_file_open,
.release = hfsplus_file_release,
.ioctl = hfsplus_ioctl,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8..1514aee 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -427,12 +427,18 @@ static const struct file_operations hostfs_file_fops = {
.open = hostfs_file_open,
.release = NULL,
.fsync = hostfs_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
.readdir = hostfs_readdir,
.read = generic_read_dir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
int hostfs_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130..a2e95fe 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -323,4 +323,7 @@ const struct file_operations hpfs_dir_ops =
.readdir = hpfs_readdir,
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff..7a78261 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -139,6 +139,9 @@ const struct file_operations hpfs_file_ops =
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations hpfs_file_iops =
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8..d059b38 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -546,6 +546,9 @@ static const struct file_operations hppfs_file_fops = {
.read = hppfs_read,
.write = hppfs_write,
.open = hppfs_open,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
struct hppfs_dirent {
@@ -597,6 +600,9 @@ static const struct file_operations hppfs_dir_fops = {
.readdir = hppfs_readdir,
.open = hppfs_dir_open,
.fsync = hppfs_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b..fe41d7f 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -274,6 +274,9 @@ const struct file_operations isofs_dir_operations =
{
.read = generic_read_dir,
.readdir = isofs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/*
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417..482e34a 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -41,6 +41,9 @@ const struct file_operations jffs2_dir_operations =
.unlocked_ioctl=jffs2_ioctl,
.fsync = jffs2_fsync,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c1..c2bcb4d 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -49,6 +49,9 @@ const struct file_operations jffs2_file_operations =
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/* jffs2_file_inode_operations */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982..cda38cb 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -143,4 +143,7 @@ const struct file_operations jfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = jfs_compat_ioctl,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f3..83f2490 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1571,6 +1571,9 @@ const struct file_operations jfs_dir_operations = {
.compat_ioctl = jfs_compat_ioctl,
#endif
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731..715dd03 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -23,6 +23,9 @@ const struct file_operations minix_dir_operations = {
.read = generic_read_dir,
.readdir = minix_readdir,
.fsync = simple_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static inline void dir_put_page(struct page *page)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e6..8e8f6a9 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -21,6 +21,9 @@ const struct file_operations minix_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations minix_file_inode_operations = {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index be46f26..ac4c291 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -63,6 +63,9 @@ const struct file_operations nfs_dir_operations = {
.open = nfs_opendir,
.release = nfs_release,
.fsync = nfs_fsync_dir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations nfs_dir_inode_operations = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8d965bd..456e861 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -78,6 +78,9 @@ const struct file_operations nfs_file_operations = {
.splice_write = nfs_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations nfs_file_inode_operations = {
@@ -583,6 +586,9 @@ out_unlock:
static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nfs_vm_page_mkwrite,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = filemap_checkpoint,
+#endif
};
static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89df..d677449 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -702,5 +702,7 @@ const struct file_operations nilfs_dir_operations = {
.compat_ioctl = nilfs_ioctl,
#endif /* CONFIG_COMPAT */
.fsync = nilfs_sync_file,
-
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df..9306d6f 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -149,6 +149,9 @@ const struct file_operations nilfs_file_operations = {
/* .release = nilfs_release_file, */
.fsync = nilfs_sync_file,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations nilfs_file_inode_operations = {
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3f..229d00b 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1573,4 +1573,7 @@ const struct file_operations ntfs_dir_ops = {
/*.ioctl = ,*/ /* Perform function on the
mounted filesystem. */
.open = ntfs_dir_open, /* Open directory. */
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f09..303ee8e 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2225,7 +2225,7 @@ const struct file_operations ntfs_file_ops = {
mounted filesystem. */
.mmap = generic_file_mmap, /* Mmap file. */
.open = ntfs_file_open, /* Open file. */
- .splice_read = generic_file_splice_read /* Zero-copy data send with
+ .splice_read = generic_file_splice_read, /* Zero-copy data send with
the data source being on
the ntfs partition. We do
not need to care about the
@@ -2235,6 +2235,9 @@ const struct file_operations ntfs_file_ops = {
on the ntfs partition. We
do not need to care about
the data source. */
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b42d624..3b08c84 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -502,4 +502,7 @@ const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
.readdir = omfs_readdir,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c..fb5fa02 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -331,6 +331,9 @@ const struct file_operations omfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations omfs_file_inops = {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f..f5e4649 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -160,6 +160,9 @@ static const struct file_operations openpromfs_prop_ops = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = NULL,
+#endif
};
static int openpromfs_readdir(struct file *, void *, filldir_t);
@@ -168,6 +171,9 @@ static const struct file_operations openprom_operations = {
.read = generic_read_dir,
.readdir = openpromfs_readdir,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d..3414bf8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -80,6 +80,9 @@ const struct file_operations qnx4_dir_operations =
.read = generic_read_dir,
.readdir = qnx4_readdir,
.fsync = simple_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613c..34dc7f2 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -47,6 +47,9 @@ const struct file_operations ramfs_file_operations = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad8..7aa0d43 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -46,6 +46,9 @@ const struct file_operations ramfs_file_operations = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 67b7d83..d566214 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -27,6 +27,9 @@ const struct file_operations generic_ro_fops = {
.aio_read = generic_file_aio_read,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
EXPORT_SYMBOL(generic_ro_fops);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index f8a6075..1fb4ce6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -25,6 +25,9 @@ const struct file_operations reiserfs_dir_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = reiserfs_compat_ioctl,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c127..5b2d720 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -297,6 +297,9 @@ const struct file_operations reiserfs_file_operations = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index f0511e8..f488a8a 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -72,4 +72,7 @@ const struct file_operations romfs_ro_fops = {
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
.get_unmapped_area = romfs_get_unmapped_area,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d2135..6a07e29 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,9 @@ error:
static const struct file_operations romfs_dir_operations = {
.read = generic_read_dir,
.readdir = romfs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct inode_operations romfs_dir_inode_operations = {
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933a..1bc7cc0 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,8 @@ failed_read:
const struct file_operations squashfs_dir_ops = {
.read = generic_read_dir,
- .readdir = squashfs_readdir
+ .readdir = squashfs_readdir,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286..e1030d4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -25,6 +25,9 @@ const struct file_operations sysv_dir_operations = {
.read = generic_read_dir,
.readdir = sysv_readdir,
.fsync = simple_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c0..78b7e65 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -28,6 +28,9 @@ const struct file_operations sysv_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct inode_operations sysv_file_inode_operations = {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68ba..b357ca3 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2625,6 +2625,9 @@ static const struct file_operations dfs_fops = {
.open = open_debugfs_file,
.write = write_debugfs_file,
.owner = THIS_MODULE,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
/**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503..a72070e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1228,4 +1228,7 @@ const struct file_operations ubifs_dir_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf7..566e172 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1583,4 +1583,7 @@ const struct file_operations ubifs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a43..43bf889 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -211,4 +211,7 @@ const struct file_operations udf_dir_operations = {
.readdir = udf_readdir,
.ioctl = udf_ioctl,
.fsync = simple_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46c..0c73a47 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -216,6 +216,9 @@ const struct file_operations udf_file_operations = {
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
int udf_setattr(struct dentry *dentry, struct iattr *iattr)
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d4..9841034 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -668,4 +668,7 @@ const struct file_operations ufs_dir_operations = {
.readdir = ufs_readdir,
.fsync = simple_fsync,
.llseek = generic_file_llseek,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962ce..f180014 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -44,4 +44,7 @@ const struct file_operations ufs_file_operations = {
.open = dquot_file_open,
.fsync = simple_fsync,
.splice_read = generic_file_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bc..8aea381 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -1015,6 +1015,9 @@ const struct file_operations xfs_file_operations = {
#ifdef HAVE_FOP_OPEN_EXEC
.open_exec = xfs_file_open_exec,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
const struct file_operations xfs_dir_file_operations = {
@@ -1027,6 +1030,9 @@ const struct file_operations xfs_dir_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.fsync = xfs_file_fsync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = generic_file_checkpoint,
+#endif
};
static const struct vm_operations_struct xfs_file_vm_ops = {
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 046/100] c/r: dump anonymous- and file-mapped- shared memory
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (10 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 044/100] c/r: add checkpoint operation for opened files of generic filesystems Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 047/100] splice: export pipe/file-to-pipe/file functionality Oren Laadan
` (8 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-mm, linux-fsdevel
We now handle anonymous and file-mapped shared memory. Support for IPC
shared memory requires support for IPC first. We extend ckpt_write_vma()
to detect shared memory VMAs and handle them separately from private
memory.
There is not much to do for file-mapped shared memory, except to force
msync() on the region to ensure that the file system is consistent
with the checkpoint image. Use our internal type CKPT_VMA_SHM_FILE.
Anonymous shared memory is always backed by an inode in the shmem filesystem.
We use that inode to look up the VMA in the objhash and register it if
not found (on first encounter). In this case, the type of the VMA is
CKPT_VMA_SHM_ANON, and we dump the contents. On the other hand, if it is
found there, we must have already saved it before, so we change the
type to CKPT_VMA_SHM_ANON_SKIP and skip it.
To dump the contents of a shmem VMA, we loop through the pages of the
inode in the shmem filesystem, and dump the contents of each dirty
(allocated) page - unallocated pages must be clean.
Note that we save the original size of a shmem VMA because it may have
been re-mapped partially. The format itself remains like with private
VMAs, except that instead of addresses we record _indices_ (page nr)
into the backing inode.
During restore, the bulk of the work is in ckpt_read_vma(), which has
been refactored: the part that creates the suitable 'struct file *' for
the mapping is now larger and moved to a separate function. What's
left is to read the VMA description, get the file pointer, create the
mapping, and proceed to read the contents in.
Both anonymous shared VMAs that have been read earlier (as indicated
by a look up to objhash) and file-mapped shared VMAs are skipped.
Anonymous shared VMAs seen for the first time have their contents read
in directly to the backing inode, as indexed by the page numbers (as
opposed to virtual addresses).
Changelog[v21]:
- Replace __initcall() with late_initcall()
- Merge shmem dump/restore into a single patch
- [Serge Hallyn] s390: Register inode checkpoint ops in a separate
__initcall since we don't need to be in the early init paths.
Also fixes a bug on s390 where CKPT_OBJ_INODE wouldn't get
registered because of early return predicated on hashdist.
Changelog[v19-rc3]:
- Rebase to kernel 2.6.33
Changelog[v19-rc1]:
- [Matt Helsley] Add cpp definitions for enums
Changelog[v18]:
- Mark the backing file as visited at checkpoint
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/inode.c | 26 +++++
include/linux/checkpoint.h | 21 +++-
include/linux/checkpoint_hdr.h | 12 +++
include/linux/mm.h | 2 +
kernel/checkpoint/objhash.c | 2 +
mm/checkpoint.c | 209 +++++++++++++++++++++++++++++++++-------
mm/filemap.c | 52 ++++++++++-
mm/mmap.c | 2 +-
mm/shmem.c | 84 ++++++++++++++++
9 files changed, 368 insertions(+), 42 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 407bf39..3496c51 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,7 @@
#include <linux/mount.h>
#include <linux/async.h>
#include <linux/posix_acl.h>
+#include <linux/checkpoint.h>
/*
* This is needed for the following functions:
@@ -1560,6 +1561,31 @@ void __init inode_init_early(void)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
+#ifdef CONFIG_CHECKPOINT
+static int obj_inode_grab(void *ptr)
+{
+ return igrab((struct inode *) ptr) ? 0 : -EBADF;
+}
+
+static void obj_inode_drop(void *ptr, int lastref)
+{
+ iput((struct inode *) ptr);
+}
+
+static const struct ckpt_obj_ops ckpt_obj_inode_ops = {
+ .obj_name = "INODE",
+ .obj_type = CKPT_OBJ_INODE,
+ .ref_drop = obj_inode_drop,
+ .ref_grab = obj_inode_grab,
+};
+
+static int __init inode_checkpoint_init(void)
+{
+ return register_checkpoint_obj(&ckpt_obj_inode_ops);
+}
+late_initcall(inode_checkpoint_init);
+#endif
+
void __init inode_init(void)
{
int loop;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index c9efeb3..24ad717 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -190,26 +190,35 @@ extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
struct vm_area_struct *vma,
enum vma_type type,
- int vma_objref);
+ int vma_objref, int ino_objref);
extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
struct vm_area_struct *vma,
enum vma_type type,
int vma_objref);
+extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+ struct vm_area_struct *vma,
+ enum vma_type type,
+ int ino_objref);
extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern unsigned long generic_vma_restore(struct mm_struct *mm,
+ struct file *file,
+ struct ckpt_hdr_vma *h);
+
extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
struct file *file, struct ckpt_hdr_vma *h);
+extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
+
-#define CKPT_VMA_NOT_SUPPORTED \
- (VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB | \
- VM_NONLINEAR | VM_PFNMAP | VM_RESERVED | VM_NORESERVE \
- | VM_HUGETLB | VM_NONLINEAR | VM_MAPPED_COPY | \
- VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+#define CKPT_VMA_NOT_SUPPORTED \
+ (VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP | \
+ VM_RESERVED | VM_NORESERVE | VM_HUGETLB | VM_NONLINEAR | \
+ VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
static inline int ckpt_validate_errno(int errno)
{
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f2c67ee..86cab42 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -131,6 +131,8 @@ struct ckpt_hdr_objref {
enum obj_type {
CKPT_OBJ_IGNORE = 0,
#define CKPT_OBJ_IGNORE CKPT_OBJ_IGNORE
+ CKPT_OBJ_INODE,
+#define CKPT_OBJ_INODE CKPT_OBJ_INODE
CKPT_OBJ_FILE_TABLE,
#define CKPT_OBJ_FILE_TABLE CKPT_OBJ_FILE_TABLE
CKPT_OBJ_FILE,
@@ -224,6 +226,7 @@ struct ckpt_hdr_task {
/* task's shared resources */
struct ckpt_hdr_task_objs {
struct ckpt_hdr h;
+
__s32 files_objref;
__s32 mm_objref;
} __attribute__((aligned(8)));
@@ -322,6 +325,12 @@ enum vma_type {
#define CKPT_VMA_ANON CKPT_VMA_ANON
CKPT_VMA_FILE, /* private mapped file */
#define CKPT_VMA_FILE CKPT_VMA_FILE
+ CKPT_VMA_SHM_ANON, /* shared anonymous */
+#define CKPT_VMA_SHM_ANON CKPT_VMA_SHM_ANON
+ CKPT_VMA_SHM_ANON_SKIP, /* shared anonymous (skip contents) */
+#define CKPT_VMA_SHM_ANON_SKIP CKPT_VMA_SHM_ANON_SKIP
+ CKPT_VMA_SHM_FILE, /* shared mapped file, only msync */
+#define CKPT_VMA_SHM_FILE CKPT_VMA_SHM_FILE
CKPT_VMA_MAX
#define CKPT_VMA_MAX CKPT_VMA_MAX
};
@@ -331,6 +340,9 @@ struct ckpt_hdr_vma {
struct ckpt_hdr h;
__u32 vma_type;
__s32 vma_objref; /* objref of backing file */
+ __s32 ino_objref; /* objref of shared segment */
+ __u32 _padding;
+ __u64 ino_size; /* size of shared segment */
__u64 vm_start;
__u64 vm_end;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5ebb781..31520e5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1331,6 +1331,8 @@ extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
struct ckpt_hdr_vma *hh);
extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
struct ckpt_hdr_vma *hh);
+extern int shmem_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct ckpt_hdr_vma *hh);
#endif
/* readahead.c */
diff --git a/kernel/checkpoint/objhash.c b/kernel/checkpoint/objhash.c
index 75bf2da..1d78dbf 100644
--- a/kernel/checkpoint/objhash.c
+++ b/kernel/checkpoint/objhash.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/file.h>
#include <linux/checkpoint.h>
struct ckpt_obj {
@@ -45,6 +46,7 @@ static const struct ckpt_obj_ops ckpt_obj_ignored_ops = {
.ref_grab = NULL,
};
+/* objects array */
static const struct ckpt_obj_ops *ckpt_obj_ops[CKPT_OBJ_MAX] = {
[CKPT_OBJ_IGNORE] = &ckpt_obj_ignored_ops,
};
diff --git a/mm/checkpoint.c b/mm/checkpoint.c
index d53025b..6d71180 100644
--- a/mm/checkpoint.c
+++ b/mm/checkpoint.c
@@ -22,6 +22,7 @@
#include <linux/pagemap.h>
#include <linux/mm_types.h>
#include <linux/proc_fs.h>
+#include <linux/swap.h>
#include <linux/checkpoint.h>
/*
@@ -227,6 +228,54 @@ static struct page *consider_private_page(struct vm_area_struct *vma,
}
/**
+ * consider_shared_page - return page pointer for dirty pages
+ * @ino - inode of shmem object
+ * @idx - page index in shmem object
+ *
+ * Looks up the page that corresponds to the index in the shmem object,
+ * and returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
+{
+ struct page *page = NULL;
+ int ret;
+
+ /*
+ * Inspired by do_shmem_file_read(): very simplified version.
+ *
+ * FIXME: consolidate with do_shmem_file_read()
+ */
+
+ ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ /*
+ * Only care about dirty pages; shmem_getpage() only returns
+ * pages that have been allocated, so they must be dirty. The
+ * pages returned are locked and referenced.
+ */
+
+ if (page) {
+ unlock_page(page);
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(ino->i_mapping))
+ flush_dcache_page(page);
+ /*
+ * Mark the page accessed if we read the beginning.
+ */
+ mark_page_accessed(page);
+ }
+
+ return page;
+}
+
+/**
* vma_fill_pgarr - fill a page-array with addr/page tuples
* @ctx - checkpoint context
* @vma - vma to scan
@@ -235,16 +284,15 @@ static struct page *consider_private_page(struct vm_area_struct *vma,
* Returns the number of pages collected
*/
static int vma_fill_pgarr(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long *start)
+ struct vm_area_struct *vma, struct inode *inode,
+ unsigned long *start, unsigned long end)
{
- unsigned long end = vma->vm_end;
unsigned long addr = *start;
struct ckpt_pgarr *pgarr;
int nr_used;
int cnt = 0;
- BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+ BUG_ON(inode && vma);
if (vma)
down_read(&vma->vm_mm->mmap_sem);
@@ -260,7 +308,11 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
while (addr < end) {
struct page *page;
- page = consider_private_page(vma, addr);
+ if (vma)
+ page = consider_private_page(vma, addr);
+ else
+ page = consider_shared_page(inode, addr);
+
if (IS_ERR(page)) {
cnt = PTR_ERR(page);
goto out;
@@ -274,7 +326,10 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
pgarr->nr_used++;
}
- addr += PAGE_SIZE;
+ if (vma)
+ addr += PAGE_SIZE;
+ else
+ addr++;
if (pgarr_is_full(pgarr))
break;
@@ -341,23 +396,32 @@ static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
}
/**
- * checkpoint_memory_contents - dump contents of a VMA with private memory
+ * checkpoint_memory_contents - dump contents of a memory region
* @ctx - checkpoint context
- * @vma - vma to scan
+ * @vma - vma to scan (--or--)
+ * @inode - inode to scan
*
* Collect lists of pages that needs to be dumped, and corresponding
* virtual addresses into ctx->pgarr_list page-array chain. Then dump
* the addresses, followed by the page contents.
*/
static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma,
+ struct inode *inode)
{
struct ckpt_hdr_pgarr *h;
unsigned long addr, end;
int cnt, ret;
- addr = vma->vm_start;
- end = vma->vm_end;
+ BUG_ON(vma && inode);
+
+ if (vma) {
+ addr = vma->vm_start;
+ end = vma->vm_end;
+ } else {
+ addr = 0;
+ end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
+ }
/*
* Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
@@ -383,7 +447,7 @@ static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
*/
while (addr < end) {
- cnt = vma_fill_pgarr(ctx, vma, &addr);
+ cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
if (cnt == 0)
break;
else if (cnt < 0)
@@ -427,7 +491,7 @@ static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
* @vma_objref: vma objref
*/
int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
- enum vma_type type, int vma_objref)
+ enum vma_type type, int vma_objref, int ino_objref)
{
struct ckpt_hdr_vma *h;
int ret;
@@ -441,6 +505,13 @@ int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
h->vma_type = type;
h->vma_objref = vma_objref;
+ h->ino_objref = ino_objref;
+
+ if (vma->vm_file)
+ h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
+ else
+ h->ino_size = 0;
+
h->vm_start = vma->vm_start;
h->vm_end = vma->vm_end;
h->vm_page_prot = pgprot_val(vma->vm_page_prot);
@@ -468,10 +539,37 @@ int private_vma_checkpoint(struct ckpt_ctx *ctx,
BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
- ret = generic_vma_checkpoint(ctx, vma, type, vma_objref);
+ ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
+ if (ret < 0)
+ goto out;
+ ret = checkpoint_memory_contents(ctx, vma, NULL);
+ out:
+ return ret;
+}
+
+/**
+ * shmem_vma_checkpoint - dump contents of a shared memory vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @ino_objref: objref of the backing inode
+ */
+int shmem_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
+ enum vma_type type, int ino_objref)
+{
+ struct file *file = vma->vm_file;
+ int ret;
+
+ ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
+ BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
+ BUG_ON(!file);
+
+ ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
if (ret < 0)
goto out;
- ret = checkpoint_memory_contents(ctx, vma);
+ if (type == CKPT_VMA_SHM_ANON_SKIP)
+ goto out;
+ ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
out:
return ret;
}
@@ -772,16 +870,39 @@ int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
return 0;
}
+static struct page *bring_private_page(unsigned long addr)
+{
+ struct page *page;
+ int ret;
+
+ ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
+ if (ret < 0)
+ page = ERR_PTR(ret);
+ return page;
+}
+
+static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
+{
+ struct page *page = NULL;
+ int ret;
+
+ ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (page)
+ unlock_page(page);
+ return page;
+}
+
/**
* read_pages_contents - read in data of pages in page-array chain
* @ctx - restart context
*/
-static int read_pages_contents(struct ckpt_ctx *ctx)
+static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
{
- struct mm_struct *mm = current->mm;
struct ckpt_pgarr *pgarr;
unsigned long *vaddrs;
- int i, ret = 0;
+ int i, ret;
list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
vaddrs = pgarr->vaddrs;
@@ -791,11 +912,14 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
/* TODO: do in chunks to reduce mmap_sem overhead */
_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, mm, vaddrs[i],
- 1, 1, 1, &page, NULL);
+ if (inode)
+ page = bring_shared_page(vaddrs[i], inode);
+ else
+ page = bring_private_page(vaddrs[i]);
up_read(&current->mm->mmap_sem);
- if (ret < 0)
- return ret;
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
ret = restore_read_page(ctx, page);
page_cache_release(page);
@@ -804,12 +928,13 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
return ret;
}
}
- return ret;
+ return 0;
}
/**
- * restore_memory_contents - restore contents of a VMA with private memory
+ * restore_memory_contents - restore contents of a memory region
* @ctx - restart context
+ * @inode - backing inode
*
* Reads a header that specifies how many pages will follow, then reads
* a list of virtual addresses into ctx->pgarr_list page-array chain,
@@ -817,7 +942,7 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
* these steps until reaching a header specifying "0" pages, which marks
* the end of the contents.
*/
-static int restore_memory_contents(struct ckpt_ctx *ctx)
+int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
{
struct ckpt_hdr_pgarr *h;
unsigned long nr_pages;
@@ -844,7 +969,7 @@ static int restore_memory_contents(struct ckpt_ctx *ctx)
ret = read_pages_vaddrs(ctx, nr_pages);
if (ret < 0)
break;
- ret = read_pages_contents(ctx);
+ ret = read_pages_contents(ctx, inode);
if (ret < 0)
break;
pgarr_reset_all(ctx);
@@ -902,9 +1027,9 @@ static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
* @file - file to map (NULL for anonymous)
* @h - vma header data
*/
-static unsigned long generic_vma_restore(struct mm_struct *mm,
- struct file *file,
- struct ckpt_hdr_vma *h)
+unsigned long generic_vma_restore(struct mm_struct *mm,
+ struct file *file,
+ struct ckpt_hdr_vma *h)
{
unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
unsigned long addr;
@@ -949,7 +1074,7 @@ int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
if (IS_ERR((void *) addr))
return PTR_ERR((void *) addr);
- return restore_memory_contents(ctx);
+ return restore_memory_contents(ctx, NULL);
}
/**
@@ -1005,6 +1130,24 @@ static struct restore_vma_ops restore_vma_ops[] = {
.vma_type = CKPT_VMA_FILE,
.restore = filemap_restore,
},
+ /* anonymous shared */
+ {
+ .vma_name = "ANON SHARED",
+ .vma_type = CKPT_VMA_SHM_ANON,
+ .restore = shmem_restore,
+ },
+ /* anonymous shared (skipped) */
+ {
+ .vma_name = "ANON SHARED (skip)",
+ .vma_type = CKPT_VMA_SHM_ANON_SKIP,
+ .restore = shmem_restore,
+ },
+ /* file-mapped shared */
+ {
+ .vma_name = "FILE SHARED",
+ .vma_type = CKPT_VMA_SHM_FILE,
+ .restore = filemap_restore,
+ },
};
/**
@@ -1022,15 +1165,15 @@ static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
if (IS_ERR(h))
return PTR_ERR(h);
- ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d\n",
+ ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
(unsigned long) h->vm_start, (unsigned long) h->vm_end,
(unsigned long) h->vm_flags, (int) h->vma_type,
- (int) h->vma_objref);
+ (int) h->vma_objref, (int) h->ino_objref);
ret = -EINVAL;
if (h->vm_end < h->vm_start)
goto out;
- if (h->vma_objref < 0)
+ if (h->vma_objref < 0 || h->ino_objref < 0)
goto out;
if (h->vma_type >= CKPT_VMA_MAX)
goto out;
diff --git a/mm/filemap.c b/mm/filemap.c
index 24d4c54..3d6c497 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1596,6 +1596,8 @@ int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
int vma_objref;
+ int ino_objref;
+ int first, ret;
if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
@@ -1608,7 +1610,42 @@ int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
if (vma_objref < 0)
return vma_objref;
- return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ /*
+ * Citing mmap(2): "Updates to the mapping are visible
+ * to other processes that map this file, and are
+ * carried through to the underlying file. The file
+ * may not actually be updated until msync(2) or
+ * munmap(2) is called"
+ *
+ * Citing msync(2): "Without use of this call there is
+ * no guarantee that changes are written back before
+ * munmap(2) is called."
+ *
+ * Force msync for region of shared mapped files, to
+ * ensure that the file system is consistent with
+ * the checkpoint image. (inspired by sys_msync).
+ */
+
+ ino_objref = ckpt_obj_lookup_add(ctx, file->f_dentry->d_inode,
+ CKPT_OBJ_INODE, &first);
+ if (ino_objref < 0)
+ return ino_objref;
+
+ if (first) {
+ ret = vfs_fsync(file, file->f_path.dentry, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = generic_vma_checkpoint(ctx, vma, CKPT_VMA_SHM_FILE,
+ vma_objref, ino_objref);
+ } else {
+ ret = private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE,
+ vma_objref);
+ }
+
+ return ret;
}
EXPORT_SYMBOL(filemap_checkpoint);
@@ -1617,17 +1654,28 @@ int filemap_restore(struct ckpt_ctx *ctx,
struct ckpt_hdr_vma *h)
{
struct file *file;
+ unsigned long addr;
int ret;
if (h->vma_type == CKPT_VMA_FILE &&
(h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
return -EINVAL;
+ if (h->vma_type == CKPT_VMA_SHM_FILE &&
+ !(h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+ return -EINVAL;
file = ckpt_obj_fetch(ctx, h->vma_objref, CKPT_OBJ_FILE);
if (IS_ERR(file))
return PTR_ERR(file);
- ret = private_vma_restore(ctx, mm, file, h);
+ if (h->vma_type == CKPT_VMA_FILE) {
+ /* private mapped file */
+ ret = private_vma_restore(ctx, mm, file, h);
+ } else {
+ /* shared mapped file */
+ addr = generic_vma_restore(mm, file, h);
+ ret = (IS_ERR((void *) addr) ? PTR_ERR((void *) addr) : 0);
+ }
return ret;
}
#endif
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d4891f..ddbe589 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2471,7 +2471,7 @@ static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
if (!name || strcmp(name, "[vdso]"))
return -ENOSYS;
- return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
+ return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0, 0);
}
int special_mapping_restore(struct ckpt_ctx *ctx,
diff --git a/mm/shmem.c b/mm/shmem.c
index d93c394..1f361a6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,6 +29,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/checkpoint.h>
static struct vfsmount *shm_mnt;
@@ -2393,6 +2394,86 @@ static void shmem_destroy_inode(struct inode *inode)
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
+#ifdef CONFIG_CHECKPOINT
+static int shmem_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+ enum vma_type vma_type;
+ int ino_objref;
+ int ret, first;
+
+ /* should be private anonymous ... verify that this is the case */
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+ return -ENOSYS;
+ }
+
+ BUG_ON(!vma->vm_file);
+
+ /* we collected the file but we don't checkpoint it per-se */
+ ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ return ret;
+
+ ino_objref = ckpt_obj_lookup_add(ctx, vma->vm_file->f_dentry->d_inode,
+ CKPT_OBJ_INODE, &first);
+ if (ino_objref < 0)
+ return ino_objref;
+
+ vma_type = (first ? CKPT_VMA_SHM_ANON : CKPT_VMA_SHM_ANON_SKIP);
+
+ return shmem_vma_checkpoint(ctx, vma, vma_type, ino_objref);
+}
+
+int shmem_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm, struct ckpt_hdr_vma *h)
+{
+ unsigned long addr;
+ struct file *file;
+ int ret = 0;
+
+ file = ckpt_obj_try_fetch(ctx, h->ino_objref, CKPT_OBJ_FILE);
+ if (PTR_ERR(file) == -EINVAL)
+ file = NULL;
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ /* if file is NULL, this is the premiere - create and insert */
+ if (!file) {
+ if (h->vma_type != CKPT_VMA_SHM_ANON)
+ return -EINVAL;
+ /*
+ * in theory could pass NULL to mmap and let it create
+ * the file. But, if 'shm_size != vm_end - vm_start',
+ * or if 'vm_pgoff != 0', then the vma reflects only a
+ * portion of the shm object and we need to "manually"
+ * create the full shm object.
+ */
+ file = shmem_file_setup("/dev/zero", h->ino_size, h->vm_flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ ret = ckpt_obj_insert(ctx, file, h->ino_objref, CKPT_OBJ_FILE);
+ if (ret < 0)
+ goto out;
+ } else {
+ if (h->vma_type != CKPT_VMA_SHM_ANON_SKIP)
+ return -EINVAL;
+ /* Already need fput() for the file above; keep path simple */
+ get_file(file);
+ }
+
+ addr = generic_vma_restore(mm, file, h);
+ if (IS_ERR((void *) addr))
+ return PTR_ERR((void *) addr);
+
+ if (h->vma_type == CKPT_VMA_SHM_ANON)
+ ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
+ out:
+ fput(file);
+ return ret;
+}
+
+#endif /* CONFIG_CHECKPOINT */
+
static void init_once(void *foo)
{
struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
@@ -2505,6 +2586,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = shmem_checkpoint,
+#endif
};
--
1.6.3.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 047/100] splice: export pipe/file-to-pipe/file functionality
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (11 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 046/100] c/r: dump anonymous- and file-mapped- shared memory Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
` (7 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
During c/r of pipes we need to save and restore pipe buffers. But
do_splice() requires two file descriptors, therefore we can't use it,
as we always have one file descriptor (checkpoint image) and one
pipe_inode_info.
This patch exports interfaces that work at the pipe_inode_info level,
namely link_pipe(), do_splice_to() and do_splice_from(). They are used
in the following patch to save and restore pipe buffers without
unnecessary data copy.
It slightly modifies both do_splice_to() and do_splice_from() to
detect the case of pipe-to-pipe transfer, in which case they invoke
splice_pipe_to_pipe() directly.
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/splice.c | 61 ++++++++++++++++++++++++++++++++---------------
include/linux/splice.h | 9 +++++++
2 files changed, 50 insertions(+), 20 deletions(-)
diff --git a/fs/splice.c b/fs/splice.c
index 188e17d..ed91d7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1037,18 +1037,43 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
EXPORT_SYMBOL(generic_splice_sendpage);
/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+static inline struct pipe_inode_info *pipe_info(struct inode *inode)
+{
+ if (S_ISFIFO(inode->i_mode))
+ return inode->i_pipe;
+
+ return NULL;
+}
+
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags);
+
+/*
* Attempt to initiate a splice from pipe to file.
*/
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int);
+ struct pipe_inode_info *opipe;
int ret;
if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
+ /* When called directly (e.g. from c/r) output may be a pipe */
+ opipe = pipe_info(out->f_path.dentry->d_inode);
+ if (opipe) {
+ BUG_ON(opipe == pipe);
+ return splice_pipe_to_pipe(pipe, opipe, len, flags);
+ }
+
if (unlikely(out->f_flags & O_APPEND))
return -EINVAL;
@@ -1067,17 +1092,25 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
/*
* Attempt to initiate a splice from a file to a pipe.
*/
-static long do_splice_to(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+long do_splice_to(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
ssize_t (*splice_read)(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
+ struct pipe_inode_info *ipipe;
int ret;
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
+ /* When called directly (e.g. from c/r) input may be a pipe */
+ ipipe = pipe_info(in->f_path.dentry->d_inode);
+ if (ipipe) {
+ BUG_ON(ipipe == pipe);
+ return splice_pipe_to_pipe(ipipe, pipe, len, flags);
+ }
+
ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;
@@ -1257,18 +1290,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
-/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
- if (S_ISFIFO(inode->i_mode))
- return inode->i_pipe;
-
- return NULL;
-}
/*
* Determine where to splice to/from.
@@ -1873,9 +1894,9 @@ retry:
/*
* Link contents of ipipe to opipe.
*/
-static int link_pipe(struct pipe_inode_info *ipipe,
- struct pipe_inode_info *opipe,
- size_t len, unsigned int flags)
+int link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
int ret = 0, i = 0, nbuf;
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 18e7c7c..431662c 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -82,4 +82,13 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *);
+extern int link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags);
+extern long do_splice_to(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags);
+extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags);
+
#endif
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 048/100] c/r: support for open pipes
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-05-01 14:15 ` Oren Laadan
0 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Pavel Emelyanov
A pipe is a double-headed inode with a buffer attached to it. We
checkpoint the pipe buffer only once, as soon as we hit one side of
the pipe, regardless of whether it is the read- or write- end.
To checkpoint a file descriptor that refers to a pipe (either end), we
first lookup the inode in the hash table: If not found, it is the
first encounter of this pipe. Besides the file descriptor, we also (a)
save the pipe data, and (b) register the pipe inode in the hash. If
found, it is the second encounter of this pipe, namely, as we hit the
other end of the same pipe. In both cases we write the pipe-objref of
the inode.
To restore, create a new pipe and thus have two file pointers (read-
and write- ends). We only use one of them, depending on which side was
checkpointed first. We register the file pointer of the other end in
the hash table, with the pipe_objref given for this pipe from the
checkpoint, to be used later when the other arrives. At this point we
also restore the contents of the pipe buffers.
To save the pipe buffer, given a source pipe, use link_pipe() to clone
its contents into a temporary 'struct pipe_inode_info', and then use
do_splice_from() to transfer it directly to the checkpoint image file.
To restore the pipe buffer, with a fresh newly allocated target pipe,
use do_splice_to() to splice the data directly between the checkpoint
image file and the pipe.
Changelog[v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v19-rc1]:
- Switch to ckpt_obj_try_fetch()
- [Matt Helsley] Add cpp definitions for enums
Changelog[v18]:
- Adjust format of pipe buffer to include the mandatory pre-header
Changelog[v17]:
- Forward-declare 'ckpt_ctx' et-al, don't use checkpoint_types.h
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Acked-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Tested-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
fs/checkpoint.c | 7 ++
fs/pipe.c | 161 ++++++++++++++++++++++++++++++++++++++++
include/linux/checkpoint_hdr.h | 9 ++
include/linux/pipe_fs_i.h | 8 ++
4 files changed, 185 insertions(+), 0 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 95a51e9..e840d8a 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -17,6 +17,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
+#include <linux/pipe_fs_i.h>
#include <linux/syscalls.h>
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
@@ -586,6 +587,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_GENERIC,
.restore = generic_file_restore,
},
+ /* pipes */
+ {
+ .file_name = "PIPE",
+ .file_type = CKPT_FILE_PIPE,
+ .restore = pipe_file_restore,
+ },
};
static void *restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29f..801aad9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -13,11 +13,13 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
+#include <linux/checkpoint.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -828,6 +830,156 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
return ret;
}
+#ifdef CONFIG_CHECKPOINT
+static int checkpoint_pipe(struct ckpt_ctx *ctx, struct inode *inode)
+{
+ struct pipe_inode_info *pipe;
+ int len, ret = -ENOMEM;
+
+ pipe = alloc_pipe_info(NULL);
+ if (!pipe)
+ return ret;
+
+ pipe->readers = 1; /* bluff link_pipe() below */
+ len = link_pipe(inode->i_pipe, pipe, INT_MAX, SPLICE_F_NONBLOCK);
+ if (len == -EAGAIN)
+ len = 0;
+ if (len < 0) {
+ ret = len;
+ goto out;
+ }
+
+ ret = ckpt_write_obj_type(ctx, NULL, len, CKPT_HDR_PIPE_BUF);
+ if (ret < 0)
+ goto out;
+
+ ret = do_splice_from(pipe, ctx->file, &ctx->file->f_pos, len, 0);
+ if (ret < 0)
+ goto out;
+ if (ret != len)
+ ret = -EPIPE; /* can occur due to an error in target file */
+ out:
+ __free_pipe_info(pipe);
+ return ret;
+}
+
+static int pipe_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct ckpt_hdr_file_pipe *h;
+ struct inode *inode = file->f_dentry->d_inode;
+ int objref, first, ret;
+
+ objref = ckpt_obj_lookup_add(ctx, inode, CKPT_OBJ_INODE, &first);
+ if (objref < 0)
+ return objref;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ return -ENOMEM;
+
+ h->common.f_type = CKPT_FILE_PIPE;
+ h->pipe_objref = objref;
+
+ ret = checkpoint_file_common(ctx, file, &h->common);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, &h->common.h);
+ if (ret < 0)
+ goto out;
+
+ if (first)
+ ret = checkpoint_pipe(ctx, inode);
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int restore_pipe(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct pipe_inode_info *pipe;
+ int len, ret;
+
+ len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_PIPE_BUF);
+ if (len < 0)
+ return len;
+
+ pipe = file->f_dentry->d_inode->i_pipe;
+ ret = do_splice_to(ctx->file, &ctx->file->f_pos, pipe, len, 0);
+
+ if (ret >= 0 && ret != len)
+ ret = -EPIPE; /* can occur due to an error in source file */
+
+ return ret;
+}
+
+struct file *pipe_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+{
+ struct ckpt_hdr_file_pipe *h = (struct ckpt_hdr_file_pipe *) ptr;
+ struct file *file;
+ int fds[2], which, ret;
+
+ if (ptr->h.type != CKPT_HDR_FILE ||
+ ptr->h.len != sizeof(*h) || ptr->f_type != CKPT_FILE_PIPE)
+ return ERR_PTR(-EINVAL);
+
+ if (h->pipe_objref <= 0)
+ return ERR_PTR(-EINVAL);
+
+ file = ckpt_obj_try_fetch(ctx, h->pipe_objref, CKPT_OBJ_FILE);
+ /*
+ * If ckpt_obj_try_fetch() returned ERR_PTR(-EINVAL), then this is
+ * the first time we see this pipe so need to restore the
+ * contents. Otherwise, use the file pointer and skip forward.
+ */
+ if (!IS_ERR(file)) {
+ get_file(file);
+ } else if (PTR_ERR(file) == -EINVAL) {
+ /* first encounter of this pipe: create it */
+ ret = do_pipe_flags(fds, 0);
+ if (ret < 0)
+ return file;
+
+ which = (ptr->f_flags & O_WRONLY ? 1 : 0);
+ /*
+ * Below we return the file corresponding to one side
+ * of the pipe for our caller to use. Now insert the
+ * other side of the pipe to the hash, to be picked up
+ * when that side is restored.
+ */
+ file = fget(fds[1-which]); /* the 'other' side */
+ if (!file) /* this should _never_ happen ! */
+ return ERR_PTR(-EBADF);
+ ret = ckpt_obj_insert(ctx, file, h->pipe_objref, CKPT_OBJ_FILE);
+ if (ret < 0)
+ goto out;
+
+ ret = restore_pipe(ctx, file);
+ fput(file);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ file = fget(fds[which]); /* 'this' side */
+ if (!file) /* this should _never_ happen ! */
+ return ERR_PTR(-EBADF);
+
+ /* get rid of the file descriptors (caller sets that) */
+ sys_close(fds[which]);
+ sys_close(fds[1-which]);
+ } else {
+ return file;
+ }
+
+ ret = restore_file_common(ctx, file, ptr);
+ out:
+ if (ret < 0) {
+ fput(file);
+ file = ERR_PTR(ret);
+ }
+
+ return file;
+}
+#endif /* CONFIG_CHECKPOINT */
+
/*
* The file_operations structs are not static because they
* are also used in linux/fs/fifo.c to do operations on FIFOs.
@@ -844,6 +996,9 @@ const struct file_operations read_pipefifo_fops = {
.open = pipe_read_open,
.release = pipe_read_release,
.fasync = pipe_read_fasync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = pipe_file_checkpoint,
+#endif
};
const struct file_operations write_pipefifo_fops = {
@@ -856,6 +1011,9 @@ const struct file_operations write_pipefifo_fops = {
.open = pipe_write_open,
.release = pipe_write_release,
.fasync = pipe_write_fasync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = pipe_file_checkpoint,
+#endif
};
const struct file_operations rdwr_pipefifo_fops = {
@@ -869,6 +1027,9 @@ const struct file_operations rdwr_pipefifo_fops = {
.open = pipe_rdwr_open,
.release = pipe_rdwr_release,
.fasync = pipe_rdwr_fasync,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = pipe_file_checkpoint,
+#endif
};
struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 86cab42..50ef2b6 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -95,6 +95,8 @@ enum {
#define CKPT_HDR_FILE_NAME CKPT_HDR_FILE_NAME
CKPT_HDR_FILE,
#define CKPT_HDR_FILE CKPT_HDR_FILE
+ CKPT_HDR_PIPE_BUF,
+#define CKPT_HDR_PIPE_BUF CKPT_HDR_PIPE_BUF
CKPT_HDR_MM = 401,
#define CKPT_HDR_MM CKPT_HDR_MM
@@ -282,6 +284,8 @@ enum file_type {
#define CKPT_FILE_IGNORE CKPT_FILE_IGNORE
CKPT_FILE_GENERIC,
#define CKPT_FILE_GENERIC CKPT_FILE_GENERIC
+ CKPT_FILE_PIPE,
+#define CKPT_FILE_PIPE CKPT_FILE_PIPE
CKPT_FILE_MAX
#define CKPT_FILE_MAX CKPT_FILE_MAX
};
@@ -301,6 +305,11 @@ struct ckpt_hdr_file_generic {
struct ckpt_hdr_file common;
} __attribute__((aligned(8)));
+struct ckpt_hdr_file_pipe {
+ struct ckpt_hdr_file common;
+ __s32 pipe_objref;
+} __attribute__((aligned(8)));
+
/* memory layout */
struct ckpt_hdr_mm {
struct ckpt_hdr h;
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b43a9e0..e526a12 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -154,4 +154,12 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
+/* checkpoint/restart */
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+struct ckpt_hdr_file;
+extern struct file *pipe_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr);
+#endif
+
#endif
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 049/100] c/r: checkpoint and restore FIFOs
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (13 preceding siblings ...)
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 050/100] c/r: refuse to checkpoint if monitoring directories with dnotify Oren Laadan
` (5 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, linux-fsdevel
FIFOs are almost like pipes.
Checkpoint adds the FIFO pathname. The first time the FIFO is found
it also assigns an @objref and dumps the contents in the buffers.
To restore, use the @objref only to determine whether a particular
FIFO has already been restored earlier. Note that it ignores the file
pointer that matches that @objref (unlike with pipes, where that file
corresponds to the other end of the pipe). Instead, it creates a new
FIFO using the saved pathname.
Changelog [v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog [v19-rc3]:
- Rebase to kernel 2.6.33
Changelog [v19-rc1]:
- Switch to ckpt_obj_try_fetch()
- [Matt Helsley] Add cpp definitions for enums
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/checkpoint.c | 6 +++
fs/pipe.c | 80 +++++++++++++++++++++++++++++++++++++++-
include/linux/checkpoint_hdr.h | 2 +
include/linux/pipe_fs_i.h | 2 +
4 files changed, 89 insertions(+), 1 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index e840d8a..06f1130 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -593,6 +593,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_PIPE,
.restore = pipe_file_restore,
},
+ /* fifo */
+ {
+ .file_name = "FIFO",
+ .file_type = CKPT_FILE_FIFO,
+ .restore = fifo_file_restore,
+ },
};
static void *restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/pipe.c b/fs/pipe.c
index 801aad9..7f00e58 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -830,6 +830,8 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
return ret;
}
+static struct vfsmount *pipe_mnt __read_mostly;
+
#ifdef CONFIG_CHECKPOINT
static int checkpoint_pipe(struct ckpt_ctx *ctx, struct inode *inode)
{
@@ -877,7 +879,11 @@ static int pipe_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
if (!h)
return -ENOMEM;
- h->common.f_type = CKPT_FILE_PIPE;
+ /* fifo and pipe are similar at checkpoint, differ on restore */
+ if (inode->i_sb == pipe_mnt->mnt_sb)
+ h->common.f_type = CKPT_FILE_PIPE;
+ else
+ h->common.f_type = CKPT_FILE_FIFO;
h->pipe_objref = objref;
ret = checkpoint_file_common(ctx, file, &h->common);
@@ -887,6 +893,13 @@ static int pipe_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
if (ret < 0)
goto out;
+ /* FIFO also needs a file name */
+ if (h->common.f_type == CKPT_FILE_FIFO) {
+ ret = checkpoint_fname(ctx, &file->f_path, &ctx->root_fs_path);
+ if (ret < 0)
+ goto out;
+ }
+
if (first)
ret = checkpoint_pipe(ctx, inode);
out:
@@ -978,6 +991,71 @@ struct file *pipe_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
return file;
}
+
+struct file *fifo_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+{
+ struct ckpt_hdr_file_pipe *h = (struct ckpt_hdr_file_pipe *) ptr;
+ struct file *file;
+ int first, ret;
+
+ if (ptr->h.type != CKPT_HDR_FILE ||
+ ptr->h.len != sizeof(*h) || ptr->f_type != CKPT_FILE_FIFO)
+ return ERR_PTR(-EINVAL);
+
+ if (h->pipe_objref <= 0)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If ckpt_obj_try_fetch() returned ERR_PTR(-EINVAL), this is the
+ * first time for this fifo.
+ */
+ file = ckpt_obj_try_fetch(ctx, h->pipe_objref, CKPT_OBJ_FILE);
+ if (!IS_ERR(file))
+ first = 0;
+ else if (PTR_ERR(file) == -EINVAL)
+ first = 1;
+ else
+ return file;
+
+ /*
+ * To avoid blocking, always open the fifo with O_RDWR;
+ * then fix flags below.
+ */
+ file = restore_open_fname(ctx, (ptr->f_flags & ~O_ACCMODE) | O_RDWR);
+ if (IS_ERR(file))
+ return file;
+
+ if ((ptr->f_flags & O_ACCMODE) == O_RDONLY) {
+ file->f_flags = (file->f_flags & ~O_ACCMODE) | O_RDONLY;
+ file->f_mode &= ~FMODE_WRITE;
+ } else if ((ptr->f_flags & O_ACCMODE) == O_WRONLY) {
+ file->f_flags = (file->f_flags & ~O_ACCMODE) | O_WRONLY;
+ file->f_mode &= ~FMODE_READ;
+ } else if ((ptr->f_flags & O_ACCMODE) != O_RDWR) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* first time: add to objhash and restore fifo's contents */
+ if (first) {
+ ret = ckpt_obj_insert(ctx, file, h->pipe_objref, CKPT_OBJ_FILE);
+ if (ret < 0)
+ goto out;
+
+ ret = restore_pipe(ctx, file);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = restore_file_common(ctx, file, ptr);
+ out:
+ if (ret < 0) {
+ fput(file);
+ file = ERR_PTR(ret);
+ }
+
+ return file;
+}
#endif /* CONFIG_CHECKPOINT */
/*
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 50ef2b6..fbcbee7 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -286,6 +286,8 @@ enum file_type {
#define CKPT_FILE_GENERIC CKPT_FILE_GENERIC
CKPT_FILE_PIPE,
#define CKPT_FILE_PIPE CKPT_FILE_PIPE
+ CKPT_FILE_FIFO,
+#define CKPT_FILE_FIFO CKPT_FILE_FIFO
CKPT_FILE_MAX
#define CKPT_FILE_MAX CKPT_FILE_MAX
};
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index e526a12..596403e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -160,6 +160,8 @@ struct ckpt_ctx;
struct ckpt_hdr_file;
extern struct file *pipe_file_restore(struct ckpt_ctx *ctx,
struct ckpt_hdr_file *ptr);
+extern struct file *fifo_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr);
#endif
#endif
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 050/100] c/r: refuse to checkpoint if monitoring directories with dnotify
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (14 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 049/100] c/r: checkpoint and restore FIFOs Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 063/100] c/r: restore file->f_cred Oren Laadan
` (4 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, linux-fsdevel
From: Matt Helsley <matthltc@us.ibm.com>
We do not support restarting fsnotify watches. inotify and fanotify utilize
anon_inodes for pseudofiles which lack the .checkpoint operation. So they
already cleanly prevent checkpoint. dnotify on the other hand registers
its watches using fcntl() which does not require the userspace task to
hold an fd with an empty .checkpoint operation. This means userspace
could use dnotify to set up fsnotify watches which won't be re-created during
restart.
Check for fsnotify watches created with dnotify and reject checkpoint
if there are any.
Changelog [v21]:
- [Stanislav O. Bezzubtsev] Fix omitted parameter name error
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/checkpoint.c | 5 +++++
fs/notify/dnotify/dnotify.c | 18 ++++++++++++++++++
include/linux/dnotify.h | 6 ++++++
3 files changed, 29 insertions(+), 0 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 06f1130..874be00 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -206,6 +206,11 @@ static int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
return -EBADF;
}
+ if (is_dnotify_attached(file)) {
+ ckpt_err(ctx, -EBADF, "%(T)%(P)dnotify unsupported\n", file);
+ return -EBADF;
+ }
+
ret = file->f_op->checkpoint(ctx, file);
if (ret < 0)
ckpt_err(ctx, ret, "%(T)%(P)file checkpoint failed\n", file);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52..0a63bf6 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -289,6 +289,24 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
return 0;
}
+int is_dnotify_attached(struct file *filp)
+{
+ struct fsnotify_mark_entry *entry;
+ struct inode *inode;
+
+ inode = filp->f_path.dentry->d_inode;
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
+
+ spin_lock(&inode->i_lock);
+ entry = fsnotify_find_mark_entry(dnotify_group, inode);
+ spin_unlock(&inode->i_lock);
+ if (!entry)
+ return 0;
+ fsnotify_put_mark(entry);
+ return 1;
+}
+
/*
* When a process calls fcntl to attach a dnotify watch to a directory it ends
* up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index ecc0628..7093052 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -29,6 +29,7 @@ struct dnotify_struct {
FS_MOVED_FROM | FS_MOVED_TO)
extern void dnotify_flush(struct file *, fl_owner_t);
+extern int is_dnotify_attached(struct file *);
extern int fcntl_dirnotify(int, struct file *, unsigned long);
#else
@@ -37,6 +38,11 @@ static inline void dnotify_flush(struct file *filp, fl_owner_t id)
{
}
+static inline int is_dnotify_attached(struct file *filp)
+{
+ return 0;
+}
+
static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
{
return -EINVAL;
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 063/100] c/r: restore file->f_cred
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (15 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 050/100] c/r: refuse to checkpoint if monitoring directories with dnotify Oren Laadan
@ 2010-05-01 14:15 ` Oren Laadan
2010-05-01 14:16 ` [PATCH v21 079/100] c/r: checkpoint/restart epoll sets Oren Laadan
` (3 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, linux-fsdevel, David Howells
From: Serge E. Hallyn <serue@us.ibm.com>
Restore a file's f_cred. This is set to the cred of the task doing
the open, so often it will be the same as that of the restarted task.
Changelog[v1]:
- [Nathan Lynch] discard const from struct cred * where appropriate
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Cc: linux-fsdevel@vger.kernel.org
Cc: David Howells <dhowells@redhat.com>
---
fs/checkpoint.c | 21 ++++++++++++++++++---
include/linux/checkpoint_hdr.h | 2 +-
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 874be00..783c920 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -148,14 +148,21 @@ static int scan_fds(struct files_struct *files, int **fdtable)
int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
struct ckpt_hdr_file *h)
{
+ struct cred *f_cred = (struct cred *) file->f_cred;
+
h->f_flags = file->f_flags;
h->f_mode = file->f_mode;
h->f_pos = file->f_pos;
h->f_version = file->f_version;
- ckpt_debug("file %s", file->f_dentry->d_name.name);
+ h->f_credref = checkpoint_obj(ctx, f_cred, CKPT_OBJ_CRED);
+ if (h->f_credref < 0)
+ return h->f_credref;
+
+ ckpt_debug("file %s credref %d", file->f_dentry->d_name.name,
+ h->f_credref);
- /* FIX: need also file->uid, file->gid, file->f_owner, etc */
+ /* FIX: need also file->f_owner, etc */
return 0;
}
@@ -516,8 +523,16 @@ int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
fmode_t new_mode = file->f_mode;
fmode_t saved_mode = (__force fmode_t) h->f_mode;
int ret;
+ struct cred *cred;
+
+ /* FIX: need to restore owner etc */
- /* FIX: need to restore uid, gid, owner etc */
+ /* restore the cred */
+ cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+ put_cred(file->f_cred);
+ file->f_cred = get_cred(cred);
/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 8598eb5..261badb 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -437,7 +437,7 @@ struct ckpt_hdr_file {
__u32 f_type;
__u32 f_mode;
__u32 f_flags;
- __u32 _padding;
+ __s32 f_credref;
__u64 f_pos;
__u64 f_version;
} __attribute__((aligned(8)));
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 079/100] c/r: checkpoint/restart epoll sets
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (16 preceding siblings ...)
2010-05-01 14:15 ` [PATCH v21 063/100] c/r: restore file->f_cred Oren Laadan
@ 2010-05-01 14:16 ` Oren Laadan
2010-05-01 14:16 ` [PATCH v21 080/100] c/r: checkpoint/restart eventfd Oren Laadan
` (2 subsequent siblings)
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:16 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Davide Libenzi, linux-fsdevel
From: Matt Helsley <matthltc@us.ibm.com>
Save/restore epoll items during checkpoint/restart respectively.
Output the epoll header and items separately. Chunk the output much
like the pid array gets chunked. This ensures that even sub-order 0
allocations will enable checkpoint of large epoll sets. A subsequent
patch will do something similar for the restore path.
On restart, we grab a piece of memory suitable to store a "chunk" of
items for input. Read the input one chunk at a time and add epoll
items for each item in the chunk.
Changelog [v21]:
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog [v19]:
- [Oren Laadan] Fix broken compilation for no-c/r architectures
Changelog [v19-rc1]:
- [Oren Laadan] Return -EBUSY (not BUG_ON) if fd is gone on restart
- [Oren Laadan] Fix the chunk size instead of auto-tune
Changelog v5:
Fix potential recursion during collect.
Replace call to ckpt_obj_collect() with ckpt_collect_file().
[Oren]
Fix checkpoint leak detection when there are more items than
expected.
Cleanup/simplify error write paths. (will complicate in a later
patch) [Oren]
Remove files_deferq bits. [Oren]
Remove extra newline. [Oren]
Remove aggregate check on number of watches added. [Oren]
This is OK since these will be done individually anyway.
Remove check for negative objrefs during restart. [Oren]
Fixup comment regarding race that indicates checkpoint leaks.
[Oren]
s/ckpt_read_obj/ckpt_read_buf_type/ [Oren]
Patch for lots of epoll items follows.
Moved sys_close(epfd) right under fget(). [Oren]
Use CKPT_HDR_BUFFER rather than custom ckpt_read/write_*
This makes it more similar to the pid array code. [Oren]
It also simplifies the error recovery paths.
Tested polling a pipe and 50,000 UNIX sockets.
Changelog v4: ckpt-v18
Use files_deferq as submitted by Dan Smith
Cleanup to only report >= 1 items when debugging.
Changelog v3: [unposted]
Removed most of the TODOs -- the remainder will be removed by
subsequent patches.
Fixed missing ep_file_collect() [Serge]
Rather than include checkpoint_hdr.h declare (but do not define)
the two structs needed in eventpoll.h [Oren]
Complain with ckpt_write_err() when we detect checkpoint obj
leaks. [Oren]
Remove redundant is_epoll_file() check in collect. [Oren]
Move epfile_objref lookup to simplify error handling. [Oren]
Simplify error handling with early return in
ep_eventpoll_checkpoint(). [Oren]
Cleaned up a comment. [Oren]
Shorten CKPT_HDR_FILE_EPOLL_ITEMS (-FILE) [Oren]
Renumbered to indicate that it follows the file table.
Renamed the epoll struct in checkpoint_hdr.h [Oren]
Also renamed substruct.
Fixup return of empty ep_file_restore(). [Oren]
Changed some error returns. [Oren]
Changed some tests to BUG_ON(). [Oren]
Factored out watch insert with epoll_ctl() into do_epoll_ctl().
[Cedric, Oren]
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge Hallyn <serue@us.ibm.com>
---
fs/checkpoint.c | 7 +
fs/eventpoll.c | 334 ++++++++++++++++++++++++++++++++++++----
include/linux/checkpoint_hdr.h | 18 ++
include/linux/eventpoll.h | 17 ++-
4 files changed, 347 insertions(+), 29 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 9b29a26..3bfa692 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
+#include <linux/eventpoll.h>
#include <net/sock.h>
/**************************************************************************
@@ -632,6 +633,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_TTY,
.restore = tty_file_restore,
},
+ /* epoll */
+ {
+ .file_name = "EPOLL",
+ .file_type = CKPT_FILE_EPOLL,
+ .restore = ep_file_restore,
+ },
};
static void *restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5..99920d2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,9 @@
#include <asm/mman.h>
#include <asm/atomic.h>
+#include <linux/checkpoint.h>
+#include <linux/deferqueue.h>
+
/*
* LOCKING:
* There are three level of locking required by epoll :
@@ -671,10 +674,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
return pollflags != -1 ? pollflags : 0;
}
+#ifdef CONFIG_CHECKPOINT
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
+#endif
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
- .poll = ep_eventpoll_poll
+ .poll = ep_eventpoll_poll,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = ep_eventpoll_checkpoint,
+ .collect = ep_file_collect,
+#endif
};
/* Fast test to see if the file is an evenpoll file */
@@ -1226,35 +1238,18 @@ SYSCALL_DEFINE1(epoll_create, int, size)
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
- struct epoll_event __user *, event)
+int do_epoll_ctl(int op, int fd,
+ struct file *file, struct file *tfile,
+ struct epoll_event *epds)
{
int error;
- struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
- struct epoll_event epds;
-
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
- /* Get the "struct file *" for the eventpoll file */
- error = -EBADF;
- file = fget(epfd);
- if (!file)
- goto error_return;
-
- /* Get the "struct file *" for the target file */
- tfile = fget(fd);
- if (!tfile)
- goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
- goto error_tgt_fput;
+ return error;
/*
* We have to check that the file structure underneath the file descriptor
@@ -1263,7 +1258,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
*/
error = -EINVAL;
if (file == tfile || !is_file_epoll(file))
- goto error_tgt_fput;
+ return error;
/*
* At this point it is safe to assume that the "private_data" contains
@@ -1284,8 +1279,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
+ epds->events |= POLLERR | POLLHUP;
+ error = ep_insert(ep, epds, tfile, fd);
} else
error = -EEXIST;
break;
@@ -1297,15 +1292,46 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ epds->events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, epds);
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
-error_tgt_fput:
+ return error;
+}
+
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+ struct epoll_event __user *, event)
+{
+ int error;
+ struct file *file, *tfile;
+ struct epoll_event epds;
+
+ error = -EFAULT;
+ if (ep_op_has_event(op) &&
+ copy_from_user(&epds, event, sizeof(struct epoll_event)))
+ goto error_return;
+
+ /* Get the "struct file *" for the eventpoll file */
+ error = -EBADF;
+ file = fget(epfd);
+ if (!file)
+ goto error_return;
+
+ /* Get the "struct file *" for the target file */
+ tfile = fget(fd);
+ if (!tfile)
+ goto error_fput;
+
+ error = do_epoll_ctl(op, fd, file, tfile, &epds);
fput(tfile);
error_fput:
fput(file);
@@ -1413,6 +1439,258 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
#endif /* HAVE_SET_RESTORE_SIGMASK */
+#ifdef CONFIG_CHECKPOINT
+/*
+ * ->collect callback of epoll files: collect every file watched by the
+ * epoll set behind @file, skipping watched files that are themselves
+ * epoll files so that collection does not recurse.
+ *
+ * Returns the result of the last ckpt_collect_file() call (0 if none was
+ * made); the walk stops at the first negative result.
+ */
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct eventpoll *ep = file->private_data;
+	struct rb_node *node;
+	int err = 0;
+
+	mutex_lock(&ep->mtx);
+	node = rb_first(&ep->rbr);
+	while (node) {
+		struct epitem *item = rb_entry(node, struct epitem, rbn);
+		struct file *watched = item->ffd.file;
+
+		if (!is_file_epoll(watched)) {
+			err = ckpt_collect_file(ctx, watched);
+			if (err < 0)
+				break;
+		}
+		node = rb_next(node);
+	}
+	mutex_unlock(&ep->mtx);
+
+	return err;
+}
+
+/*
+ * Payload handed to deferqueue_add() by ep_eventpoll_checkpoint() and
+ * read back by ep_items_checkpoint().
+ */
+struct epoll_deferq_entry {
+ /* checkpoint context the deferred work runs against */
+ struct ckpt_ctx *ctx;
+ /* epoll file whose items are written by the deferred work */
+ struct file *epfile;
+};
+
+/*
+ * Number of struct ckpt_eventpoll_item entries held by one chunk buffer.
+ * NOTE(review): 8096 is neither 8000 nor 8192 (2 * 4096) -- confirm the
+ * intended byte budget.
+ */
+#define CKPT_EPOLL_CHUNK (8096 / (int) sizeof(struct ckpt_eventpoll_item))
+
+/*
+ * Deferqueue callback: write the items of one epoll set to the image.
+ *
+ * @data: a struct epoll_deferq_entry naming the context and epoll file.
+ *
+ * Output is a CKPT_HDR_EPOLL_ITEMS header (item count and the objref of
+ * the epoll file) followed, when there are items, by a CKPT_HDR_BUFFER
+ * header and the items themselves, written in chunks of at most
+ * CKPT_EPOLL_CHUNK entries.
+ *
+ * Returns a negative error on failure: -ENOMEM on allocation failure,
+ * the error of a failed write, or -EBUSY when the set no longer matches
+ * the counted size or holds a file without an objref.  Otherwise returns
+ * the result of the last write.
+ */
+static int ep_items_checkpoint(void *data)
+{
+	struct epoll_deferq_entry *dq_entry = data;
+	struct ckpt_ctx *ctx;
+	struct ckpt_hdr_eventpoll_items *h;
+	struct ckpt_eventpoll_item *items;
+	struct rb_node *rbp;
+	struct eventpoll *ep;
+	__s32 epfile_objref;
+	int num_items = 0, ret;
+
+	ctx = dq_entry->ctx;
+
+	epfile_objref = ckpt_obj_lookup(ctx, dq_entry->epfile, CKPT_OBJ_FILE);
+	BUG_ON(epfile_objref <= 0);
+
+	/* First pass: count the items so the header can announce them */
+	ep = dq_entry->epfile->private_data;
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp))
+		num_items++;
+	mutex_unlock(&ep->mtx);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS);
+	if (!h)
+		return -ENOMEM;
+	h->num_items = num_items;
+	h->epfile_objref = epfile_objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret || !num_items)
+		return ret;
+
+	/* Buffer header for the whole array; the data follows in chunks */
+	ret = ckpt_write_obj_type(ctx, NULL, sizeof(*items)*num_items,
+				  CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	items = kzalloc(sizeof(*items) * CKPT_EPOLL_CHUNK, GFP_KERNEL);
+	if (!items)
+		return -ENOMEM;
+
+	/*
+	 * Walk the rbtree copying items into the chunk of memory and then
+	 * writing them to the checkpoint image
+	 */
+	ret = 0;
+	mutex_lock(&ep->mtx);
+	rbp = rb_first(&ep->rbr);
+	while ((num_items > 0) && rbp) {
+		int n = min(num_items, CKPT_EPOLL_CHUNK);
+		int j;
+
+		for (j = 0; rbp && j < n; j++, rbp = rb_next(rbp)) {
+			struct epitem *epi;
+			int objref;
+
+			epi = rb_entry(rbp, struct epitem, rbn);
+			items[j].fd = epi->ffd.fd;
+			items[j].events = epi->event.events;
+			items[j].data = epi->event.data;
+			objref = ckpt_obj_lookup(ctx, epi->ffd.file,
+						 CKPT_OBJ_FILE);
+			/* no objref: reported as -EBUSY below */
+			if (objref <= 0)
+				goto unlock;
+			items[j].file_objref = objref;
+		}
+		ret = ckpt_kwrite(ctx, items, n*sizeof(*items));
+		if (ret < 0)
+			break;
+		num_items -= n;
+	}
+unlock:
+	mutex_unlock(&ep->mtx);
+	kfree(items);
+	/*
+	 * Missing or extra item(s) -- checkpoint obj leak.  Only report it
+	 * when nothing failed earlier: a write error leaves num_items
+	 * non-zero and must not be replaced by -EBUSY.
+	 */
+	if (ret >= 0 && (num_items != 0 || rbp))
+		ret = -EBUSY;
+	if (ret)
+		ckpt_err(ctx, ret, "Checkpointing epoll items.\n");
+	return ret;
+}
+
+/*
+ * ->checkpoint callback of epoll files: write the common file header
+ * with f_type CKPT_FILE_EPOLL, then queue ep_items_checkpoint() on the
+ * files deferqueue.
+ *
+ * Returns -ENOMEM if no header could be obtained, otherwise the result
+ * of the first failing step or of deferqueue_add().
+ */
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct epoll_deferq_entry deferred = {
+		.ctx = ctx,
+		.epfile = file,
+	};
+	struct ckpt_hdr_file *hdr;
+	int err;
+
+	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_FILE);
+	if (!hdr)
+		return -ENOMEM;
+
+	hdr->f_type = CKPT_FILE_EPOLL;
+	err = checkpoint_file_common(ctx, file, hdr);
+	if (err >= 0)
+		err = ckpt_write_obj(ctx, &hdr->h);
+
+	/*
+	 * Defer saving the epoll items until all of the ffd.file pointers
+	 * have an objref; after the file table has been checkpointed.
+	 */
+	if (err >= 0)
+		err = deferqueue_add(ctx->files_deferq, &deferred,
+				     sizeof(deferred), ep_items_checkpoint,
+				     NULL);
+
+	ckpt_hdr_put(ctx, hdr);
+	return err;
+}
+
+/*
+ * Deferqueue callback: read the items of one epoll set from the image
+ * and add each watch to the restored epoll file.
+ *
+ * @data: deferqueue entry from which the checkpoint context is taken.
+ *
+ * Returns 0 when the set has no items.  Returns a negative error on
+ * failure: -EINVAL for an objref that is not an epoll file or an item
+ * count that does not match the buffer length, -ENOMEM, or the error of
+ * a failed read, objref fetch or do_epoll_ctl().  Otherwise returns the
+ * result of the last do_epoll_ctl() call.
+ */
+static int ep_items_restore(void *data)
+{
+	struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
+	struct ckpt_hdr_eventpoll_items *h;
+	struct ckpt_eventpoll_item *items = NULL;
+	struct file *epfile = NULL;
+	int ret, num_items;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	num_items = h->num_items;
+	epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
+	ckpt_hdr_put(ctx, h);
+
+	/* Make sure userspace didn't give us a ref to a non-epoll file. */
+	if (IS_ERR(epfile))
+		return PTR_ERR(epfile);
+	if (!is_file_epoll(epfile))
+		return -EINVAL;
+	if (!num_items)
+		return 0;
+
+	ret = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+	/* Make sure the items match the size we expect */
+	if (num_items != (ret / sizeof(*items)))
+		return -EINVAL;
+
+	items = kzalloc(sizeof(*items) * CKPT_EPOLL_CHUNK, GFP_KERNEL);
+	if (!items)
+		return -ENOMEM;
+
+	while (num_items > 0) {
+		int n = min(num_items, CKPT_EPOLL_CHUNK);
+		int j;
+
+		ret = ckpt_kread(ctx, items, n*sizeof(*items));
+		if (ret < 0)
+			break;
+
+		/* Restore the epoll items/watches */
+		for (j = 0; !ret && j < n; j++) {
+			struct epoll_event epev;
+			struct file *tfile;
+
+			tfile = ckpt_obj_fetch(ctx, items[j].file_objref,
+					       CKPT_OBJ_FILE);
+			if (IS_ERR(tfile)) {
+				ret = PTR_ERR(tfile);
+				goto out;
+			}
+			epev.events = items[j].events;
+			epev.data = items[j].data;
+			ret = do_epoll_ctl(EPOLL_CTL_ADD, items[j].fd,
+					   epfile, tfile, &epev);
+		}
+		/*
+		 * Stop at the first watch that failed; otherwise the read
+		 * of the next chunk would overwrite the error.
+		 */
+		if (ret < 0)
+			break;
+		num_items -= n;
+	}
+out:
+	kfree(items);
+	return ret;
+}
+
+/*
+ * Restore an epoll file from header @h: create a new epoll instance,
+ * apply the common file state and queue ep_items_restore() on the files
+ * deferqueue.
+ *
+ * Returns the new file, or an ERR_PTR(): -EINVAL for a header of the
+ * wrong type, length or f_type, -EBUSY if the new descriptor could not
+ * be looked up, or the error of the failing step.
+ */
+struct file *ep_file_restore(struct ckpt_ctx *ctx,
+			     struct ckpt_hdr_file *h)
+{
+	struct file *filp;
+	int fd, err;
+
+	if (!(h->h.type == CKPT_HDR_FILE &&
+	      h->h.len == sizeof(*h) &&
+	      h->f_type == CKPT_FILE_EPOLL))
+		return ERR_PTR(-EINVAL);
+
+	fd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	filp = fget(fd);
+	sys_close(fd);	/* harmless even if an error occurred */
+	if (!filp)	/* can happen with a malicious user */
+		return ERR_PTR(-EBUSY);
+
+	/*
+	 * Needed before we can properly restore the watches and enforce the
+	 * limit on watch numbers.
+	 */
+	err = restore_file_common(ctx, filp, h);
+
+	/*
+	 * Defer restoring the epoll items until the file table is
+	 * fully restored. Ensures that valid file objrefs will resolve.
+	 */
+	if (err >= 0)
+		err = deferqueue_add_ptr(ctx->files_deferq, ctx,
+					 ep_items_restore, NULL);
+	if (err < 0) {
+		fput(filp);
+		return ERR_PTR(err);
+	}
+	return filp;
+}
+
+#endif /* CONFIG_CHECKPOINT */
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 79e8e2d..21540d7 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -127,6 +127,8 @@ enum {
#define CKPT_HDR_TTY CKPT_HDR_TTY
CKPT_HDR_TTY_LDISC,
#define CKPT_HDR_TTY_LDISC CKPT_HDR_TTY_LDISC
+ CKPT_HDR_EPOLL_ITEMS, /* must be after file-table */
+#define CKPT_HDR_EPOLL_ITEMS CKPT_HDR_EPOLL_ITEMS
CKPT_HDR_MM = 401,
#define CKPT_HDR_MM CKPT_HDR_MM
@@ -485,6 +487,8 @@ enum file_type {
#define CKPT_FILE_SOCKET CKPT_FILE_SOCKET
CKPT_FILE_TTY,
#define CKPT_FILE_TTY CKPT_FILE_TTY
+ CKPT_FILE_EPOLL,
+#define CKPT_FILE_EPOLL CKPT_FILE_EPOLL
CKPT_FILE_MAX
#define CKPT_FILE_MAX CKPT_FILE_MAX
};
@@ -701,6 +705,20 @@ struct ckpt_hdr_file_socket {
__s32 sock_objref;
} __attribute__((aligned(8)));
+/* Image record announcing the items of one epoll set */
+struct ckpt_hdr_eventpoll_items {
+ struct ckpt_hdr h;
+ /* objref of the epoll file the items belong to */
+ __s32 epfile_objref;
+ /* number of ckpt_eventpoll_item entries in the set */
+ __u32 num_items;
+} __attribute__((aligned(8)));
+
+/*
+ * Contained in a CKPT_HDR_BUFFER following the ckpt_hdr_eventpoll_items.
+ * One entry per watch: the event data and mask, the watched descriptor
+ * number and the objref of the watched file.
+ *
+ * NOTE(review): the members add up to 20 bytes while the struct is
+ * aligned(8), so the compiler presumably appends 4 bytes of implicit
+ * tail padding -- consider an explicit _padding member.
+ */
+struct ckpt_eventpoll_item {
+ __u64 data;
+ __u32 fd;
+ __s32 file_objref;
+ __u32 events;
+} __attribute__((aligned(8)));
+
/* memory layout */
struct ckpt_hdr_mm {
struct ckpt_hdr h;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..52282ae 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -56,6 +56,9 @@ struct file;
#ifdef CONFIG_EPOLL
+struct ckpt_ctx;
+struct ckpt_hdr_file;
+
/* Used to initialize the epoll bits inside the "struct file" */
static inline void eventpoll_init_file(struct file *file)
@@ -95,11 +98,23 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
-#else
+#ifdef CONFIG_CHECKPOINT
+extern struct file *ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *h);
+#endif
+#else
+/* !defined(CONFIG_EPOLL) */
static inline void eventpoll_init_file(struct file *file) {}
static inline void eventpoll_release(struct file *file) {}
+#ifdef CONFIG_CHECKPOINT
+/* Without CONFIG_EPOLL no epoll file can be restored: always -ENOSYS */
+static inline struct file *ep_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_file *ptr)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif
#endif
#endif /* #ifdef __KERNEL__ */
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 080/100] c/r: checkpoint/restart eventfd
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (17 preceding siblings ...)
2010-05-01 14:16 ` [PATCH v21 079/100] c/r: checkpoint/restart epoll sets Oren Laadan
@ 2010-05-01 14:16 ` Oren Laadan
2010-05-01 14:16 ` [PATCH v21 081/100] c/r: restore task fs_root and pwd (v3) Oren Laadan
2010-05-01 14:16 ` [PATCH v21 082/100] c/r: preliminary support mounts namespace Oren Laadan
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:16 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Davide Libenzi, linux-fsdevel
From: Matt Helsley <matthltc@us.ibm.com>
Save/restore eventfd files. These are anon_inodes just like epoll
but instead of a set of files to poll they are a 64-bit counter
and a flag value. Used for AIO.
[Oren Laadan] Added #ifdef's around checkpoint/restart to compile even
without CONFIG_CHECKPOINT
Changelog[v21]:
- Add missing spin locks around eventfd checkpoint
- Put file_ops->checkpoint under CONFIG_CHECKPOINT
Changelog[v19]:
- Fix broken compilation for architectures that don't support c/r
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
fs/checkpoint.c | 7 +++++
fs/eventfd.c | 57 ++++++++++++++++++++++++++++++++++++++++
include/linux/checkpoint_hdr.h | 8 +++++
include/linux/eventfd.h | 10 +++++++
4 files changed, 82 insertions(+), 0 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 3bfa692..e0f8a15 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -22,6 +22,7 @@
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/eventpoll.h>
+#include <linux/eventfd.h>
#include <net/sock.h>
/**************************************************************************
@@ -639,6 +640,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_EPOLL,
.restore = ep_file_restore,
},
+ /* eventfd */
+ {
+ .file_name = "EVENTFD",
+ .file_type = CKPT_FILE_EVENTFD,
+ .restore = eventfd_restore,
+ },
};
static void *restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76..307beca 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
+#include <linux/checkpoint.h>
struct eventfd_ctx {
struct kref kref;
@@ -288,11 +289,67 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
return res;
}
+#ifdef CONFIG_CHECKPOINT
+/*
+ * ->checkpoint callback of eventfd files: write the common file header
+ * with f_type CKPT_FILE_EVENTFD plus the counter and flags of the
+ * eventfd context.
+ *
+ * Returns -ENOMEM if no header could be obtained, a negative error from
+ * checkpoint_file_common(), or the result of ckpt_write_obj().
+ */
+static int eventfd_checkpoint(struct ckpt_ctx *ckpt_ctx, struct file *file)
+{
+	struct ckpt_hdr_file_eventfd *hdr;
+	int err;
+
+	hdr = ckpt_hdr_get_type(ckpt_ctx, sizeof(*hdr), CKPT_HDR_FILE);
+	if (!hdr)
+		return -ENOMEM;
+
+	hdr->common.f_type = CKPT_FILE_EVENTFD;
+	err = checkpoint_file_common(ckpt_ctx, file, &hdr->common);
+	if (err >= 0) {
+		struct eventfd_ctx *evctx = file->private_data;
+
+		/* read count and flags together under the waitqueue lock */
+		spin_lock_irq(&evctx->wqh.lock);
+		hdr->count = evctx->count;
+		hdr->flags = evctx->flags;
+		spin_unlock_irq(&evctx->wqh.lock);
+
+		err = ckpt_write_obj(ckpt_ctx, &hdr->common.h);
+	}
+
+	ckpt_hdr_put(ckpt_ctx, hdr);
+	return err;
+}
+
+/*
+ * Restore an eventfd file from header @ptr: create a new eventfd with
+ * the saved count and flags, then apply the common file state.
+ *
+ * Returns the new file, or an ERR_PTR(): -EINVAL for a header of the
+ * wrong length, -EBUSY if the new descriptor could not be looked up, or
+ * the error of the failing step.
+ *
+ * NOTE(review): h->count is 64 bits wide -- confirm sys_eventfd2()
+ * accepts the full range.
+ */
+struct file *eventfd_restore(struct ckpt_ctx *ckpt_ctx,
+			     struct ckpt_hdr_file *ptr)
+{
+	struct ckpt_hdr_file_eventfd *h = (struct ckpt_hdr_file_eventfd *) ptr;
+	struct file *filp;
+	int fd, err;
+
+	/* Already know type == CKPT_HDR_FILE and f_type == CKPT_FILE_EVENTFD */
+	if (h->common.h.len != sizeof(*h))
+		return ERR_PTR(-EINVAL);
+
+	fd = sys_eventfd2(h->count, h->flags);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	/* keep only the file reference; the descriptor itself is dropped */
+	filp = fget(fd);
+	sys_close(fd);
+	if (!filp)
+		return ERR_PTR(-EBUSY);
+
+	err = restore_file_common(ckpt_ctx, filp, &h->common);
+	if (err >= 0)
+		return filp;
+
+	fput(filp);
+	return ERR_PTR(err);
+}
+#endif
+
static const struct file_operations eventfd_fops = {
.release = eventfd_release,
.poll = eventfd_poll,
.read = eventfd_read,
.write = eventfd_write,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = eventfd_checkpoint,
+#endif
};
/**
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 21540d7..e89fbf9 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -489,6 +489,8 @@ enum file_type {
#define CKPT_FILE_TTY CKPT_FILE_TTY
CKPT_FILE_EPOLL,
#define CKPT_FILE_EPOLL CKPT_FILE_EPOLL
+ CKPT_FILE_EVENTFD,
+#define CKPT_FILE_EVENTFD CKPT_FILE_EVENTFD
CKPT_FILE_MAX
#define CKPT_FILE_MAX CKPT_FILE_MAX
};
@@ -513,6 +515,12 @@ struct ckpt_hdr_file_pipe {
__s32 pipe_objref;
} __attribute__((aligned(8)));
+/* Image record of an eventfd file: common file header plus eventfd state */
+struct ckpt_hdr_file_eventfd {
+ struct ckpt_hdr_file common;
+ /* counter value of the eventfd context at checkpoint */
+ __u64 count;
+ /* flags of the eventfd context at checkpoint */
+ __u32 flags;
+} __attribute__((aligned(8)));
+
/* socket */
struct ckpt_hdr_socket {
struct ckpt_hdr h;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 91bb4f2..e8238cc 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -39,6 +39,14 @@ ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
__u64 *cnt);
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+struct ckpt_hdr_file;
+
+struct file *eventfd_restore(struct ckpt_ctx *ckpt_ctx,
+ struct ckpt_hdr_file *ptr);
+#endif
+
#else /* CONFIG_EVENTFD */
/*
@@ -77,6 +85,8 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
return -ENOSYS;
}
+#define eventfd_restore NULL
+
#endif
#endif /* _LINUX_EVENTFD_H */
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 081/100] c/r: restore task fs_root and pwd (v3)
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (18 preceding siblings ...)
2010-05-01 14:16 ` [PATCH v21 080/100] c/r: checkpoint/restart eventfd Oren Laadan
@ 2010-05-01 14:16 ` Oren Laadan
2010-05-01 14:16 ` [PATCH v21 082/100] c/r: preliminary support mounts namespace Oren Laadan
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:16 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, Al Viro, linux-fsdevel
Checkpoint and restore task->fs. Tasks sharing task->fs will
share them again after restart.
Original patch by Serge Hallyn <serue@us.ibm.com>
Changelog:
Jan 25: [orenl] Addressed comments by .. myself:
- add leak detection
- change order of save/restore of chroot and cwd
- save/restore fs only after file-table and mm
- rename functions to adapt existing conventions
Dec 28: [serge] Addressed comments by Oren (and Dave)
- define and use {get,put}_fs_struct helpers
- fix locking comment
- define ckpt_read_fname() and use in checkpoint/files.c
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
fs/checkpoint.c | 232 +++++++++++++++++++++++++++++++++++++++-
fs/fs_struct.c | 21 ++++
fs/open.c | 58 ++++++----
include/linux/checkpoint.h | 6 +-
include/linux/checkpoint_hdr.h | 12 ++
include/linux/fs.h | 5 +
include/linux/fs_struct.h | 2 +
kernel/checkpoint/process.c | 17 +++
8 files changed, 325 insertions(+), 28 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index e0f8a15..61b68da 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -15,6 +15,9 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/pipe_fs_i.h>
@@ -369,6 +372,58 @@ int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
return objref;
}
+/*
+ * Checkpoint the fs_struct of task @t as a CKPT_OBJ_FS object.
+ *
+ * A temporary reference is taken on t->fs under the task lock of @t and
+ * dropped again once checkpoint_obj() has returned.
+ *
+ * Returns the value of checkpoint_obj(): the objref of the fs_struct,
+ * or a negative error.
+ */
+int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct fs_struct *fs;
+	int fs_objref;
+
+	/* lock the task whose ->fs is read, as ckpt_collect_fs() does */
+	task_lock(t);
+	fs = t->fs;
+	get_fs_struct(fs);
+	task_unlock(t);
+
+	fs_objref = checkpoint_obj(ctx, fs, CKPT_OBJ_FS);
+	put_fs_struct(fs);
+
+	return fs_objref;
+}
+
+/*
+ * Write one fs_struct to the image: a CKPT_HDR_FS header followed by
+ * the path of the cwd and then the path of the fs root, both taken from
+ * a private copy of @ptr and written relative to ctx->root_fs_path.
+ *
+ * Called with fs refcount bumped so it won't disappear.
+ *
+ * Returns 0 on success, -ENOMEM if the header or the copy could not be
+ * allocated, or the error of the failing write.
+ */
+static int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct fs_struct *fs = ptr;
+	struct ckpt_hdr_fs *h;
+	struct fs_struct *fscopy;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FS);
+	if (!h)
+		return -ENOMEM;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret)
+		return ret;
+
+	/* check the copy, not the original, for allocation failure */
+	fscopy = copy_fs_struct(fs);
+	if (!fscopy)
+		return -ENOMEM;
+
+	ret = checkpoint_fname(ctx, &fscopy->pwd, &ctx->root_fs_path);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)writing path of cwd");
+		goto out;
+	}
+	ret = checkpoint_fname(ctx, &fscopy->root, &ctx->root_fs_path);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)writing path of fs root");
+		goto out;
+	}
+	ret = 0;
+ out:
+	free_fs_struct(fscopy);
+	return ret;
+}
+
/***********************************************************************
* Collect
*/
@@ -455,10 +510,41 @@ int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
return ret;
}
+/*
+ * Collect the fs_struct of task @t as a CKPT_OBJ_FS object.
+ *
+ * Returns the value of ckpt_obj_collect().
+ */
+int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct fs_struct *pinned;
+	int err;
+
+	/* hold a reference across the collect call */
+	task_lock(t);
+	pinned = t->fs;
+	get_fs_struct(pinned);
+	task_unlock(t);
+
+	err = ckpt_obj_collect(ctx, pinned, CKPT_OBJ_FS);
+	put_fs_struct(pinned);
+
+	return err;
+}
+
/**************************************************************************
* Restart
*/
+/*
+ * Read a CKPT_HDR_FILE_NAME payload of at most PATH_MAX bytes; the
+ * buffer is returned through @fname with its last byte forced to NUL.
+ *
+ * Returns the value of ckpt_read_payload(); negative means error.
+ *
+ * NOTE(review): a return of 0 would write to (*fname)[-1] -- confirm
+ * that ckpt_read_payload() cannot return 0 here.
+ */
+static int ckpt_read_fname(struct ckpt_ctx *ctx, char **fname)
+{
+	int nbytes = ckpt_read_payload(ctx, (void **) fname,
+				       PATH_MAX, CKPT_HDR_FILE_NAME);
+
+	if (nbytes >= 0) {
+		char *name = *fname;
+
+		name[nbytes - 1] = '\0'; /* always play it safe */
+		ckpt_debug("read filename '%s'\n", name);
+	}
+
+	return nbytes;
+}
+
/**
* restore_open_fname - read a file name and open a file
* @ctx: checkpoint context
@@ -474,11 +560,9 @@ struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
if (flags & (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC))
return ERR_PTR(-EINVAL);
- len = ckpt_read_payload(ctx, (void **) &fname,
- PATH_MAX, CKPT_HDR_FILE_NAME);
+ len = ckpt_read_fname(ctx, &fname);
if (len < 0)
return ERR_PTR(len);
- fname[len - 1] = '\0'; /* always play if safe */
ckpt_debug("fname '%s' flags %#x\n", fname, flags);
file = filp_open(fname, flags, 0);
@@ -805,8 +889,136 @@ int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
}
/*
+ * Called by task restore code to set the restarted task's
+ * current->fs to an entry on the hash
+ */
+int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref)
+{
+ struct fs_struct *newfs, *oldfs;
+
+ /* look up the fs_struct registered in the hash under @fs_objref */
+ newfs = ckpt_obj_fetch(ctx, fs_objref, CKPT_OBJ_FS);
+ if (IS_ERR(newfs))
+ return PTR_ERR(newfs);
+
+ /* take a reference for current, then swap current->fs under task_lock */
+ task_lock(current);
+ get_fs_struct(newfs);
+ oldfs = current->fs;
+ current->fs = newfs;
+ task_unlock(current);
+ /* release the fs_struct that current pointed to before the swap */
+ put_fs_struct(oldfs);
+
+ return 0;
+}
+
+/*
+ * Look up directory @name and install it as the root of @fs.
+ * Returns 0 on success, otherwise the error from path_lookup() or
+ * do_chroot() (which is also reported through ckpt_err()).
+ */
+static int restore_chroot(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
+{
+	struct nameidata lookup;
+	int err;
+
+	ckpt_debug("attempting chroot to %s\n", name);
+
+	err = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &lookup);
+	if (err) {
+		ckpt_err(ctx, err, "%(T)Opening chroot dir %s", name);
+		return err;
+	}
+
+	err = do_chroot(fs, &lookup.path);
+	/* the looked-up path is released on success and failure alike */
+	path_put(&lookup.path);
+	if (err)
+		ckpt_err(ctx, err, "%(T)Setting chroot %s", name);
+
+	return err;
+}
+
+/*
+ * Look up directory @name and install it as the working directory of @fs.
+ * Returns 0 on success, otherwise the error from path_lookup() or
+ * do_chdir() (which is also reported through ckpt_err()).
+ */
+static int restore_cwd(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
+{
+	struct nameidata lookup;
+	int err;
+
+	ckpt_debug("attempting chdir to %s\n", name);
+
+	err = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &lookup);
+	if (err) {
+		ckpt_err(ctx, err, "%(T)Opening cwd %s", name);
+		return err;
+	}
+
+	err = do_chdir(fs, &lookup.path);
+	/* the looked-up path is released on success and failure alike */
+	path_put(&lookup.path);
+	if (err)
+		ckpt_err(ctx, err, "%(T)Setting cwd %s", name);
+
+	return err;
+}
+
+/*
+ * Restore callback for CKPT_OBJ_FS entries (see ckpt_obj_fs_ops).
+ *
+ * Reads the CKPT_HDR_FS header, copies current->fs, then reads two path
+ * names from the image: the first becomes the cwd of the copy, the second
+ * its root. Returns the new fs_struct, or an ERR_PTR() on failure, in
+ * which case the copy is freed.
+ */
+static void *restore_fs(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_fs *hdr;
+	struct fs_struct *newfs;
+	char *name;
+	int err;
+
+	/* the header is only read and released; no field of it is used */
+	hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_FS);
+	if (IS_ERR(hdr))
+		return ERR_PTR(PTR_ERR(hdr));
+	ckpt_hdr_put(ctx, hdr);
+
+	newfs = copy_fs_struct(current->fs);
+	if (!newfs)
+		return ERR_PTR(-ENOMEM);
+
+	/* first path name: working directory */
+	err = ckpt_read_fname(ctx, &name);
+	if (err < 0)
+		goto fail;
+	err = restore_cwd(ctx, newfs, name);
+	kfree(name);
+	if (err)
+		goto fail;
+
+	/* second path name: root directory */
+	err = ckpt_read_fname(ctx, &name);
+	if (err < 0)
+		goto fail;
+	err = restore_chroot(ctx, newfs, name);
+	kfree(name);
+	if (err)
+		goto fail;
+
+	return newfs;
+
+fail:
+	free_fs_struct(newfs);
+	return ERR_PTR(err);
+}
+
+/*
* fs-related checkpoint objects
*/
+
+/* objhash ref_grab callback: take one reference on the fs_struct; always 0 */
+static int obj_fs_grab(void *ptr)
+{
+	struct fs_struct *fs = ptr;
+
+	get_fs_struct(fs);
+	return 0;
+}
+
+/* objhash ref_drop callback: release one reference; @lastref is unused */
+static void obj_fs_drop(void *ptr, int lastref)
+{
+	struct fs_struct *fs = ptr;
+
+	put_fs_struct(fs);
+}
+
+static int obj_fs_users(void *ptr)
+{
+ /*
+ * It's safe to read the count without taking fs->lock because
+ * the fs is referenced.
+ * It's also sufficient for leak detection: with no leak the
+ * count can't change; with a leak it will be too big already
+ * (even if it's about to grow), and if it's about to shrink
+ * then it's as if we sampled the count a bit earlier.
+ */
+ return ((struct fs_struct *) ptr)->users;
+}
+
static int obj_file_table_grab(void *ptr)
{
atomic_inc(&((struct files_struct *) ptr)->count);
@@ -839,6 +1051,17 @@ static int obj_file_users(void *ptr)
return atomic_long_read(&((struct file *) ptr)->f_count);
}
+/*
+ * fs object: objhash operations for fs_struct (CKPT_OBJ_FS).
+ * Reference counting goes through get_fs_struct()/put_fs_struct();
+ * the image record is written by checkpoint_fs() and read back by
+ * restore_fs().
+ */
+static const struct ckpt_obj_ops ckpt_obj_fs_ops = {
+ .obj_name = "FS",
+ .obj_type = CKPT_OBJ_FS,
+ .ref_drop = obj_fs_drop,
+ .ref_grab = obj_fs_grab,
+ .ref_users = obj_fs_users,
+ .checkpoint = checkpoint_fs,
+ .restore = restore_fs,
+};
+
/* files_struct object */
static const struct ckpt_obj_ops ckpt_obj_files_struct_ops = {
.obj_name = "FILE_TABLE",
@@ -865,6 +1088,9 @@ static __init int checkpoint_register_fs(void)
{
int ret;
+ ret = register_checkpoint_obj(&ckpt_obj_fs_ops);
+ if (ret < 0)
+ return ret;
ret = register_checkpoint_obj(&ckpt_obj_files_struct_ops);
if (ret < 0)
return ret;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee0590..2a4c6f5 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -6,6 +6,27 @@
#include <linux/fs_struct.h>
/*
+ * call with owning task locked
+ */
+void get_fs_struct(struct fs_struct *fs)
+{
+ /* fs->users is a plain counter; it is only modified under fs->lock */
+ write_lock(&fs->lock);
+ fs->users++;
+ write_unlock(&fs->lock);
+}
+
+/*
+ * Drop one reference on @fs. The counter is decremented under fs->lock;
+ * when it reaches zero the fs_struct is freed, after the lock is released.
+ */
+void put_fs_struct(struct fs_struct *fs)
+{
+	int remaining;
+
+	write_lock(&fs->lock);
+	remaining = --fs->users;
+	write_unlock(&fs->lock);
+
+	if (!remaining)
+		free_fs_struct(fs);
+}
+
+/*
* Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
* It can block.
*/
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9..e9d5626 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -524,6 +524,18 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
return sys_faccessat(AT_FDCWD, filename, mode);
}
+/*
+ * Make @path the working directory of @fs.
+ *
+ * The directory inode must pass the MAY_EXEC | MAY_ACCESS permission
+ * check; on failure that error is returned and @fs is left untouched.
+ * Returns 0 on success. The caller keeps its own reference on @path.
+ */
+int do_chdir(struct fs_struct *fs, struct path *path)
+{
+	int error = inode_permission(path->dentry->d_inode,
+				     MAY_EXEC | MAY_ACCESS);
+
+	if (!error)
+		set_fs_pwd(fs, path);
+
+	return error;
+}
+
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
@@ -531,17 +543,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
error = user_path_dir(filename, &path);
if (error)
- goto out;
-
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
- if (error)
- goto dput_and_out;
-
- set_fs_pwd(current->fs, &path);
+ return error;
-dput_and_out:
+ error = do_chdir(current->fs, &path);
path_put(&path);
-out:
return error;
}
@@ -571,31 +576,36 @@ out:
return error;
}
-SYSCALL_DEFINE1(chroot, const char __user *, filename)
+int do_chroot(struct fs_struct *fs, struct path *path)
{
- struct path path;
int error;
- error = user_path_dir(filename, &path);
+ error = inode_permission(path->dentry->d_inode, MAY_EXEC | MAY_ACCESS);
if (error)
- goto out;
+ return error;
+
+ if (!capable(CAP_SYS_CHROOT))
+ return -EPERM;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+ error = security_path_chroot(path);
if (error)
- goto dput_and_out;
+ return error;
- error = -EPERM;
- if (!capable(CAP_SYS_CHROOT))
- goto dput_and_out;
- error = security_path_chroot(&path);
+ set_fs_root(fs, path);
+ return 0;
+}
+
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
+{
+ struct path path;
+ int error;
+
+ error = user_path_dir(filename, &path);
if (error)
- goto dput_and_out;
+ return error;
- set_fs_root(current->fs, &path);
- error = 0;
-dput_and_out:
+ error = do_chroot(current->fs, &path);
path_put(&path);
-out:
return error;
}
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 09fbb59..c1079b7 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 3
+#define CHECKPOINT_VERSION 4
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
@@ -224,6 +224,10 @@ extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
struct ckpt_hdr_file *h);
+extern int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref);
+
/* memory */
extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e89fbf9..8dbd6e9 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -139,6 +139,9 @@ enum {
CKPT_HDR_MM_CONTEXT,
#define CKPT_HDR_MM_CONTEXT CKPT_HDR_MM_CONTEXT
+ CKPT_HDR_FS = 451, /* must be after file-table, mm */
+#define CKPT_HDR_FS CKPT_HDR_FS
+
CKPT_HDR_IPC = 501,
#define CKPT_HDR_IPC CKPT_HDR_IPC
CKPT_HDR_IPC_SHM,
@@ -209,6 +212,8 @@ enum obj_type {
#define CKPT_OBJ_FILE CKPT_OBJ_FILE
CKPT_OBJ_MM,
#define CKPT_OBJ_MM CKPT_OBJ_MM
+ CKPT_OBJ_FS,
+#define CKPT_OBJ_FS CKPT_OBJ_FS
CKPT_OBJ_SIGHAND,
#define CKPT_OBJ_SIGHAND CKPT_OBJ_SIGHAND
CKPT_OBJ_SIGNAL,
@@ -424,6 +429,7 @@ struct ckpt_hdr_task_objs {
__s32 files_objref;
__s32 mm_objref;
+ __s32 fs_objref;
__s32 sighand_objref;
__s32 signal_objref;
} __attribute__((aligned(8)));
@@ -461,6 +467,12 @@ enum restart_block_type {
};
/* file system */
+struct ckpt_hdr_fs {
+ struct ckpt_hdr h;
+ /*
+ * No fixed-size fields. The header is followed by two path names,
+ * in the order checkpoint_fs() writes and restore_fs() reads them:
+ * char *fs_pwd
+ * char *fs_root
+ */
+} __attribute__((aligned(8)));
+
struct ckpt_hdr_file_table {
struct ckpt_hdr h;
__s32 fdt_nfds;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0a59ea..ee725ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1826,6 +1826,11 @@ extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
struct vfsmount *);
extern int vfs_statfs(struct dentry *, struct kstatfs *);
+struct fs_struct;
+extern int do_chdir(struct fs_struct *fs, struct path *path);
+extern int do_chroot(struct fs_struct *fs, struct path *path);
+
+
extern int current_umask(void);
/* /sys/fs */
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 78a05bf..a73cbcb 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -20,5 +20,7 @@ extern struct fs_struct *copy_fs_struct(struct fs_struct *);
extern void free_fs_struct(struct fs_struct *);
extern void daemonize_fs_struct(void);
extern int unshare_fs_struct(void);
+extern void get_fs_struct(struct fs_struct *);
+extern void put_fs_struct(struct fs_struct *);
#endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
index fa08616..922287b 100644
--- a/kernel/checkpoint/process.c
+++ b/kernel/checkpoint/process.c
@@ -232,6 +232,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
struct ckpt_hdr_task_objs *h;
int files_objref;
int mm_objref;
+ int fs_objref;
int sighand_objref;
int signal_objref;
int first, ret;
@@ -272,6 +273,13 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
return mm_objref;
}
+ /* note: this must come *after* file-table and mm */
+ fs_objref = checkpoint_obj_fs(ctx, t);
+ if (fs_objref < 0) {
+ ckpt_err(ctx, fs_objref, "%(T)process fs\n");
+ return fs_objref;
+ }
+
sighand_objref = checkpoint_obj_sighand(ctx, t);
ckpt_debug("sighand: objref %d\n", sighand_objref);
if (sighand_objref < 0) {
@@ -299,6 +307,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
return -ENOMEM;
h->files_objref = files_objref;
h->mm_objref = mm_objref;
+ h->fs_objref = fs_objref;
h->sighand_objref = sighand_objref;
h->signal_objref = signal_objref;
ret = ckpt_write_obj(ctx, &h->h);
@@ -477,6 +486,9 @@ int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
ret = ckpt_collect_mm(ctx, t);
if (ret < 0)
return ret;
+ ret = ckpt_collect_fs(ctx, t);
+ if (ret < 0)
+ return ret;
ret = ckpt_collect_sighand(ctx, t);
return ret;
@@ -645,6 +657,11 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
if (ret < 0)
goto out;
+ ret = restore_obj_fs(ctx, h->fs_objref);
+ ckpt_debug("fs: ret %d (%p)\n", ret, current->fs);
+ if (ret < 0)
+ return ret;
+
ret = restore_obj_sighand(ctx, h->sighand_objref);
ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
if (ret < 0)
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH v21 082/100] c/r: preliminary support mounts namespace
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
` (19 preceding siblings ...)
2010-05-01 14:16 ` [PATCH v21 081/100] c/r: restore task fs_root and pwd (v3) Oren Laadan
@ 2010-05-01 14:16 ` Oren Laadan
20 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-01 14:16 UTC (permalink / raw)
To: Andrew Morton
Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
Pavel Emelyanov, Oren Laadan, Al Viro, linux-fsdevel
We only allow c/r when all processes share a single mounts ns.
We do intend to implement c/r of mounts and mounts namespaces in the
kernel. It shouldn't be ugly or complicate locking to do so. Just
haven't gotten around to it. A more complete solution is more than we
want to take on now for v19.
But we'd like as much as possible for everything which we don't
support, to not be checkpointable, since not doing so has in the past
invited slanderous accusations of being a toy implementation :)
Meanwhile, we get the following:
1) Checkpoint bails if not all tasks share the same mnt-ns
2) Leak detection works for full container checkpoint
On restart, all tasks inherit the same mnt-ns of the coordinator, by
default. A follow-up patch to user-cr will add a new switch to the
'restart' to request a CLONE_NEWMNT flag when creating the root-task
of the restart.
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
---
fs/namespace.c | 63 +++++++++++++++++++++++++++++----------
include/linux/checkpoint.h | 2 +-
include/linux/checkpoint_hdr.h | 4 ++
kernel/nsproxy.c | 16 ++++++++--
4 files changed, 65 insertions(+), 20 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8a..e335285 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/idr.h>
#include <linux/fs_struct.h>
+#include <linux/checkpoint.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -2318,6 +2319,49 @@ static void __init init_mount_tree(void)
set_fs_root(current->fs, &root);
}
+/*
+ * Drop one reference on mount namespace @ns. When the count reaches
+ * zero, the namespace's mount tree is unmounted and @ns is freed.
+ */
+void put_mnt_ns(struct mnt_namespace *ns)
+{
+ LIST_HEAD(umount_list);
+
+ /* only the caller that drops the last reference tears the ns down */
+ if (!atomic_dec_and_test(&ns->count))
+ return;
+ /* collect the mounts onto umount_list under both locks ... */
+ down_write(&namespace_sem);
+ spin_lock(&vfsmount_lock);
+ umount_tree(ns->root, 0, &umount_list);
+ spin_unlock(&vfsmount_lock);
+ up_write(&namespace_sem);
+ /* ... and release them only after the locks have been dropped */
+ release_mounts(&umount_list);
+ kfree(ns);
+}
+EXPORT_SYMBOL(put_mnt_ns);
+
+#ifdef CONFIG_CHECKPOINT
+/* objhash ref_grab callback: take one reference on the mnt_namespace; always 0 */
+static int obj_mnt_ns_grab(void *ptr)
+{
+	struct mnt_namespace *ns = ptr;
+
+	get_mnt_ns(ns);
+	return 0;
+}
+
+/* objhash ref_drop callback: release one reference; @lastref is unused */
+static void obj_mnt_ns_drop(void *ptr, int lastref)
+{
+	struct mnt_namespace *ns = ptr;
+
+	put_mnt_ns(ns);
+}
+
+/* objhash ref_users callback: current reference count of the mnt_namespace */
+static int obj_mnt_ns_users(void *ptr)
+{
+	struct mnt_namespace *ns = ptr;
+
+	return atomic_read(&ns->count);
+}
+
+/*
+ * mnt_ns object: objhash operations for mnt_namespace (CKPT_OBJ_MNT_NS).
+ * Only the reference-counting callbacks are provided; no .checkpoint or
+ * .restore handler is set here.
+ */
+static const struct ckpt_obj_ops ckpt_obj_mntns_ops = {
+ .obj_name = "MOUNTS NS",
+ .obj_type = CKPT_OBJ_MNT_NS,
+ .ref_grab = obj_mnt_ns_grab,
+ .ref_drop = obj_mnt_ns_drop,
+ .ref_users = obj_mnt_ns_users,
+};
+#endif /* CONFIG_CHECKPOINT */
+
void __init mnt_init(void)
{
unsigned u;
@@ -2347,20 +2391,7 @@ void __init mnt_init(void)
printk(KERN_WARNING "%s: kobj create error\n", __func__);
init_rootfs();
init_mount_tree();
+#ifdef CONFIG_CHECKPOINT
+ register_checkpoint_obj(&ckpt_obj_mntns_ops);
+#endif
}
-
-void put_mnt_ns(struct mnt_namespace *ns)
-{
- LIST_HEAD(umount_list);
-
- if (!atomic_dec_and_test(&ns->count))
- return;
- down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
- umount_tree(ns->root, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
- up_write(&namespace_sem);
- release_mounts(&umount_list);
- kfree(ns);
-}
-EXPORT_SYMBOL(put_mnt_ns);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index c1079b7..6560f63 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 4
+#define CHECKPOINT_VERSION 5
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 8dbd6e9..e74d668 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -98,6 +98,8 @@ enum {
#define CKPT_HDR_UTS_NS CKPT_HDR_UTS_NS
CKPT_HDR_IPC_NS,
#define CKPT_HDR_IPC_NS CKPT_HDR_IPC_NS
+ CKPT_HDR_MNT_NS,
+#define CKPT_HDR_MNT_NS CKPT_HDR_MNT_NS
CKPT_HDR_CAPABILITIES,
#define CKPT_HDR_CAPABILITIES CKPT_HDR_CAPABILITIES
CKPT_HDR_USER_NS,
@@ -224,6 +226,8 @@ enum obj_type {
#define CKPT_OBJ_UTS_NS CKPT_OBJ_UTS_NS
CKPT_OBJ_IPC_NS,
#define CKPT_OBJ_IPC_NS CKPT_OBJ_IPC_NS
+ CKPT_OBJ_MNT_NS,
+#define CKPT_OBJ_MNT_NS CKPT_OBJ_MNT_NS
CKPT_OBJ_USER_NS,
#define CKPT_OBJ_USER_NS CKPT_OBJ_USER_NS
CKPT_OBJ_CRED,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 67a1456..5bdce9e 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -268,11 +268,18 @@ int ckpt_collect_ns(struct ckpt_ctx *ctx, struct task_struct *t)
* ipc_ns (shm) may keep references to files: if this is the
* first time we see this ipc_ns (ret > 0), proceed inside.
*/
- if (ret)
+ if (ret) {
ret = ckpt_collect_ipc_ns(ctx, nsproxy->ipc_ns);
+ if (ret < 0)
+ goto out;
+ }
#endif
- /* TODO: collect other namespaces here */
+ ret = ckpt_obj_collect(ctx, nsproxy->mnt_ns, CKPT_OBJ_MNT_NS);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
out:
put_nsproxy(nsproxy);
return ret;
@@ -301,7 +308,10 @@ static int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr)
#endif /* CONFIG_IPC_NS */
h->ipc_objref = ret;
- /* TODO: Write other namespaces here */
+ /* FIXME: for now, only marked visited to pacify leaks */
+ ret = ckpt_obj_visit(ctx, nsproxy->mnt_ns, CKPT_OBJ_MNT_NS);
+ if (ret < 0)
+ goto out;
ret = ckpt_write_obj(ctx, &h->h);
out:
--
1.6.3.3
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write()
2010-05-01 14:15 ` [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write() Oren Laadan
@ 2010-05-06 12:26 ` Josef Bacik
0 siblings, 0 replies; 24+ messages in thread
From: Josef Bacik @ 2010-05-06 12:26 UTC (permalink / raw)
To: Oren Laadan
Cc: Andrew Morton, containers, linux-kernel, Serge Hallyn,
Matt Helsley, Pavel Emelyanov, linux-fsdevel
On Sat, May 01, 2010 at 10:15:01AM -0400, Oren Laadan wrote:
> These three are used in a subsequent patch to allow the kernel c/r
> code to call vfs_read/write() to read and write data to and from the
> checkpoint image.
>
> This patch makes the following changes:
>
> 1) Move kernel_write() from fs/splice.c to fs/exec.c to be near
> kernel_read()
>
> 2) Make kernel_read/write() iterate if they face partial reads or
> writes, and retry if they face -EAGAIN.
>
> 3) Adjust prototypes of kernel_read/write() to use size_t and ssize_t
>
> 4) Move file_pos_read/write() to include/linux/fs.h
>
> Changelog [ckpt-v21]
> - Introduce kernel_write(), fix kernel_read()
>
> Cc: linux-fsdevel@vger.kernel.org
> Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
> Acked-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
> fs/exec.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++----
> fs/read_write.c | 10 -------
> fs/splice.c | 17 +------------
> include/linux/fs.h | 13 +++++++++-
> 4 files changed, 77 insertions(+), 32 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 49cdaa1..7bacb6a 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -693,23 +693,82 @@ exit:
> }
> EXPORT_SYMBOL(open_exec);
>
> -int kernel_read(struct file *file, loff_t offset,
> - char *addr, unsigned long count)
> +static ssize_t _kernel_read(struct file *file, loff_t offset,
> + char __user *ubuf, size_t count)
> {
> - mm_segment_t old_fs;
> + ssize_t nread;
> + size_t nleft;
> loff_t pos = offset;
> - int result;
> +
> + for (nleft = count; nleft; nleft -= nread) {
> + nread = vfs_read(file, ubuf, nleft, &pos);
> + if (nread <= 0) {
> + if (nread == -EAGAIN) {
> + nread = 0;
> + continue;
> + } else if (nread == 0)
> + break;
> + else
> + return nread;
> + }
> + ubuf += nread;
> + }
> + return count - nleft;
> +}
> +
> +ssize_t kernel_read(struct file *file, loff_t offset,
> + char *addr, size_t count)
> +{
> + mm_segment_t old_fs;
> + ssize_t result;
>
> old_fs = get_fs();
> set_fs(get_ds());
> /* The cast to a user pointer is valid due to the set_fs() */
> - result = vfs_read(file, (void __user *)addr, count, &pos);
> + result = _kernel_read(file, offset, (void __user *)addr, count);
> set_fs(old_fs);
> return result;
> }
>
> EXPORT_SYMBOL(kernel_read);
>
> +static ssize_t _kernel_write(struct file *file, loff_t offset,
> + const char __user *ubuf, size_t count)
> +{
> + ssize_t nwrite;
> + size_t nleft;
> + loff_t pos = offset;
> +
> + for (nleft = count; nleft; nleft -= nwrite) {
> + nwrite = vfs_write(file, ubuf, nleft, &pos);
> + if (nwrite < 0) {
> + if (nwrite == -EAGAIN) {
> + nwrite = 0;
> + continue;
> + } else
> + return nwrite;
> + }
> + ubuf += nwrite;
> + }
> + return count - nleft;
> +}
I'm not entirely sure if this can happen, but if vfs_write doesn't write
anything, but doesn't have an error, we could end up in an infinite loop. Like
I said, I'm not sure if that's even possible, but it's definitely one of those
things that, if it is possible, some random security guy is going to figure out
how to exploit at some point down the line.
> +
> +ssize_t kernel_write(struct file *file, loff_t offset,
> + const char *addr, size_t count)
> +{
> + mm_segment_t old_fs;
> + ssize_t result;
> +
> + old_fs = get_fs();
> + set_fs(get_ds());
> + /* The cast to a user pointer is valid due to the set_fs() */
> + result = _kernel_write(file, offset, (void __user *)addr, count);
> + set_fs(old_fs);
> + return result;
> +}
> +
> +EXPORT_SYMBOL(kernel_write);
> +
> static int exec_mmap(struct mm_struct *mm)
> {
> struct task_struct *tsk;
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 113386d..67b7d83 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -361,16 +361,6 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
>
> EXPORT_SYMBOL(vfs_write);
>
> -static inline loff_t file_pos_read(struct file *file)
> -{
> - return file->f_pos;
> -}
> -
> -static inline void file_pos_write(struct file *file, loff_t pos)
> -{
> - file->f_pos = pos;
> -}
> -
> SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
> {
> struct file *file;
> diff --git a/fs/splice.c b/fs/splice.c
> index 9313b61..188e17d 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -538,21 +538,6 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
> return res;
> }
>
> -static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> - loff_t pos)
> -{
> - mm_segment_t old_fs;
> - ssize_t res;
> -
> - old_fs = get_fs();
> - set_fs(get_ds());
> - /* The cast to a user pointer is valid due to the set_fs() */
> - res = vfs_write(file, (const char __user *)buf, count, &pos);
> - set_fs(old_fs);
> -
> - return res;
> -}
> -
> ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> struct pipe_inode_info *pipe, size_t len,
> unsigned int flags)
> @@ -1011,7 +996,7 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
> return ret;
>
> data = buf->ops->map(pipe, buf, 0);
> - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
> + ret = kernel_write(sd->u.file, sd->pos, data + buf->offset, sd->len);
> buf->ops->unmap(pipe, buf, data);
>
> return ret;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 39d57bc..9e8b171 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1548,6 +1548,16 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
> struct iovec *fast_pointer,
> struct iovec **ret_pointer);
>
> +static inline loff_t file_pos_read(struct file *file)
> +{
> + return file->f_pos;
> +}
> +
> +static inline void file_pos_write(struct file *file, loff_t pos)
> +{
> + file->f_pos = pos;
> +}
> +
> extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
> extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
> @@ -2127,7 +2137,8 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
> int open_flag, int mode, int acc_mode);
> extern int may_open(struct path *, int, int);
>
> -extern int kernel_read(struct file *, loff_t, char *, unsigned long);
> +extern ssize_t kernel_read(struct file *, loff_t, char *, size_t);
> +extern ssize_t kernel_write(struct file *, loff_t, const char *, size_t);
> extern struct file * open_exec(const char *);
>
> /* fs/dcache.c -- generic fs support functions */
> --
I'd say fix that little nit I had above and you can add
Reviewed-by: Josef Bacik <josef@redhat.com>
Thanks,
Josef
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v21 020/100] c/r: documentation
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
@ 2010-05-06 20:27 ` Randy Dunlap
2010-05-07 6:54 ` Oren Laadan
0 siblings, 1 reply; 24+ messages in thread
From: Randy Dunlap @ 2010-05-06 20:27 UTC (permalink / raw)
To: Oren Laadan
Cc: Andrew Morton, containers, linux-kernel, Serge Hallyn,
Matt Helsley, Pavel Emelyanov, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
On Sat, 1 May 2010 10:15:02 -0400 Oren Laadan wrote:
> Covers application checkpoint/restart, overall design, interfaces,
> usage, shared objects, and and checkpoint image format.
>
> Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> Acked-by: Serge E. Hallyn <serue@us.ibm.com>
> Tested-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
> Documentation/checkpoint/checkpoint.c | 38 +++
> Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
> Documentation/checkpoint/self_checkpoint.c | 69 +++++
> Documentation/checkpoint/self_restart.c | 40 +++
> Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
> 5 files changed, 764 insertions(+), 0 deletions(-)
> create mode 100644 Documentation/checkpoint/checkpoint.c
> create mode 100644 Documentation/checkpoint/readme.txt
> create mode 100644 Documentation/checkpoint/self_checkpoint.c
> create mode 100644 Documentation/checkpoint/self_restart.c
> create mode 100644 Documentation/checkpoint/usage.txt
> diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
> new file mode 100644
> index 0000000..4fa5560
> --- /dev/null
> +++ b/Documentation/checkpoint/readme.txt
> @@ -0,0 +1,370 @@
> +
...
> +In contrast, when checkpointing a subtree of a container it is up to
> +the user to ensure that dependencies either don't exist or can be
> +safely ignored. This is useful, for instance, for HPC scenarios or
> +even a user that would like to periodically checkpoint a long-running
who
> +batch job.
> +
...
> +
> +Checkpoint image format
> +=======================
> +
...
> +
> +The container configuration section containers information that is
contains
> +global to the container. Security (LSM) configuration is one example.
> +Network configuration and container-wide mounts may also go here, so
> +that the userspace restart coordinator can re-create a suitable
> +environment.
> +
...
> +
> +Then the state of all tasks is saved, in the order that they appear in
> +the tasks array above. For each state, we save data like task_struct,
> +namespaces, open files, memory layout, memory contents, cpu state,
CPU (throughout, please)
> +signals and signal handlers, etc. For resources that are shared among
> +multiple processes, we first checkpoint said resource (and only once),
> +and in the task data we give a reference to it. More about shared
> +resources below.
> +
...
> +
> +Shared objects
> +==============
> +
> +Many resources may be shared by multiple tasks (e.g. file descriptors,
> +memory address space, etc), or even have multiple references from
etc.),
> +other resources (e.g. a single inode that represents two ends of a
> +pipe).
> +
...
> +Memory contents format
> +======================
> +
> +The memory contents of a given memory address space (->mm) is dumped
are (I think)
> +as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
> +This header details the vma properties, and a reference to a file
> +(if file backed) or an inode (or shared memory) object.
> +
> +The vma header is followed by the actual contents - but only those
> +pages that need to be saved, i.e. dirty pages. They are written in
> +chunks of data, where each chunks contains a header that indicates
chunk
> +that number of pages in the chunk, followed by an array of virtual
the
> +addresses and then an array of actual page contents. The last chunk
> +holds zero pages.
> +
...
> +Kernel interfaces
> +=================
> +
> +* To checkpoint a vma, the 'struct vm_operations_struct' needs to
> + provide a method ->checkpoint:
> + int checkpoint(struct ckpt_ctx *, struct vma_struct *)
> + Restart requires a matching (exported) restore:
> + int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
> +
> +* To checkpoint a file, the 'struct file_operations' needs to provide
> + the methods ->checkpoint and ->collect:
> + int checkpoint(struct ckpt_ctx *, struct file *)
> + int collect(struct ckpt_ctx *, struct file *)
> + Restart requires a matching (exported) restore:
> + int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
> + For most file systems, generic_file_{checkpoint,restore}() can be
> + used.
> +
> +* To checkpoint a socket, the 'struct proto_ops' needs to provide
To checkpoint/restart a socket,
> + the methods ->checkpoint, ->collect and ->restore:
> + int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
> + int collect(struct ckpt_ctx *ctx, struct socket *sock);
> + int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
> diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
> new file mode 100644
> index 0000000..c6fc045
> --- /dev/null
> +++ b/Documentation/checkpoint/usage.txt
> @@ -0,0 +1,247 @@
> +
> + How to use Checkpoint-Restart
> + =========================================
> +
> +
> +API
> +===
> +
> +The API consists of three new system calls:
> +
> +* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
flags,
> +
> + Checkpoint a (sub-)container whose root task is identified by @pid,
> + to the open file indicated by @fd. If @logfd isn't -1, it indicates
> + an open file to which error and debug messages are written. @flags
> + may be one or more of:
> + - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
> + (other value are not allowed).
> +
> + Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
> + it returns from a restart, and -1 if an error occurs. The ckptid will
> + uniquely identify a checkpoint image, for as long as the checkpoint
> + is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
> + partial checkpoint, residing in kernel memory).
> +
> +* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
> +
> + Restart a process hierarchy from a checkpoint image that is read from
> + the blob stored in the file indicated by @fd. If @logfd isn't -1, it
> + indicates an open file to which error and debug messages are written.
> + @flags will have future meaning (must be 0 for now). @pid indicates
> + the root of the hierarchy as seen in the coordinator's pid-namespace,
> + and is expected to be a child of the coordinator. @flags may be one
> + or more of:
> + - RESTART_TASKSELF : (self) restart of a single process
> + - RESTART_FROEZN : processes remain frozen once restart completes
FROZEN ?
> + - RESTART_GHOST : process is a ghost (placeholder for a pid)
about @flags: Above says both of these:
a) @flags will have future meaning (must be 0 for now)
b) @flags may be one or more of:
so please decide which one it is ;)
> + (Note that this argument may mean 'ckptid' to identify an in-kernel
> + checkpoint image, with some @flags in the future).
> +
> + Returns: -1 if an error occurs, 0 on success when restarting from a
> + "self" checkpoint, and return value of system call at the time of the
> + checkpoint when restarting from an "external" checkpoint.
> +
...
> +
> +Sysctl/proc
> +===========
> +
> +/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
> + controls whether c/r operation is allowed for unprivileged users
C/R
> +
> +
> +Operation
> +=========
> +
> +The granularity of a checkpoint usually is a process hierarchy. The
> +'pid' argument is interpreted in the caller's pid namespace. So to
> +checkpoint a container whose init task (pid 1 in that pidns) appears
> +as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
> +pid 1 will attempt to checkpoint the caller's container, and if the
> +caller isn't privileged and init is owned by root, it will fail.
> +
> +Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
> +which does not refer to a container's init task, then sys_checkpoint()
> +would return -EINVAL.
returns -EINVAL.
...
> +
> +
> +User tools
> +==========
> +
> +* checkpoint(1): a tool to perform a checkpoint of a container/subtree
> +* restart(1): a tool to restart a container/subtree
> +* ckptinfo: a tool to examine a checkpoint image
> +
> +It is best to use the dedicated user tools for checkpoint and restart.
> +
> +If you insist, then here is a code snippet that illustrates how a
> +checkpoint is initiated by a process inside a container - the logic is
> +similar to fork():
> + ...
> + ckptid = checkpoint(0, ...);
> + switch (crid) {
(ckptid) ?
> + case -1:
> + perror("checkpoint failed");
> + break;
> + default:
> + fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
s/ret/ckptid/ ?
> + /* proceed with execution after checkpoint */
> + ...
> + break;
> + case 0:
> + fprintf(stderr, "returned after restart\n");
> + /* proceed with action required following a restart */
> + ...
> + break;
> + }
> + ...
> +
> +And to initiate a restart, the process in an empty container can use
> +logic similar to execve():
> + ...
> + if (restart(pid, ...) < 0)
> + perror("restart failed");
> + /* only get here if restart failed */
> + ...
> +
> +Note, that the code also supports "self" checkpoint, where a process
Note that
> +can checkpoint itself. This mode does not capture the relationships of
> +the task with other tasks, or any shared resources. It is useful for
> +application that wish to be able to save and restore their state.
applications
> +They will either not use (or care about) shared resources, or they
> +will be aware of the operations and adapt suitably after a restart.
> +The code above can also be used for "self" checkpoint.
> +
> +
> +You may find the following sample programs useful:
> +
> +* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
checkpoints
> +* self_checkpoint.c: a simple test program doing self-checkpoint
> +* self_restart.c: restarts a (self-) checkpoint image from stdin
> +
> +See also the utilities 'checkpoint' and 'restart' (from user-cr).
> +
> +
> +"External" checkpoint
> +=====================
> +
> +To do "external" checkpoint, you need to first freeze that other task
> +either using the freezer cgroup.
eh? cannot parse that.
> +
> +Restart does not preserve the original PID yet, (because we haven't
> +solved yet the fork-with-specific-pid issue). In a real scenario, you
> +probably want to first create a new names space, and have the init
namespace,
> +task there call 'sys_restart()'.
> +
> +I tested it this way:
...
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v21 020/100] c/r: documentation
2010-05-06 20:27 ` Randy Dunlap
@ 2010-05-07 6:54 ` Oren Laadan
0 siblings, 0 replies; 24+ messages in thread
From: Oren Laadan @ 2010-05-07 6:54 UTC (permalink / raw)
To: Randy Dunlap
Cc: Andrew Morton, containers, linux-kernel, Serge Hallyn,
Matt Helsley, Pavel Emelyanov, linux-api, linux-mm, linux-fsdevel,
netdev, Dave Hansen
Thanks for reading carefully through and pointing out
glitches and inconsistencies. I'll fix it for next post.
Oren.
On 05/06/2010 04:27 PM, Randy Dunlap wrote:
> On Sat, 1 May 2010 10:15:02 -0400 Oren Laadan wrote:
>
>> Covers application checkpoint/restart, overall design, interfaces,
>> usage, shared objects, and checkpoint image format.
>>
>> Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
>> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
>> Acked-by: Serge E. Hallyn <serue@us.ibm.com>
>> Tested-by: Serge E. Hallyn <serue@us.ibm.com>
>> ---
>> Documentation/checkpoint/checkpoint.c | 38 +++
>> Documentation/checkpoint/readme.txt | 370 ++++++++++++++++++++++++++++
>> Documentation/checkpoint/self_checkpoint.c | 69 +++++
>> Documentation/checkpoint/self_restart.c | 40 +++
>> Documentation/checkpoint/usage.txt | 247 +++++++++++++++++++
>> 5 files changed, 764 insertions(+), 0 deletions(-)
>> create mode 100644 Documentation/checkpoint/checkpoint.c
>> create mode 100644 Documentation/checkpoint/readme.txt
>> create mode 100644 Documentation/checkpoint/self_checkpoint.c
>> create mode 100644 Documentation/checkpoint/self_restart.c
>> create mode 100644 Documentation/checkpoint/usage.txt
>
>> diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
>> new file mode 100644
>> index 0000000..4fa5560
>> --- /dev/null
>> +++ b/Documentation/checkpoint/readme.txt
>> @@ -0,0 +1,370 @@
>> +
> ...
>> +In contrast, when checkpointing a subtree of a container it is up to
>> +the user to ensure that dependencies either don't exist or can be
>> +safely ignored. This is useful, for instance, for HPC scenarios or
>> +even a user that would like to periodically checkpoint a long-running
>
> who
>
>> +batch job.
>> +
> ...
>
>> +
>> +Checkpoint image format
>> +=======================
>> +
> ...
>
>> +
>> +The container configuration section containers information that is
>
> contains
>
>> +global to the container. Security (LSM) configuration is one example.
>> +Network configuration and container-wide mounts may also go here, so
>> +that the userspace restart coordinator can re-create a suitable
>> +environment.
>> +
> ...
>
>> +
>> +Then the state of all tasks is saved, in the order that they appear in
>> +the tasks array above. For each state, we save data like task_struct,
>> +namespaces, open files, memory layout, memory contents, cpu state,
>
> CPU (throughout, please)
>
>> +signals and signal handlers, etc. For resources that are shared among
>> +multiple processes, we first checkpoint said resource (and only once),
>> +and in the task data we give a reference to it. More about shared
>> +resources below.
>> +
> ...
>
>> +
>> +Shared objects
>> +==============
>> +
>> +Many resources may be shared by multiple tasks (e.g. file descriptors,
>> +memory address space, etc), or even have multiple references from
>
> etc.),
>
>> +other resources (e.g. a single inode that represents two ends of a
>> +pipe).
>> +
> ...
>
>> +Memory contents format
>> +======================
>> +
>> +The memory contents of a given memory address space (->mm) is dumped
>
> are (I think)
>
>> +as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
>> +This header details the vma properties, and a reference to a file
>> +(if file backed) or an inode (or shared memory) object.
>> +
>> +The vma header is followed by the actual contents - but only those
>> +pages that need to be saved, i.e. dirty pages. They are written in
>> +chunks of data, where each chunks contains a header that indicates
>
> chunk
>
>> +that number of pages in the chunk, followed by an array of virtual
>
> the
>
>> +addresses and then an array of actual page contents. The last chunk
>> +holds zero pages.
>> +
> ...
>
>> +Kernel interfaces
>> +=================
>> +
>> +* To checkpoint a vma, the 'struct vm_operations_struct' needs to
>> + provide a method ->checkpoint:
>> + int checkpoint(struct ckpt_ctx *, struct vma_struct *)
>> + Restart requires a matching (exported) restore:
>> + int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
>> +
>> +* To checkpoint a file, the 'struct file_operations' needs to provide
>> + the methods ->checkpoint and ->collect:
>> + int checkpoint(struct ckpt_ctx *, struct file *)
>> + int collect(struct ckpt_ctx *, struct file *)
>> + Restart requires a matching (exported) restore:
>> + int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
>> + For most file systems, generic_file_{checkpoint,restore}() can be
>> + used.
>> +
>> +* To checkpoint a socket, the 'struct proto_ops' needs to provide
>
> To checkpoint/restart a socket,
>
>> + the methods ->checkpoint, ->collect and ->restore:
>> + int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
>> + int collect(struct ckpt_ctx *ctx, struct socket *sock);
>> + int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
>
>
>> diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
>> new file mode 100644
>> index 0000000..c6fc045
>> --- /dev/null
>> +++ b/Documentation/checkpoint/usage.txt
>> @@ -0,0 +1,247 @@
>> +
>> + How to use Checkpoint-Restart
>> + =========================================
>> +
>> +
>> +API
>> +===
>> +
>> +The API consists of three new system calls:
>> +
>> +* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
>
> flags,
>
>> +
>> + Checkpoint a (sub-)container whose root task is identified by @pid,
>> + to the open file indicated by @fd. If @logfd isn't -1, it indicates
>> + an open file to which error and debug messages are written. @flags
>> + may be one or more of:
>> + - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
>> + (other values are not allowed).
>> +
>> + Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
>> + it returns from a restart, and -1 if an error occurs. The ckptid will
>> + uniquely identify a checkpoint image, for as long as the checkpoint
>> + is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
>> + partial checkpoint, residing in kernel memory).
>> +
>> +* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
>> +
>> + Restart a process hierarchy from a checkpoint image that is read from
>> + the blob stored in the file indicated by @fd. If @logfd isn't -1, it
>> + indicates an open file to which error and debug messages are written.
>> + @flags will have future meaning (must be 0 for now). @pid indicates
>> + the root of the hierarchy as seen in the coordinator's pid-namespace,
>> + and is expected to be a child of the coordinator. @flags may be one
>> + or more of:
>> + - RESTART_TASKSELF : (self) restart of a single process
>> + - RESTART_FROEZN : processes remain frozen once restart completes
>
> FROZEN ?
>
>> + - RESTART_GHOST : process is a ghost (placeholder for a pid)
>
> about @flags: Above says both of these:
> a) @flags will have future meaning (must be 0 for now)
> b) @flags may be one or more of:
>
> so please decide which one it is ;)
>
>> + (Note that this argument may mean 'ckptid' to identify an in-kernel
>> + checkpoint image, with some @flags in the future).
>> +
>> + Returns: -1 if an error occurs, 0 on success when restarting from a
>> + "self" checkpoint, and return value of system call at the time of the
>> + checkpoint when restarting from an "external" checkpoint.
>> +
> ...
>> +
>> +Sysctl/proc
>> +===========
>> +
>> +/proc/sys/kernel/ckpt_unpriv_allowed [default = 1]
>> + controls whether c/r operation is allowed for unprivileged users
>
> C/R
>
>> +
>> +
>> +Operation
>> +=========
>> +
>> +The granularity of a checkpoint usually is a process hierarchy. The
>> +'pid' argument is interpreted in the caller's pid namespace. So to
>> +checkpoint a container whose init task (pid 1 in that pidns) appears
>> +as pid 3497 in the caller's pidns, the caller must use pid 3497. Passing
>> +pid 1 will attempt to checkpoint the caller's container, and if the
>> +caller isn't privileged and init is owned by root, it will fail.
>> +
>> +Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
>> +which does not refer to a container's init task, then sys_checkpoint()
>> +would return -EINVAL.
>
> returns -EINVAL.
>
> ...
>
>> +
>> +
>> +User tools
>> +==========
>> +
>> +* checkpoint(1): a tool to perform a checkpoint of a container/subtree
>> +* restart(1): a tool to restart a container/subtree
>> +* ckptinfo: a tool to examine a checkpoint image
>> +
>> +It is best to use the dedicated user tools for checkpoint and restart.
>> +
>> +If you insist, then here is a code snippet that illustrates how a
>> +checkpoint is initiated by a process inside a container - the logic is
>> +similar to fork():
>> + ...
>> + ckptid = checkpoint(0, ...);
>> + switch (crid) {
>
> (ckptid) ?
>
>> + case -1:
>> + perror("checkpoint failed");
>> + break;
>> + default:
>> + fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>
> s/ret/ckptid/ ?
>
>> + /* proceed with execution after checkpoint */
>> + ...
>> + break;
>> + case 0:
>> + fprintf(stderr, "returned after restart\n");
>> + /* proceed with action required following a restart */
>> + ...
>> + break;
>> + }
>> + ...
>> +
>> +And to initiate a restart, the process in an empty container can use
>> +logic similar to execve():
>> + ...
>> + if (restart(pid, ...) < 0)
>> + perror("restart failed");
>> + /* only get here if restart failed */
>> + ...
>> +
>> +Note, that the code also supports "self" checkpoint, where a process
>
> Note that
>
>> +can checkpoint itself. This mode does not capture the relationships of
>> +the task with other tasks, or any shared resources. It is useful for
>> +application that wish to be able to save and restore their state.
>
> applications
>
>> +They will either not use (or care about) shared resources, or they
>> +will be aware of the operations and adapt suitably after a restart.
>> +The code above can also be used for "self" checkpoint.
>> +
>> +
>> +You may find the following sample programs useful:
>> +
>> +* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
>
> checkpoints
>
>> +* self_checkpoint.c: a simple test program doing self-checkpoint
>> +* self_restart.c: restarts a (self-) checkpoint image from stdin
>> +
>> +See also the utilities 'checkpoint' and 'restart' (from user-cr).
>> +
>> +
>> +"External" checkpoint
>> +=====================
>> +
>> +To do "external" checkpoint, you need to first freeze that other task
>> +either using the freezer cgroup.
>
> eh? cannot parse that.
>
>> +
>> +Restart does not preserve the original PID yet, (because we haven't
>> +solved yet the fork-with-specific-pid issue). In a real scenario, you
>> +probably want to first create a new names space, and have the init
>
> namespace,
>
>> +task there call 'sys_restart()'.
>> +
>> +I tested it this way:
>
> ...
>
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2010-05-07 6:54 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>
2010-05-01 14:15 ` [PATCH v21 019/100] Make file_pos_read/write() public and export kernel_write() Oren Laadan
2010-05-06 12:26 ` Josef Bacik
2010-05-01 14:15 ` [PATCH v21 020/100] c/r: documentation Oren Laadan
2010-05-06 20:27 ` Randy Dunlap
2010-05-07 6:54 ` Oren Laadan
2010-05-01 14:15 ` [PATCH v21 022/100] c/r: basic infrastructure for checkpoint/restart Oren Laadan
2010-05-01 14:15 ` [PATCH v21 036/100] c/r: introduce vfs_fcntl() Oren Laadan
2010-05-01 14:15 ` [PATCH v21 037/100] c/r: introduce new 'file_operations': ->checkpoint, ->collect() Oren Laadan
2010-05-01 14:15 ` [PATCH v21 038/100] c/r: checkpoint and restart open file descriptors Oren Laadan
2010-05-01 14:15 ` [PATCH v21 039/100] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
2010-05-01 14:15 ` [PATCH v21 041/100] c/r: dump memory address space (private memory) Oren Laadan
2010-05-01 14:15 ` [PATCH v21 042/100] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
2010-05-01 14:15 ` [PATCH v21 043/100] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
2010-05-01 14:15 ` [PATCH v21 044/100] c/r: add checkpoint operation for opened files of generic filesystems Oren Laadan
2010-05-01 14:15 ` [PATCH v21 046/100] c/r: dump anonymous- and file-mapped- shared memory Oren Laadan
2010-05-01 14:15 ` [PATCH v21 047/100] splice: export pipe/file-to-pipe/file functionality Oren Laadan
[not found] ` <1272723382-19470-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-05-01 14:15 ` [PATCH v21 048/100] c/r: support for open pipes Oren Laadan
2010-05-01 14:15 ` [PATCH v21 049/100] c/r: checkpoint and restore FIFOs Oren Laadan
2010-05-01 14:15 ` [PATCH v21 050/100] c/r: refuse to checkpoint if monitoring directories with dnotify Oren Laadan
2010-05-01 14:15 ` [PATCH v21 063/100] c/r: restore file->f_cred Oren Laadan
2010-05-01 14:16 ` [PATCH v21 079/100] c/r: checkpoint/restart epoll sets Oren Laadan
2010-05-01 14:16 ` [PATCH v21 080/100] c/r: checkpoint/restart eventfd Oren Laadan
2010-05-01 14:16 ` [PATCH v21 081/100] c/r: restore task fs_root and pwd (v3) Oren Laadan
2010-05-01 14:16 ` [PATCH v21 082/100] c/r: preliminary support mounts namespace Oren Laadan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).