Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v21 074/100] c/r: add support for listening INET sockets (v2)
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
	Pavel Emelyanov, Dan Smith, netdev, Andrew Morton
In-Reply-To: <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>

From: Dan Smith <danms@us.ibm.com>

This is an incremental step towards supporting checkpoint/restart on
AF_INET sockets.  In this scenario, any sockets that were in TCP_LISTEN
state are restored as they were.  Any that were connected are forced to
TCP_CLOSE.  This should cover a range of use cases that involve
applications that are tolerant of such an interruption.

Changelog [v21]:
  - Do not include checkpoint_hdr.h explicitly
Changelog [v19-rc1]:
  - [Matt Helsley] Add cpp definitions for enums
Changes in v2:
 - Fix whitespace
 - Fix return in inet_checkpoint() on failed ckpt_hdr_get_type()
 - Fix garbage free on error path of inet_read_buffer()
 - Fix unnecessary ret=0 in inet_read_buffers()
 - Add inet_precheck() (like unix) to validate the address lengths (and
   more later)

Cc: netdev@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Smith <danms@us.ibm.com>
Acked-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
 Documentation/checkpoint/readme.txt |   21 ++++
 include/linux/checkpoint_hdr.h      |   15 +++
 include/net/inet_common.h           |   13 +++
 net/checkpoint.c                    |    9 ++
 net/ipv4/Makefile                   |    1 +
 net/ipv4/af_inet.c                  |    6 +
 net/ipv4/checkpoint.c               |  189 +++++++++++++++++++++++++++++++++++
 7 files changed, 254 insertions(+), 0 deletions(-)
 create mode 100644 net/ipv4/checkpoint.c

diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
index 4fa5560..2548bb4 100644
--- a/Documentation/checkpoint/readme.txt
+++ b/Documentation/checkpoint/readme.txt
@@ -344,6 +344,27 @@ we will be forced to more carefully review each of those features.
 However, this can be controlled with a sysctl-variable.
 
 
+Sockets
+=======
+
+For AF_UNIX sockets, both endpoints must be within the checkpointed
+task set to maintain a connected state after restart.  UNIX sockets
+that are in the process of passing a descriptor will cause the
+checkpoint to fail with -EBUSY indicating a transient state that
+cannot be checkpointed.  Listening sockets with an unaccepted peer
+will also cause an -EBUSY result.
+
+AF_INET sockets with endpoints outside the checkpointed task set may
+remain open if care is taken to avoid TCP timeouts and resets.
+Careful use of a virtual IP address can help avoid emission of an RST
+to the non-checkpointed endpoint.  If desired, the
+RESTART_SOCK_LISTENONLY flag may be passed to the restart syscall
+which will cause all connected AF_INET sockets to be closed during the
+restore process.  Listening sockets will still be restored to their
+original state, which makes this mode a candidate for something like
+an HTTP server.
+
+
 Kernel interfaces
 =================
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 2be2d2c..07934ee 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/un.h>
+#include <linux/in.h>
 
 #ifndef CONFIG_CHECKPOINT
 #error linux/checkpoint_hdr.h included directly (without CONFIG_CHECKPOINT)
@@ -21,10 +22,14 @@
 
 #else /* __KERNEL__ */
 
+#else
+
 #include <sys/types.h>
 #include <linux/types.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <sys/un.h>
+#include <netinet/in.h>
 
 #endif
 
@@ -159,6 +164,8 @@ enum {
 #define CKPT_HDR_SOCKET_FRAG CKPT_HDR_SOCKET_FRAG
 	CKPT_HDR_SOCKET_UNIX,
 #define CKPT_HDR_SOCKET_UNIX CKPT_HDR_SOCKET_UNIX
+	CKPT_HDR_SOCKET_INET,
+#define CKPT_HDR_SOCKET_INET CKPT_HDR_SOCKET_INET
 
 	CKPT_HDR_TAIL = 9001,
 #define CKPT_HDR_TAIL CKPT_HDR_TAIL
@@ -577,6 +584,14 @@ struct ckpt_hdr_socket_unix {
 	struct sockaddr_un raddr;
 } __attribute__ ((aligned(8)));
 
+struct ckpt_hdr_socket_inet {
+	struct ckpt_hdr h;
+	__u32 laddr_len;
+	__u32 raddr_len;
+	struct sockaddr_in laddr;
+	struct sockaddr_in raddr;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_socket {
 	struct ckpt_hdr_file common;
 	__s32 sock_objref;
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 18c7732..bf04e6e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -45,6 +45,19 @@ extern int			inet_ctl_sock_create(struct sock **sk,
 						     unsigned char protocol,
 						     struct net *net);
 
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+extern int inet_collect(struct ckpt_ctx *ctx, struct socket *sock);
+extern int inet_restore(struct ckpt_ctx *cftx, struct socket *sock,
+			struct ckpt_hdr_socket *h);
+#else
+#define inet_checkpoint NULL
+#define inet_collect NULL
+#define inet_restore NULL
+#endif /* CONFIG_CHECKPOINT */
+
 static inline void inet_ctl_sock_destroy(struct sock *sk)
 {
 	sk_release_kernel(sk);
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 9116d7a..d972044 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -930,6 +930,15 @@ static void *restore_sock(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		goto err;
 
+	if ((h->sock_common.family == AF_INET) &&
+	    (h->sock.state != TCP_LISTEN)) {
+		/* Temporary hack to enable restore of TCP_LISTEN sockets
+		 * while forcing anything else to a closed state
+		 */
+		sock->sk->sk_state = TCP_CLOSE;
+		sock->state = SS_UNCONNECTED;
+	}
+
 	ckpt_hdr_put(ctx, h);
 	return sock->sk;
  err:
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87c..c00d8ce 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f713574..8b7d3dd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -876,6 +876,9 @@ const struct proto_ops inet_stream_ops = {
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = tcp_sendpage,
 	.splice_read	   = tcp_splice_read,
+	.checkpoint        = inet_checkpoint,
+	.restore           = inet_restore,
+	.collect           = inet_collect,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
@@ -902,6 +905,9 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = sock_common_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.checkpoint        = inet_checkpoint,
+	.restore           = inet_restore,
+	.collect           = inet_collect,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
new file mode 100644
index 0000000..0b62a15
--- /dev/null
+++ b/net/ipv4/checkpoint.c
@@ -0,0 +1,189 @@
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author(s): Dan Smith <danms@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/namei.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
+#include <linux/deferqueue.h>
+#include <linux/checkpoint.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+struct dq_sock {
+	struct ckpt_ctx *ctx;
+	struct sock *sk;
+};
+
+struct dq_buffers {
+	struct ckpt_ctx *ctx;
+	struct sock *sk;
+};
+
+int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
+{
+	struct ckpt_hdr_socket_inet *in;
+	int ret;
+
+	in = ckpt_hdr_get_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+	if (!in)
+		return -EINVAL;
+
+	ret = ckpt_sock_getnames(ctx, sock,
+				(struct sockaddr *)&in->laddr, &in->laddr_len,
+				(struct sockaddr *)&in->raddr, &in->raddr_len);
+	if (ret)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
+ out:
+	ckpt_hdr_put(ctx, in);
+
+	return ret;
+}
+
+int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
+{
+	return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
+}
+
+static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct sk_buff *skb = NULL;
+
+	skb = sock_restore_skb(ctx);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	skb_queue_tail(queue, skb);
+	return skb->len;
+}
+
+static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_queue *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	for (i = 0; i < h->skb_count; i++) {
+		ret = inet_read_buffer(ctx, queue);
+		ckpt_debug("read inet buffer %i: %i", i, ret);
+		if (ret < 0)
+			goto out;
+
+		if (ret > h->total_bytes) {
+			ret = -EINVAL;
+			ckpt_err(ctx, ret, "Buffers exceeded claim");
+			goto out;
+		}
+
+		h->total_bytes -= ret;
+	}
+
+	ret = h->skb_count;
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int inet_deferred_restore_buffers(void *data)
+{
+	struct dq_buffers *dq = (struct dq_buffers *)data;
+	struct ckpt_ctx *ctx = dq->ctx;
+	struct sock *sk = dq->sk;
+	int ret;
+
+	ret = inet_read_buffers(ctx, &sk->sk_receive_queue);
+	ckpt_debug("(R) inet_read_buffers: %i\n", ret);
+	if (ret < 0)
+		return ret;
+
+	ret = inet_read_buffers(ctx, &sk->sk_write_queue);
+	ckpt_debug("(W) inet_read_buffers: %i\n", ret);
+
+	return ret;
+}
+
+static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct dq_buffers dq;
+
+	dq.ctx = ctx;
+	dq.sk = sk;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      inet_deferred_restore_buffers, NULL);
+}
+
+static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
+{
+	if (in->laddr_len > sizeof(struct sockaddr_in)) {
+		ckpt_debug("laddr_len is too big\n");
+		return -EINVAL;
+	}
+
+	if (in->raddr_len > sizeof(struct sockaddr_in)) {
+		ckpt_debug("raddr_len is too big\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int inet_restore(struct ckpt_ctx *ctx,
+		 struct socket *sock,
+		 struct ckpt_hdr_socket *h)
+{
+	struct ckpt_hdr_socket_inet *in;
+	int ret = 0;
+
+	in = ckpt_read_obj_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+	if (IS_ERR(in))
+		return PTR_ERR(in);
+
+	ret = inet_precheck(sock, in);
+	if (ret < 0)
+		goto out;
+
+	/* Listening sockets and those that are closed but have a local
+	 * address need to call bind()
+	 */
+	if ((h->sock.state == TCP_LISTEN) ||
+	    ((h->sock.state == TCP_CLOSE) && (in->laddr_len > 0))) {
+		sock->sk->sk_reuse = 2;
+		inet_sk(sock->sk)->freebind = 1;
+		ret = sock->ops->bind(sock,
+				      (struct sockaddr *)&in->laddr,
+				      in->laddr_len);
+		ckpt_debug("inet bind: %i\n", ret);
+		if (ret < 0)
+			goto out;
+
+		if (h->sock.state == TCP_LISTEN) {
+			ret = sock->ops->listen(sock, h->sock.backlog);
+			ckpt_debug("inet listen: %i\n", ret);
+			if (ret < 0)
+				goto out;
+		}
+	} else {
+		if (!sock_flag(sock->sk, SOCK_DEAD))
+			ret = inet_defer_restore_buffers(ctx, sock->sk);
+	}
+ out:
+	ckpt_hdr_put(ctx, in);
+
+	return ret;
+}
+
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH v21 072/100] c/r: introduce checkpoint/restore methods to struct proto_ops
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
	Pavel Emelyanov, Oren Laadan, netdev
In-Reply-To: <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>

This adds new 'proto_ops' function for checkpointing and restoring
sockets. This allows the checkpoint/restart code to compile nicely
when, e.g., AF_UNIX sockets are selected as a module.

It also adds a function 'collecting' a socket for leak-detection
during full-container checkpoint. This is useful for those sockets
that hold references to other "collectable" objects. Two examples are
AF_UNIX buffers which reference the socket of origin, and sockets that
have file descriptors in-transit.

Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/net.h |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 4157b5d..1f32c70 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -153,6 +153,9 @@ struct sockaddr;
 struct msghdr;
 struct module;
 
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+
 struct proto_ops {
 	int		family;
 	struct module	*owner;
@@ -201,6 +204,12 @@ struct proto_ops {
 				      int offset, size_t size, int flags);
 	ssize_t 	(*splice_read)(struct socket *sock,  loff_t *ppos,
 				       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+	int		(*checkpoint)(struct ckpt_ctx *ctx,
+				      struct socket *sock);
+	int		(*collect)(struct ckpt_ctx *ctx,
+				   struct socket *sock);
+	int		(*restore)(struct ckpt_ctx *ctx, struct socket *sock,
+				   struct ckpt_hdr_socket *h);
 };
 
 #define DECLARE_SOCKADDR(type, dst, src)	\
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH v21 022/100] c/r: basic infrastructure for checkpoint/restart
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
	Pavel Emelyanov, Oren Laadan, linux-mm, linux-fsdevel, netdev
In-Reply-To: <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>

Add those interfaces, as well as helpers needed to easily manage the
file format. The code is roughly broken out as follows:

kernel/checkpoint/sys.c - user/kernel data transfer, as well as setup
  of the c/r context (a per-checkpoint data structure for housekeeping)

kernel/checkpoint/checkpoint.c - output wrappers and checkpoint handling

kernel/checkpoint/restart.c - input wrappers and restart handling

kernel/checkpoint/process.c - c/r of task data

For now, we can only checkpoint the 'current' task ("self" checkpoint),
and the 'pid' argument to the syscall is ignored.

Patches to add the per-architecture support as well as the actual
work to do the memory checkpoint follow in subsequent patches.

Changelog[v21]:
  - Complain if checkpoint_hdr.h included without CONFIG_CHECKPOINT
  - Do not include checkpoint_hdr.h explicitly
  - Consolidate ckpt_read/write with kernel_read/write
  - Reorganize code:move checkpoint/* to kernel/checkpoint/*
  - [Christoffer Dall] Fix trivial bug in ckpt_msg macro
Changelog[v20]:
  - Export key symbols to enable c/r from kernel modules
Changelog[v19]:
  - [Serge Hallyn] Use ckpt_err() to for bad header values
Changelog[v19-rc3]:
  - sys_{checkpoint,restart} to use ptregs prototype
Changelog[v19-rc1]:
  - Set ctx->errno in do_ckpt_msg() if needed
  - Document prototype of ckpt_write_err in header
  - Update prototype of ckpt_read_obj()
  - Fix up headers so we can munge them for use by userspace
  - [Matt Helsley] Check for empty string for _ckpt_write_err()
  - [Matt Helsley] Add cpp definitions for enums
  - [Serge Hallyn] Add global section container to image format
  - [Matt Helsley] Fix total byte read/write count for large images
  - ckpt_read_buf_type() to accept max payload (excludes ckpt_hdr)
  - [Serge Hallyn] Define new api for error and debug logging
  - Use logfd in sys_{checkpoint,restart}
Changelog[v18]:
  - Detect error-headers in input data on restart, and abort.
  - Standard format for checkpoint error strings (and documentation)
  - [Matt Helsley] Rename headerless struct ckpt_hdr_* to struct ckpt_*
  - [Dan Smith] Add an errno validation function
  - Add ckpt_read_payload(): read a variable-length object (no header)
  - Add ckpt_read_string(): same for strings (ensures null-terminated)
  - Add ckpt_read_consume(): consumes next object without processing
Changelog[v17]:
  - Fix compilation for architectures that don't support checkpoint
  - Save/restore t->{set,clear}_child_tid
  - Restart(2) isn't idempotent: must return -EINTR if interrupted
  - ckpt_debug does not depend on DYNAMIC_DEBUG, on by default
  - Export generic checkpoint headers to userespace
  - Fix comment for prototype of sys_restart
  - Have ckpt_debug() print global-pid and __LINE__
  - Only save and test kernel constants once (in header)
Changelog[v16]:
  - Split ctx->flags to ->uflags (user flags) and ->kflags (kernel flags)
  - Introduce __ckpt_write_err() and ckpt_write_err() to report errors
  - Allow @ptr == NULL to write (or read) header only without payload
  - Introduce _ckpt_read_obj_type()
Changelog[v15]:
  - Replace header buffer in ckpt_ctx (hbuf,hpos) with kmalloc/kfree()
Changelog[v14]:
  - Cleanup interface to get/put hdr buffers
  - Merge checkpoint and restart code into a single file (per subsystem)
  - Take uts_sem around access to uts->{release,version,machine}
  - Embed ckpt_hdr in all ckpt_hdr_...., cleanup read/write helpers
  - Define sys_checkpoint(0,...) as asking for a self-checkpoint (Serge)
  - Revert use of 'pr_fmt' to avoid tainting whom includes us (Nathan Lynch)
  - Explicitly indicate length of UTS fields in header
  - Discard field 'h->parent' from ckpt_hdr
Changelog[v12]:
  - ckpt_kwrite/ckpt_kread() again use vfs_read(), vfs_write() (safer)
  - Split ckpt_write/ckpt_read() to two parts: _ckpt_write/read() helper
  - Befriend with sparse : explicit conversion to 'void __user *'
  - Redfine 'pr_fmt' instead of using special ckpt_debug()
Changelog[v10]:
  - add ckpt_write_buffer(), ckpt_read_buffer() and ckpt_read_buf_type()
  - force end-of-string in ckpt_read_string() (fix possible DoS)
Changelog[v9]:
  - ckpt_kwrite/ckpt_kread() use file->f_op->write() directly
  - Drop ckpt_uwrite/ckpt_uread() since they aren't used anywhere
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (although it's not really needed)
Changelog[v5]:
  - Rename headers files s/ckpt/checkpoint/
Changelog[v2]:
  - Added utsname->{release,version,machine} to checkpoint header
  - Pad header structures to 64 bits to ensure compatibility

Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
 arch/x86/include/asm/unistd_32.h   |    2 -
 arch/x86/kernel/syscall_table_32.S |    2 -
 include/linux/Kbuild               |    3 +
 include/linux/checkpoint.h         |  202 ++++++++++++++++
 include/linux/checkpoint_hdr.h     |  135 +++++++++++
 include/linux/checkpoint_types.h   |   44 ++++
 include/linux/magic.h              |    3 +
 include/linux/syscalls.h           |    4 -
 kernel/checkpoint/Makefile         |    6 +-
 kernel/checkpoint/checkpoint.c     |  213 +++++++++++++++++
 kernel/checkpoint/process.c        |  101 ++++++++
 kernel/checkpoint/restart.c        |  460 +++++++++++++++++++++++++++++++++++
 kernel/checkpoint/sys.c            |  461 +++++++++++++++++++++++++++++++++++-
 lib/Kconfig.debug                  |   13 +
 14 files changed, 1632 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/checkpoint.h
 create mode 100644 include/linux/checkpoint_hdr.h
 create mode 100644 include/linux/checkpoint_types.h
 create mode 100644 kernel/checkpoint/checkpoint.c
 create mode 100644 kernel/checkpoint/process.c
 create mode 100644 kernel/checkpoint/restart.c

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 007d7cd..cb67842 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -344,8 +344,6 @@
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
 #define __NR_eclone		338
-#define __NR_checkpoint		339
-#define __NR_restart		340
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 2d5a6b0..0c92570 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,5 +338,3 @@ ENTRY(sys_call_table)
 	.long sys_perf_event_open
 	.long sys_recvmmsg
 	.long ptregs_eclone
-	.long sys_checkpoint
-	.long sys_restart		/* 340 */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e2ea0b2..71bb8d1 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -45,6 +45,9 @@ header-y += bsg.h
 header-y += can.h
 header-y += cciss_defs.h
 header-y += cdk.h
+header-y += checkpoint.h
+header-y += checkpoint_hdr.h
+header-y += checkpoint_types.h
 header-y += chio.h
 header-y += coda_psdev.h
 header-y += coff.h
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
new file mode 100644
index 0000000..4bb5b8d
--- /dev/null
+++ b/include/linux/checkpoint.h
@@ -0,0 +1,202 @@
+#ifndef _LINUX_CHECKPOINT_H_
+#define _LINUX_CHECKPOINT_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define CHECKPOINT_VERSION  3
+
+/* misc user visible */
+#define CHECKPOINT_FD_NONE	-1
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CHECKPOINT
+
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/err.h>
+
+/* sycall helpers */
+extern long do_sys_checkpoint(pid_t pid, int fd,
+			      unsigned long flags, int logfd);
+extern long do_sys_restart(pid_t pid, int fd,
+			   unsigned long flags, int logfd);
+
+/* ckpt_ctx: kflags */
+#define CKPT_CTX_CHECKPOINT_BIT		0
+#define CKPT_CTX_RESTART_BIT		1
+#define CKPT_CTX_ERROR_BIT		3
+
+#define CKPT_CTX_CHECKPOINT	(1 << CKPT_CTX_CHECKPOINT_BIT)
+#define CKPT_CTX_RESTART	(1 << CKPT_CTX_RESTART_BIT)
+#define CKPT_CTX_ERROR		(1 << CKPT_CTX_ERROR_BIT)
+
+
+extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, size_t count);
+extern int ckpt_kread(struct ckpt_ctx *ctx, void *buf, size_t count);
+
+extern void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int n);
+extern void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr);
+extern void *ckpt_hdr_get(struct ckpt_ctx *ctx, int n);
+extern void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int n, int type);
+
+extern int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h);
+extern int ckpt_write_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len);
+
+extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
+extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type);
+extern int ckpt_read_payload(struct ckpt_ctx *ctx,
+			     void **ptr, int max, int type);
+extern char *ckpt_read_string(struct ckpt_ctx *ctx, int max);
+extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
+
+extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
+extern long do_restart(struct ckpt_ctx *ctx, pid_t pid);
+
+/* task */
+extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_task(struct ckpt_ctx *ctx);
+
+static inline int ckpt_validate_errno(int errno)
+{
+	return (errno >= 0) && (errno < MAX_ERRNO);
+}
+
+/* debugging flags */
+#define CKPT_DBASE	0x1		/* anything */
+#define CKPT_DSYS	0x2		/* generic (system) */
+#define CKPT_DRW	0x4		/* image read/write */
+
+#define CKPT_DDEFAULT	0xffff		/* default debug level */
+
+#ifndef CKPT_DFLAG
+#define CKPT_DFLAG	0xffff		/* everything */
+#endif
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+extern unsigned long ckpt_debug_level;
+
+/*
+ * This is deprecated
+ */
+/* use this to select a specific debug level */
+#define _ckpt_debug(level, fmt, args...)				\
+	do {								\
+		if (ckpt_debug_level & (level))				\
+			printk(KERN_DEBUG "[%d:%d:c/r:%s:%d] " fmt,	\
+				current->pid,				\
+				current->nsproxy ?			\
+				task_pid_vnr(current) : -1,		\
+				__func__, __LINE__, ## args);		\
+	} while (0)
+
+/*
+ * CKPT_DBASE is the base flags, doesn't change
+ * CKPT_DFLAG is to be redfined in each source file
+ */
+#define ckpt_debug(fmt, args...)  \
+	_ckpt_debug(CKPT_DBASE | CKPT_DFLAG, fmt, ## args)
+
+#else
+
+/*
+ * This is deprecated
+ */
+#define _ckpt_debug(level, fmt, args...)	do { } while (0)
+#define ckpt_debug(fmt, args...)		do { } while (0)
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
+
+/*
+ * prototypes for the new logging api
+ */
+
+extern void ckpt_msg_lock(struct ckpt_ctx *ctx);
+extern void ckpt_msg_unlock(struct ckpt_ctx *ctx);
+
+extern void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+extern void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+
+/*
+ * Append formatted msg to ctx->msg[ctx->msg_len].
+ * Must be called after expanding format.
+ * May be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...);
+
+/*
+ * Write ctx->msg to all relevant places.
+ * Must not be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_complete(struct ckpt_ctx *ctx);
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will not write the message out to the applicable files, so
+ * the caller will have to use _ckpt_msg_complete() to finish up.
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must be called with ckpt_msg_lock held.
+ */
+#define _ckpt_msg(ctx, fmt, args...) do {	\
+	_do_ckpt_msg(ctx, 0, fmt, ##args);	\
+} while (0)
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+#define ckpt_msg(ctx, fmt, args...) do {	\
+	do_ckpt_msg(ctx, 0, fmt, ##args);	\
+} while (0)
+
+/*
+ * Report an error.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @err is the error value
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+
+#define ckpt_err(ctx, err, fmt, args...) do {				\
+	do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+/*
+ * Same as ckpt_err() but
+ *	must be called with ctx->msg_mutex held
+ *	can be called under spinlock
+ *	must be followed by a call to _ckpt_msg_complete()
+ */
+#define _ckpt_err(ctx, err, fmt, args...) do {				\
+	_do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#endif /* CONFIG_CHECKPOINT */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_H_ */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
new file mode 100644
index 0000000..7ccebc7
--- /dev/null
+++ b/include/linux/checkpoint_hdr.h
@@ -0,0 +1,135 @@
+#ifndef _CHECKPOINT_CKPT_HDR_H_
+#define _CHECKPOINT_CKPT_HDR_H_
+/*
+ *  Generic container checkpoint-restart
+ *
+ *  Copyright (C) 2008-2010 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#ifndef __KERNEL__
+#include <sys/types.h>
+#include <linux/types.h>
+#endif
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+
+#ifndef CONFIG_CHECKPOINT
+#error linux/checkpoint_hdr.h included directly (without CONFIG_CHECKPOINT)
+#endif
+
+#endif
+
+#include <linux/utsname.h>
+
+/*
+ * To maintain compatibility between 32-bit and 64-bit architecture flavors,
+ * keep data 64-bit aligned: use padding for structure members, and use
+ * __attribute__((aligned (8))) for the entire structure.
+ *
+ * Quoting Arnd Bergmann:
+ *   "This structure has an odd multiple of 32-bit members, which means
+ *   that if you put it into a larger structure that also contains 64-bit
+ *   members, the larger structure may get different alignment on x86-32
+ *   and x86-64, which you might want to avoid. I can't tell if this is
+ *   an actual problem here. ... In this case, I'm pretty sure that
+ *   sizeof(ckpt_hdr_task) on x86-32 is different from x86-64, since it
+ *   will be 32-bit aligned on x86-32."
+ */
+
+/*
+ * header format: 'struct ckpt_hdr' must prefix all other headers. Therfore
+ * when a header is passed around, the information about it (type, size)
+ * is readily available. Structs that include a struct ckpt_hdr are named
+ * struct ckpt_hdr_* by convention (usualy the struct ckpt_hdr is the first
+ * member).
+ */
+struct ckpt_hdr {
+	__u32 type;
+	__u32 len;
+} __attribute__((aligned(8)));
+
+/* header types */
+enum {
+	CKPT_HDR_HEADER = 1,
+#define CKPT_HDR_HEADER CKPT_HDR_HEADER
+	CKPT_HDR_CONTAINER,
+#define CKPT_HDR_CONTAINER CKPT_HDR_CONTAINER
+	CKPT_HDR_BUFFER,
+#define CKPT_HDR_BUFFER CKPT_HDR_BUFFER
+	CKPT_HDR_STRING,
+#define CKPT_HDR_STRING CKPT_HDR_STRING
+
+	CKPT_HDR_TASK = 101,
+#define CKPT_HDR_TASK CKPT_HDR_TASK
+
+	CKPT_HDR_TAIL = 9001,
+#define CKPT_HDR_TAIL CKPT_HDR_TAIL
+
+	CKPT_HDR_ERROR = 9999,
+#define CKPT_HDR_ERROR CKPT_HDR_ERROR
+};
+
+/* kernel constants */
+struct ckpt_const {
+	/* task */
+	__u16 task_comm_len;
+	/* uts */
+	__u16 uts_release_len;
+	__u16 uts_version_len;
+	__u16 uts_machine_len;
+} __attribute__((aligned(8)));
+
+/* checkpoint image header */
+struct ckpt_hdr_header {
+	struct ckpt_hdr h;
+	__u64 magic;
+
+	__u16 _padding;
+
+	__u16 major;
+	__u16 minor;
+	__u16 patch;
+	__u16 rev;
+
+	struct ckpt_const constants;
+
+	__u64 time;	/* when checkpoint taken */
+	__u64 uflags;	/* uflags from checkpoint */
+
+	/*
+	 * the header is followed by three strings:
+	 *   char release[const.uts_release_len];
+	 *   char version[const.uts_version_len];
+	 *   char machine[const.uts_machine_len];
+	 */
+} __attribute__((aligned(8)));
+
+/* checkpoint image trailer */
+struct ckpt_hdr_tail {
+	struct ckpt_hdr h;
+	__u64 magic;
+} __attribute__((aligned(8)));
+
+/* container configuration section header */
+struct ckpt_hdr_container {
+	struct ckpt_hdr h;
+} __attribute__((aligned(8)));;
+
+/* task data */
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 state;
+	__u32 exit_state;
+	__u32 exit_code;
+	__u32 exit_signal;
+
+	__u64 set_child_tid;
+	__u64 clear_child_tid;
+} __attribute__((aligned(8)));
+
+#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
new file mode 100644
index 0000000..13d6dd5
--- /dev/null
+++ b/include/linux/checkpoint_types.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_CHECKPOINT_TYPES_H_
+#define _LINUX_CHECKPOINT_TYPES_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+
+struct ckpt_ctx {
+	int crid;		/* unique checkpoint id */
+
+	pid_t root_pid;		/* container identifier */
+
+	unsigned long kflags;	/* kerenl flags */
+	unsigned long uflags;	/* user flags */
+	unsigned long oflags;	/* restart: uflags from checkpoint */
+
+	struct file *file;	/* input/output file */
+	struct file *logfile;	/* status/debug log file */
+	loff_t total;		/* total read/written */
+
+	struct task_struct *tsk;/* checkpoint: current target task */
+	char err_string[256];	/* checkpoint: error string */
+
+	int errno;		/* errno that caused failure */
+
+#define CKPT_MSG_LEN 1024
+	char fmt[CKPT_MSG_LEN];
+	char msg[CKPT_MSG_LEN];
+	int msglen;
+	struct mutex msg_mutex;
+};
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_TYPES_H_ */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index eb9800f..e04117a 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -58,4 +58,7 @@
 #define DEVPTS_SUPER_MAGIC	0x1cd1
 #define SOCKFS_MAGIC		0x534F434B
 
+#define CHECKPOINT_MAGIC_HEAD  0x00feed0cc0a2d200LL
+#define CHECKPOINT_MAGIC_TAIL  0x002d2a0cc0deef00LL
+
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d1d1703..057929b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -834,10 +834,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  struct timespec __user *, const sigset_t __user *,
 			  size_t);
-asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags,
-			       int logfd);
-asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags,
-			    int logfd);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/kernel/checkpoint/Makefile b/kernel/checkpoint/Makefile
index 8a32c6f..99364cc 100644
--- a/kernel/checkpoint/Makefile
+++ b/kernel/checkpoint/Makefile
@@ -2,4 +2,8 @@
 # Makefile for linux checkpoint/restart.
 #
 
-obj-$(CONFIG_CHECKPOINT) += sys.o
+obj-$(CONFIG_CHECKPOINT) += \
+	sys.o \
+	checkpoint.o \
+	restart.o \
+	process.o
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
new file mode 100644
index 0000000..75b43e6
--- /dev/null
+++ b/kernel/checkpoint/checkpoint.c
@@ -0,0 +1,213 @@
+/*
+ *  Checkpoint logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/utsname.h>
+#include <linux/magic.h>
+#include <linux/checkpoint.h>
+
+/* unique checkpoint identifier (FIXME: should be per-container ?) */
+static atomic_t ctx_count = ATOMIC_INIT(0);
+
+/**
+ * ckpt_write_obj - write an object
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ */
+int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	return ckpt_kwrite(ctx, h, h->len);
+}
+EXPORT_SYMBOL(ckpt_write_obj);
+
+/**
+ * ckpt_write_obj_type - write an object (from a pointer)
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ * @type: desired type
+ *
+ * If @ptr is NULL, then write only the header (payload to follow)
+ */
+int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h));
+	if (!h)
+		return -ENOMEM;
+
+	h->type = type;
+	h->len = len + sizeof(*h);
+
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	ret = ckpt_kwrite(ctx, h, sizeof(*h));
+	if (ret < 0)
+		goto out;
+	if (ptr)
+		ret = ckpt_kwrite(ctx, ptr, len);
+ out:
+	_ckpt_hdr_put(ctx, h, sizeof(*h));
+	return ret;
+}
+EXPORT_SYMBOL(ckpt_write_obj_type);
+
+/**
+ * ckpt_write_buffer - write an object of type buffer
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ */
+int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(ckpt_write_buffer);
+
+/**
+ * ckpt_write_string - write an object of type string
+ * @ctx: checkpoint context
+ * @str: string pointer
+ * @len: string length
+ */
+int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
+{
+	return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
+}
+EXPORT_SYMBOL(ckpt_write_string);
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+static void fill_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	h->task_comm_len = sizeof(tsk->comm);
+	/* uts */
+	h->uts_release_len = sizeof(uts->release);
+	h->uts_version_len = sizeof(uts->version);
+	h->uts_machine_len = sizeof(uts->machine);
+}
+
+/* write the checkpoint header */
+static int checkpoint_write_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts;
+	struct timeval ktv;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (!h)
+		return -ENOMEM;
+
+	do_gettimeofday(&ktv);
+	uts = utsname();
+
+	h->magic = CHECKPOINT_MAGIC_HEAD;
+	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+	h->patch = (LINUX_VERSION_CODE) & 0xff;
+
+	h->rev = CHECKPOINT_VERSION;
+
+	h->uflags = ctx->uflags;
+	h->time = ktv.tv_sec;
+
+	fill_kernel_const(&h->constants);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	down_read(&uts_sem);
+	ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
+ up:
+	up_read(&uts_sem);
+	return ret;
+}
+
+/* write the container configuration section */
+static int checkpoint_container(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_container *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (!h)
+		return -ENOMEM;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/* write the checkpoint trailer */
+static int checkpoint_write_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (!h)
+		return -ENOMEM;
+
+	h->magic = CHECKPOINT_MAGIC_TAIL;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long ret;
+
+	ret = checkpoint_write_header(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_container(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_task(ctx, current);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_write_tail(ctx);
+	if (ret < 0)
+		goto out;
+
+	/* on success, return (unique) checkpoint identifier */
+	ctx->crid = atomic_inc_return(&ctx_count);
+	ret = ctx->crid;
+ out:
+	return ret;
+}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
new file mode 100644
index 0000000..abd9025
--- /dev/null
+++ b/kernel/checkpoint/process.c
@@ -0,0 +1,101 @@
+/*
+ *  Checkpoint task structure
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/* dump the task_struct of a given task */
+static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (!h)
+		return -ENOMEM;
+
+	h->state = t->state;
+	h->exit_state = t->exit_state;
+	h->exit_code = t->exit_code;
+	h->exit_signal = t->exit_signal;
+
+	h->set_child_tid = (unsigned long) t->set_child_tid;
+	h->clear_child_tid = (unsigned long) t->clear_child_tid;
+
+	/* FIXME: save remaining relevant task_struct fields */
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
+}
+
+/* dump the entire state of a given task */
+int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int ret;
+
+	ctx->tsk = t;
+
+	ret = checkpoint_task_struct(ctx, t);
+	ckpt_debug("task %d\n", ret);
+
+	ctx->tsk = NULL;
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+/* read the task_struct into the current task */
+static int restore_task_struct(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task *h;
+	struct task_struct *t = current;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	memset(t->comm, 0, TASK_COMM_LEN);
+	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
+	if (ret < 0)
+		goto out;
+
+	t->set_child_tid = (int __user *) (unsigned long) h->set_child_tid;
+	t->clear_child_tid = (int __user *) (unsigned long) h->clear_child_tid;
+
+	/* FIXME: restore remaining relevant task_struct fields */
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the entire state of the current task */
+int restore_task(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	ret = restore_task_struct(ctx);
+	ckpt_debug("task %d\n", ret);
+
+	return ret;
+}
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
new file mode 100644
index 0000000..cd9945c
--- /dev/null
+++ b/kernel/checkpoint/restart.c
@@ -0,0 +1,460 @@
+/*
+ *  Restart logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/utsname.h>
+#include <linux/checkpoint.h>
+
+static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	char *ptr;
+	int len, ret;
+
+	len = h->len - sizeof(*h);
+	ptr = kzalloc(len + 1, GFP_KERNEL);
+	if (!ptr) {
+		ckpt_debug("insufficient memory to report image error\n");
+		return -ENOMEM;
+	}
+
+	ret = ckpt_kread(ctx, ptr, len);
+	if (ret >= 0) {
+		ckpt_debug("%s\n", &ptr[1]);
+		ret = -EIO;
+	}
+
+	kfree(ptr);
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ * @ptr: desired buffer
+ * @len: desired object length (if 0, flexible)
+ * @max: maximum object length (if 0, flexible)
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
+			  void *ptr, int len, int max)
+{
+	int ret;
+
+ again:
+	ret = ckpt_kread(ctx, h, sizeof(*h));
+	if (ret < 0)
+		return ret;
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    h->type, h->len, len, max);
+	if (h->len < sizeof(*h))
+		return -EINVAL;
+
+	if (h->type == CKPT_HDR_ERROR) {
+		ret = _ckpt_read_err(ctx, h);
+		if (ret < 0)
+			return ret;
+		goto again;
+	}
+
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && h->len != len) || (!len && max && h->len > max))
+		return -EINVAL;
+
+	if (ptr)
+		ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj_type - read an object of some type
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ * @type: buffer type
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: actual _payload_ length
+ */
+int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr h;
+	int ret;
+
+	if (len)
+		len += sizeof(struct ckpt_hdr);
+	ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
+	if (ret < 0)
+		return ret;
+	if (h.type != type)
+		return -EINVAL;
+	return h.len - sizeof(h);
+}
+EXPORT_SYMBOL(_ckpt_read_obj_type);
+
+/**
+ * _ckpt_read_buffer - read an object of type buffer (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: _payload_ length.
+ */
+int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	BUG_ON(!len);
+	return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(_ckpt_read_buffer);
+
+/**
+ * _ckpt_read_string - read an object of type string (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: string length (including '\0')
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	int ret;
+
+	BUG_ON(!len);
+	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
+	if (ret < 0)
+		return ret;
+	if (ptr)
+		((char *) ptr)[len - 1] = '\0';	/* always play it safe */
+	return 0;
+}
+EXPORT_SYMBOL(_ckpt_read_string);
+
+/**
+ * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ * @len: desired total length (if 0, flexible)
+ * @max: maximum total length
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+{
+	struct ckpt_hdr hh;
+	struct ckpt_hdr *h;
+	int ret;
+
+	ret = ckpt_kread(ctx, &hh, sizeof(hh));
+	if (ret < 0)
+		return ERR_PTR(ret);
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    hh.type, hh.len, len, max);
+	if (hh.len < sizeof(*h))
+		return ERR_PTR(-EINVAL);
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && hh.len != len) || (!len && max && hh.len > max))
+		return ERR_PTR(-EINVAL);
+
+	h = ckpt_hdr_get(ctx, hh.len);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	*h = hh;	/* yay ! */
+
+	ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
+	if (ret < 0) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(ret);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_obj_type - allocate and read an object of some type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	BUG_ON(!len);
+
+	h = ckpt_read_obj(ctx, len, len);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+EXPORT_SYMBOL(ckpt_read_obj_type);
+
+/**
+ * ckpt_read_buf_type - allocate and read an object of some type (flxible)
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @type: desired object type
+ *
+ * This differs from ckpt_read_obj_type() in that the length of the
+ * incoming object is flexible (up to the maximum specified by @max;
+ * unlimited if @max is 0), as determined by the ckpt_hdr data.
+ *
+ * NOTE: for symmetry with checkpoint, @max is the maximum _payload_
+ * size, excluding the header.
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type)
+{
+	struct ckpt_hdr *h;
+
+	if (max)
+		max += sizeof(struct ckpt_hdr);
+
+	h = ckpt_read_obj(ctx, 0, max);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+EXPORT_SYMBOL(ckpt_read_buf_type);
+
+/**
+ * ckpt_read_payload - allocate and read the payload of an object
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @str: pointer to buffer to be allocated (caller must free)
+ * @type: desired object type
+ *
+ * This can be used to read a variable-length _payload_ from the checkpoint
+ * stream. @max limits the size of the resulting buffer.
+ *
+ * Return: actual _payload_ length
+ */
+int ckpt_read_payload(struct ckpt_ctx *ctx, void **ptr, int max, int type)
+{
+	int len, ret;
+
+	len = _ckpt_read_obj_type(ctx, NULL, 0, type);
+	if (len < 0)
+		return len;
+	else if (len > max)
+		return -EINVAL;
+
+	*ptr = kmalloc(len, GFP_KERNEL);
+	if (!*ptr)
+		return -ENOMEM;
+
+	ret = ckpt_kread(ctx, *ptr, len);
+	if (ret < 0) {
+		kfree(*ptr);
+		return ret;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL(ckpt_read_payload);
+
+/**
+ * ckpt_read_string - allocate and read a string (variable length)
+ * @ctx: checkpoint context
+ * @max: maximum acceptable length
+ *
+ * Return: allocate string or error pointer
+ */
+char *ckpt_read_string(struct ckpt_ctx *ctx, int max)
+{
+	char *str;
+	int len;
+
+	len = ckpt_read_payload(ctx, (void **)&str, max, CKPT_HDR_STRING);
+	if (len < 0)
+		return ERR_PTR(len);
+	str[len - 1] = '\0';	/* always play it safe */
+	return str;
+}
+EXPORT_SYMBOL(ckpt_read_string);
+
+/**
+ * ckpt_read_consume - consume the next object of expected type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * This can be used to skip an object in the input stream when the
+ * data is unnecessary for the restart. @len indicates the length of
+ * the object); if @len is zero the length is unconstrained.
+ */
+int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret = 0;
+
+	h = ckpt_read_obj(ctx, len, 0);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->type != type)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+EXPORT_SYMBOL(ckpt_read_consume);
+
+/***********************************************************************
+ * Restart
+ */
+
+static int check_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	if (h->task_comm_len != sizeof(tsk->comm))
+		return -EINVAL;
+	/* uts */
+	if (h->uts_release_len != sizeof(uts->release))
+		return -EINVAL;
+	if (h->uts_version_len != sizeof(uts->version))
+		return -EINVAL;
+	if (h->uts_machine_len != sizeof(uts->machine))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* read the checkpoint header */
+static int restore_read_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts = NULL;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
+	    h->rev != CHECKPOINT_VERSION ||
+	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+	    h->patch != ((LINUX_VERSION_CODE) & 0xff)) {
+		ckpt_err(ctx, ret, "incompatible kernel version");
+		goto out;
+	}
+	if (h->uflags) {
+		ckpt_err(ctx, ret, "incompatible restart user flags");
+		goto out;
+	}
+
+	ret = check_kernel_const(&h->constants);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "incompatible kernel constants");
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	uts = kmalloc(sizeof(*uts), GFP_KERNEL);
+	if (!uts)
+		goto out;
+
+	ctx->oflags = h->uflags;
+
+	/* FIX: verify compatibility of release, version and machine */
+	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+ out:
+	kfree(uts);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the container configuration section */
+static int restore_container(struct ckpt_ctx *ctx)
+{
+	int ret = 0;
+	struct ckpt_hdr_container *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/* read the checkpoint trailer */
+static int restore_read_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->magic != CHECKPOINT_MAGIC_TAIL)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+long do_restart(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long ret;
+
+	ret = restore_read_header(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_container(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_task(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_read_tail(ctx);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
index a81750a..af8c1bf 100644
--- a/kernel/checkpoint/sys.c
+++ b/kernel/checkpoint/sys.c
@@ -8,12 +8,398 @@
  *  distribution for more details.
  */
 
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
 #include <linux/sched.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/checkpoint.h>
+
+/*
+ * Helpers to write(read) from(to) kernel space to(from) the checkpoint
+ * image file descriptor (similar to how a core-dump is performed).
+ *
+ *   _ckpt_kwrite() - write a kernel-space buffer to a file
+ *   _ckpt_kread() - read from a file to a kernel-space buffer
+ *
+ *   ckpt_kread() - read from the checkpoint image to a kernel-space buffer
+ *   ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
+ *
+ * They latter two succeed only if the entire read or write succeeds,
+ * and return 0, or negative error otherwise.
+ */
+
+static ssize_t _ckpt_kwrite(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	int ret;
+
+	pos = file_pos_read(file);
+	ret = kernel_write(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/* returns 0 on success */
+int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	int ret;
+
+	ret = _ckpt_kwrite(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+
+	ctx->total += count;
+	return 0;
+}
+
+static ssize_t _ckpt_kread(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	int ret;
+
+	pos = file_pos_read(file);
+	ret = kernel_read(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/* returns 0 on success */
+int ckpt_kread(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	int ret;
+
+	ret = _ckpt_kread(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+	if (ret != count)
+		return -EPIPE;
+
+	ctx->total += count;
+	return 0;
+}
+
+/**
+ * ckpt_hdr_get - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: desired length
+ *
+ * Returns pointer to header
+ */
+void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
+{
+	return kzalloc(len, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ckpt_hdr_get);
+
+/**
+ * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ * @len: header length
+ *
+ * (requiring 'ptr' makes it easily interchangable with kmalloc/kfree
+ */
+void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	kfree(ptr);
+}
+EXPORT_SYMBOL(_ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ *
+ * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
+ */
+void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
+	_ckpt_hdr_put(ctx, ptr, h->len);
+}
+EXPORT_SYMBOL(ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_get_type - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: number of bytes to reserve
+ *
+ * Returns pointer to reserved space on hbuf
+ */
+void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	h = ckpt_hdr_get(ctx, len);
+	if (!h)
+		return NULL;
+
+	h->type = type;
+	h->len = len;
+	return h;
+}
+EXPORT_SYMBOL(ckpt_hdr_get_type);
+
+/*
+ * Helpers to manage c/r contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+static void ckpt_ctx_free(struct ckpt_ctx *ctx)
+{
+	if (ctx->file)
+		fput(ctx->file);
+	if (ctx->logfile)
+		fput(ctx->logfile);
+	kfree(ctx);
+}
+
+static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
+				       unsigned long kflags, int logfd)
+{
+	struct ckpt_ctx *ctx;
+	int err;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->uflags = uflags;
+	ctx->kflags = kflags;
+
+	mutex_init(&ctx->msg_mutex);
+
+	err = -EBADF;
+	ctx->file = fget(fd);
+	if (!ctx->file)
+		goto err;
+	if (logfd == CHECKPOINT_FD_NONE)
+		goto nolog;
+	ctx->logfile = fget(logfd);
+	if (!ctx->logfile)
+		goto err;
+ nolog:
+	return ctx;
+ err:
+	ckpt_ctx_free(ctx);
+	return ERR_PTR(err);
+}
+
+static void ckpt_set_error(struct ckpt_ctx *ctx, int err)
+{
+	ctx->errno = err;
+}
+
+/* helpers to handler log/dbg/err messages */
+void ckpt_msg_lock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_lock(&ctx->msg_mutex);
+	ctx->msg[0] = '\0';
+	ctx->msglen = 1;
+}
+
+void ckpt_msg_unlock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_unlock(&ctx->msg_mutex);
+}
+
+static inline int is_special_flag(char *s)
+{
+	if (*s == '%' && s[1] == '(' && s[2] != '\0' && s[3] == ')')
+		return 1;
+	return 0;
+}
+
+/*
+ * _ckpt_generate_fmt - handle the special flags in the enhanced format
+ * strings used by checkpoint/restart error messages.
+ * @ctx: checkpoint context
+ * @fmt: message format
+ *
+ * The special flags are surrounded by %() to help them visually stand
+ * out.  For instance, %(O) means an objref.  The following special
+ * flags are recognized:
+ *	O: objref
+ *	P: pointer
+ *	T: task
+ *	S: string
+ *	V: variable
+ *
+ * %(O) will be expanded to "[obj %d]".  Likewise P, S, and V, will
+ * also expand to format flags requiring an argument to the subsequent
+ * sprintf or printk.  T will be expanded to a string with no flags,
+ * requiring no further arguments.
+ *
+ * These do not accept any extra flags (i.e. min field width, precision,
+ * etc).
+ *
+ * The caller of ckpt_err() and _ckpt_err() must provide
+ * the additional variabes, in order, to match the @fmt (except for
+ * the T key), e.g.:
+ *
+ *	ckpt_err(ctx, err, "%(T)FILE flags %d %(O)\n", flags, objref);
+ *
+ * May be called under spinlock.
+ * Must be called with ctx->msg_mutex held.  The expanded format
+ * will be placed in ctx->fmt.
+ */
+static void _ckpt_generate_fmt(struct ckpt_ctx *ctx, char *fmt)
+{
+	char *s = ctx->fmt;
+	int len = 0;
+
+	for (; *fmt && len < CKPT_MSG_LEN; fmt++) {
+		if (!is_special_flag(fmt)) {
+			s[len++] = *fmt;
+			continue;
+		}
+		switch (fmt[2]) {
+		case 'O':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[obj %%d]");
+			break;
+		case 'P':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[ptr %%p]");
+			break;
+		case 'V':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[sym %%pS]");
+			break;
+		case 'S':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[str %%s]");
+			break;
+		case 'T':
+			if (ctx->tsk)
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid %d tsk %s]",
+					task_pid_vnr(ctx->tsk), ctx->tsk->comm);
+			else
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid -1 tsk NULL]");
+			break;
+		default:
+			printk(KERN_ERR "c/r: bad format specifier %c\n",
+					fmt[2]);
+			BUG();
+		}
+		fmt += 3;
+	}
+	if (len == CKPT_MSG_LEN)
+		s[CKPT_MSG_LEN-1] = '\0';
+	else
+		s[len] = '\0';
+}
+
+static void _ckpt_msg_appendv(struct ckpt_ctx *ctx, int err, char *fmt,
+				va_list ap)
+{
+	int len = ctx->msglen;
+
+	if (err) {
+		len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[err %d]",
+				 err);
+		if (len > CKPT_MSG_LEN)
+			goto full;
+	}
+
+	len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[pos %lld]",
+			ctx->total);
+	len += vsnprintf(&ctx->msg[len], CKPT_MSG_LEN-len, fmt, ap);
+	if (len > CKPT_MSG_LEN) {
+full:
+		len = CKPT_MSG_LEN;
+		ctx->msg[CKPT_MSG_LEN-1] = '\0';
+	}
+	ctx->msglen = len;
+}
+
+void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	_ckpt_msg_appendv(ctx, 0, fmt, ap);
+	va_end(ap);
+}
+
+void _ckpt_msg_complete(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	/* Don't write an empty or uninitialized msg */
+	if (ctx->msglen <= 1)
+		return;
+
+	if (ctx->kflags & CKPT_CTX_CHECKPOINT && ctx->errno) {
+		ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
+		if (!ret)
+			ret = ckpt_write_string(ctx, ctx->msg, ctx->msglen);
+		if (ret < 0)
+			printk(KERN_NOTICE "c/r: error string unsaved (%d): %s\n",
+			       ret, ctx->msg+1);
+	}
+
+	if (ctx->logfile) {
+		struct file *logfile = ctx->logfile;
+		loff_t pos = file_pos_read(logfile);
+		ret = kernel_write(logfile, pos, ctx->msg+1, ctx->msglen-1);
+		if (ret > 0)
+			file_pos_write(logfile, pos + ret);
+	}
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	printk(KERN_DEBUG "%s", ctx->msg+1);
+#endif
+
+	ctx->msglen = 0;
+}
+
+#define __do_ckpt_msg(ctx, err, fmt) do {		\
+	va_list ap;					\
+	_ckpt_generate_fmt(ctx, fmt);			\
+	va_start(ap, fmt);				\
+	_ckpt_msg_appendv(ctx, err, ctx->fmt, ap);	\
+	va_end(ap);					\
+} while (0)
+
+void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	__do_ckpt_msg(ctx, err, fmt);
+}
+
+void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	if (!ctx)
+		return;
+
+	ckpt_msg_lock(ctx);
+	__do_ckpt_msg(ctx, err, fmt);
+	_ckpt_msg_complete(ctx);
+	ckpt_msg_unlock(ctx);
+
+	if (err)
+		ckpt_set_error(ctx, err);
+}
+EXPORT_SYMBOL(do_ckpt_msg);
+
+/* checkpoint/restart syscalls */
 
 /**
- * sys_checkpoint - checkpoint a container
+ * do_sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
  * @fd: file to which dump the checkpoint image
  * @flags: checkpoint operation flags
@@ -22,14 +408,32 @@
  * Returns positive identifier on success, 0 when returning from restart
  * or negative value on error
  */
-SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
-		unsigned long, flags, int, logfd)
+long do_sys_checkpoint(pid_t pid, int fd, unsigned long flags, int logfd)
 {
-	return -ENOSYS;
+	struct ckpt_ctx *ctx;
+	long ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	if (pid == 0)
+		pid = task_pid_vnr(current);
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_checkpoint(ctx, pid);
+
+	if (!ret)
+		ret = ctx->crid;
+
+	ckpt_ctx_free(ctx);
+	return ret;
 }
 
 /**
- * sys_restart - restart a container
+ * do_sys_restart - restart a container
  * @pid: pid of task root (in coordinator's namespace), or 0
  * @fd: file from which read the checkpoint image
  * @flags: restart operation flags
@@ -38,8 +442,49 @@ SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
  * Returns negative value on error, or otherwise returns in the realm
  * of the original checkpoint
  */
-SYSCALL_DEFINE4(restart, pid_t, pid, int, fd,
-		unsigned long, flags, int, logfd)
+long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
+{
+	struct ckpt_ctx *ctx = NULL;
+	long ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_restart(ctx, pid);
+
+	/* restart(2) isn't idempotent: can't restart syscall */
+	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
+		ret = -EINTR;
+
+	ckpt_ctx_free(ctx);
+	return ret;
+}
+
+
+/* 'ckpt_debug_level' controls the verbosity level of c/r code */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/* FIX: allow to change during runtime */
+unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
+EXPORT_SYMBOL(ckpt_debug_level);
+
+static __init int ckpt_debug_setup(char *s)
 {
-	return -ENOSYS;
+	long val, ret;
+
+	ret = strict_strtoul(s, 10, &val);
+	if (ret < 0)
+		return ret;
+	ckpt_debug_level = val;
+	return 0;
 }
+
+__setup("ckpt_debug=", ckpt_debug_setup);
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 935248b..75d413e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1086,6 +1086,19 @@ config DMA_API_DEBUG
 	  This option causes a performance degredation.  Use only if you want
 	  to debug device drivers. If unsure, say N.
 
+config CHECKPOINT_DEBUG
+	bool "Checkpoint/restart debugging (EXPERIMENTAL)"
+	depends on CHECKPOINT
+	default y
+	help
+	  This options turns on the debugging output of checkpoint/restart.
+	  The level of verbosity is controlled by 'ckpt_debug_level' and can
+	  be set at boot time with "ckpt_debug=" option.
+
+	  Turning this option off will reduce the size of the c/r code. If
+	  turned on, it is unlikely to incur visible overhead if the debug
+	  level is set to zero.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
-- 
1.6.3.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v21 020/100] c/r: documentation
From: Oren Laadan @ 2010-05-01 14:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: containers, linux-kernel, Serge Hallyn, Matt Helsley,
	Pavel Emelyanov, Oren Laadan, linux-api, linux-mm, linux-fsdevel,
	netdev, Dave Hansen
In-Reply-To: <1272723382-19470-1-git-send-email-orenl@cs.columbia.edu>

Covers application checkpoint/restart, overall design, interfaces,
usage, shared objects, and and checkpoint image format.

Changelog[v19-rc1]:
  - Update documentation and examples for new syscalls API
  - [Liu Alexander] Fix typos
  - [Serge Hallyn] Update checkpoint image format
Changelog[v16]:
  - Update documentation
  - Unify into readme.txt and usage.txt
Changelog[v14]:
  - Discard the 'h.parent' field
  - New image format (shared objects appear before they are referenced
    unless they are compound)
Changelog[v8]:
  - Split into multiple files in Documentation/checkpoint/...
  - Extend documentation, fix typos and comments from feedback

Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
 Documentation/checkpoint/checkpoint.c      |   38 +++
 Documentation/checkpoint/readme.txt        |  370 ++++++++++++++++++++++++++++
 Documentation/checkpoint/self_checkpoint.c |   69 +++++
 Documentation/checkpoint/self_restart.c    |   40 +++
 Documentation/checkpoint/usage.txt         |  247 +++++++++++++++++++
 5 files changed, 764 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/checkpoint/checkpoint.c
 create mode 100644 Documentation/checkpoint/readme.txt
 create mode 100644 Documentation/checkpoint/self_checkpoint.c
 create mode 100644 Documentation/checkpoint/self_restart.c
 create mode 100644 Documentation/checkpoint/usage.txt

diff --git a/Documentation/checkpoint/checkpoint.c b/Documentation/checkpoint/checkpoint.c
new file mode 100644
index 0000000..8560f30
--- /dev/null
+++ b/Documentation/checkpoint/checkpoint.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_checkpoint, pid, fd, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t pid;
+	int ret;
+
+	if (argc != 2) {
+		printf("usage: ckpt PID\n");
+		exit(1);
+	}
+
+	pid = atoi(argv[1]);
+	if (pid <= 0) {
+		printf("invalid pid\n");
+		exit(1);
+	}
+
+	ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+
+	if (ret < 0)
+		perror("checkpoint");
+	else
+		printf("checkpoint id %d\n", ret);
+
+	return (ret > 0 ? 0 : 1);
+}
diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
new file mode 100644
index 0000000..4fa5560
--- /dev/null
+++ b/Documentation/checkpoint/readme.txt
@@ -0,0 +1,370 @@
+
+	      Checkpoint-Restart support in the Linux kernel
+	==========================================================
+
+Copyright (C) 2008-2010 Oren Laadan
+
+Author:		Oren Laadan <orenl@cs.columbia.edu>
+
+License:	The GNU Free Documentation License, Version 1.2
+		(dual licensed under the GPL v2)
+
+Contributors:	Oren Laadan <orenl@cs.columbia.edu>
+		Serge Hallyn <serue@us.ibm.com>
+		Dan Smith <danms@us.ibm.com>
+		Matt Helsley <matthltc@us.ibm.com>
+		Nathan Lynch <ntl@pobox.com>
+		Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
+		Dave Hansen <dave@linux.vnet.ibm.com>
+
+
+Introduction
+============
+
+Application checkpoint/restart [C/R] is the ability to save the state
+of a running application so that it can later resume its execution
+from the time at which it was checkpointed. An application can be
+migrated by checkpointing it on one machine and restarting it on
+another. C/R can provide many potential benefits:
+
+* Failure recovery: by rolling back to a previous checkpoint
+
+* Improved response time: by restarting applications from checkpoints
+  instead of from scratch.
+
+* Improved system utilization: by suspending long running CPU
+  intensive jobs and resuming them when load decreases.
+
+* Fault resilience: by migrating applications off faulty hosts.
+
+* Dynamic load balancing: by migrating applications to less loaded
+  hosts.
+
+* Improved service availability and administration: by migrating
+  applications before host maintenance so that they continue to run
+  with minimal downtime
+
+* Time-travel: by taking periodic checkpoints and restarting from
+  any previous checkpoint.
+
+Compared to hypervisor approaches, application C/R is more lightweight
+since it need only save the state associated with applications, while
+operating system data structures (e.g. buffer cache, drivers state
+and the like) are uninteresting.
+
+
+Overall design
+==============
+
+Checkpoint and restart are done in the kernel as much as possible.
+Two new system calls are introduced to provide C/R: sys_checkpoint()
+and sys_restart(). They both operate on a process tree (hierarchy),
+either a whole container or a subtree of a container.
+
+Checkpointing entire containers ensures that there are no dependencies
+on anything outside the container, which guarantees that a matching
+restart will succeed (assuming that the file system state remains
+consistent). However, it requires that users will always run the tasks
+that they wish to checkpoint inside containers. This is ideal for,
+e.g., private virtual servers and the like.
+
+In contrast, when checkpointing a subtree of a container it is up to
+the user to ensure that dependencies either don't exist or can be
+safely ignored. This is useful, for instance, for HPC scenarios or
+even a user that would like to periodically checkpoint a long-running
+batch job.
+
+An additional system call, a la madvise(), is planned, so that tasks
+can advise the kernel how to handle specific resources. For instance,
+a task could ask to skip a memory area at checkpoint to save space,
+or to use a preset file descriptor at restart instead of restoring it
+from the checkpoint image. It will provide the flexibility that is
+particularly useful to address the needs of a diverse crowd of users
+and use-cases.
+
+Syscall sys_checkpoint() is given a pid that indicates the top of the
+hierarchy, a file descriptor to store the image, and flags. The code
+serializes internal user- and kernel-state and writes it out to the
+file descriptor. The resulting image is stream-able. The processes are
+expected to be frozen for the duration of the checkpoint.
+
+In general, a checkpoint consists of 5 steps:
+1. Pre-dump
+2. Freeze the container/subtree
+3. Save tasks' and kernel state		<-- sys_checkpoint()
+4. Thaw (or kill) the container/subtree
+5. Post-dump
+
+Step 3 is done by calling sys_checkpoint(). Steps 1 and 5 are an
+optimization to reduce application downtime. In particular, "pre-dump"
+works before freezing the container, e.g. the pre-copy for live
+migration, and "post-dump" works after the container resumes
+execution, e.g. write-back the data to secondary storage.
+
+The kernel exports a relatively opaque 'blob' of data to userspace
+which can then be handed to the new kernel at restart time.  The
+'blob' contains data and state of select portions of kernel structures
+such as VMAs and mm_structs, as well as copies of the actual memory
+that the tasks use. Any changes in this blob's format between kernel
+revisions can be handled by an in-userspace conversion program.
+
+To restart, userspace first create a process hierarchy that matches
+that of the checkpoint, and each task calls sys_restart(). The syscall
+reads the saved kernel state from a file descriptor, and re-creates
+the resources that the tasks need to resume execution. The restart
+code is executed by each task that is restored in the new hierarchy to
+reconstruct its own state.
+
+In general, a restart consists of 3 steps:
+1. Create hierarchy
+2. Restore tasks' and kernel state	<-- sys_restart()
+3. Resume userspace (or freeze tasks)
+
+Because the process hierarchy, during restart in created in userspace,
+the restarting tasks have the flexibility to prepare before calling
+sys_restart().
+
+
+Checkpoint image format
+=======================
+
+The checkpoint image format is built of records that consist of a
+pre-header identifying its contents, followed by a payload. This
+format allow userspace tools to easily parse and skip through the
+image without requiring intimate knowledge of the data. It will also
+be handy to enable parallel checkpointing in the future where multiple
+threads interleave data from multiple processes into a single stream.
+
+The pre-header is defined by 'struct ckpt_hdr' as follows: @type
+identifies the type of the payload, @len tells its length in bytes
+including the pre-header.
+
+struct ckpt_hdr {
+	__s32 type;
+	__s32 len;
+};
+
+The pre-header must be the first component in all other headers. For
+instance, the task data is saved in 'struct ckpt_hdr_task', which
+looks something like this:
+
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 pid;
+	...
+};
+
+THE IMAGE FORMAT IS EXPECTED TO CHANGE over time as more features are
+supported, or as existing features change in the kernel and require to
+adjust their representation. Any such changes will be be handled by
+in-userspace conversion tools.
+
+The general format of the checkpoint image is as follows:
+* Image header
+* Container configuration
+* Task hierarchy
+* Tasks' state
+* Image trailer
+
+The image always begins with a general header that holds a magic
+number, an architecture identifier (little endian format), a format
+version number (@rev), followed by information about the kernel
+(currently version and UTS data). It also holds the time of the
+checkpoint and the flags given to sys_checkpoint(). This header is
+followed by an arch-specific header.
+
+The container configuration section containers information that is
+global to the container. Security (LSM) configuration is one example.
+Network configuration and container-wide mounts may also go here, so
+that the userspace restart coordinator can re-create a suitable
+environment.
+
+The task hierarchy comes next so that userspace tools can read it
+early (even from a stream) and re-create the restarting tasks. This is
+basically an array of all checkpointed tasks, and their relationships
+(parent, siblings, threads, etc).
+
+Then the state of all tasks is saved, in the order that they appear in
+the tasks array above. For each state, we save data like task_struct,
+namespaces, open files, memory layout, memory contents, cpu state,
+signals and signal handlers, etc. For resources that are shared among
+multiple processes, we first checkpoint said resource (and only once),
+and in the task data we give a reference to it. More about shared
+resources below.
+
+Finally, the image always ends with a trailer that holds a (different)
+magic number, serving for sanity check.
+
+
+Shared objects
+==============
+
+Many resources may be shared by multiple tasks (e.g. file descriptors,
+memory address space, etc), or even have multiple references from
+other resources (e.g. a single inode that represents two ends of a
+pipe).
+
+Shared objects are tracked using a hash table (objhash) to ensure that
+they are only checkpointed or restored once. To handle a shared
+object, it is first looked up in the hash table, to determine if is
+the first encounter or a recurring appearance.  The hash table itself
+is not saved as part of the checkpoint image: it is constructed
+dynamically during both checkpoint and restart, and discarded at the
+end of the operation.
+
+During checkpoint, when a shared object is encountered for the first
+time, it is inserted to the hash table, indexed by its kernel address.
+It is assigned an identifier (@objref) in order of appearance, and
+then its state is saved. Subsequent lookups of that object in the hash
+will yield that entry, in which case only the @objref is saved, as
+opposed the entire state of the object.
+
+During restart, shared objects are indexed by their @objref as given
+during the checkpoint. On the first appearance of each shared object,
+a new resource will be created and its state restored from the image.
+Then the object is added to the hash table. Subsequent lookups of the
+same unique identifier in the hash table will yield that entry, and
+then the existing object instance is reused instead of creating
+a new one.
+
+The hash grabs a reference to each object that is inserted, and
+maintains this reference for the entire lifetime of the hash. Thus,
+it is always safe to reference an object that is stored in the hash.
+The hash is "one-way" in the sense that objects that are added are
+never deleted from the hash until the hash is discarded. This, in
+turn, happens only when the checkpoint (or restart) terminates.
+
+Shared objects are thus saved when they are first seen, and _before_
+the parent object that uses them. Therefore by the time the parent
+objects needs them, they should already be in the objhash. The one
+exception is when more than a single shared resource will be restarted
+at once (e.g. like the two ends of a pipe, or all the namespaces in an
+nsproxy). In this case the parent object is dumped first followed by
+the individual sub-resources).
+
+The checkpoint image is stream-able, meaning that restarting from it
+may not require lseek(). This is enforced at checkpoint time, by
+carefully selecting the order of shared objects, to respect the rule
+that an object is always saved before the objects that refers to it.
+
+
+Memory contents format
+======================
+
+The memory contents of a given memory address space (->mm) is dumped
+as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
+This header details the vma properties, and a reference to a file
+(if file backed) or an inode (or shared memory) object.
+
+The vma header is followed by the actual contents - but only those
+pages that need to be saved, i.e. dirty pages. They are written in
+chunks of data, where each chunks contains a header that indicates
+that number of pages in the chunk, followed by an array of virtual
+addresses and then an array of actual page contents. The last chunk
+holds zero pages.
+
+To illustrate this, consider a single simple task with two vmas: one
+is file mapped with two dumped pages, and the other is anonymous with
+three dumped pages. The memory dump will look like this:
+
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 2)
+			addr1, addr2
+			page1, page2
+		ckpt_hdr_pgarr (nr_pages = 0)
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 3)
+		addr3, addr4, addr5
+		page3, page4, page5
+		ckpt_hdr_pgarr (nr_pages = 0)
+
+
+Error handling
+==============
+
+Both checkpoint and restart operations may fail due to a variety of
+reasons. Using a simple, single return value from the system call is
+insufficient to report the reason of a failure.
+
+Instead, both sys_checkpoint() and sys_restart() accept an additional
+argument - a file descriptor to which the kernel writes diagnostic
+and debugging information. Both the checkpoint and restart userspace
+utilities have options to specify a filename to store this log.
+
+In addition, checkpoint provides informative status report upon
+failure in the checkpoint image in the form of (one or more) error
+objects, 'struct ckpt_hdr_err'.  An error objects consists of a
+mandatory pre-header followed by a null character ('\0'), and then a
+string that describes the error. By default, if an error occurs, this
+will be the last object written to the checkpoint image.
+
+Upon failure, the caller can examine the image (e.g. with 'ckptinfo')
+and extract the detailed error message. The leading '\0' is useful if
+one wants to seek back from the end of the checkpoint image, instead
+of parsing the entire image separately.
+
+
+Security
+========
+
+The main question is whether sys_checkpoint() and sys_restart()
+require privileged or unprivileged operation.
+
+Early versions checked capable(CAP_SYS_ADMIN) assuming that we would
+attempt to remove the need for privilege, so that all users could
+safely use it. Arnd Bergmann pointed out that it'd make more sense to
+let unprivileged users use them now, so that we'll be more careful
+about the security as patches roll in.
+
+Checkpoint: the main concern is whether a task that performs the
+checkpoint of another task has sufficient privileges to access its
+state. We address this by requiring that the checkpointer task will be
+able to ptrace the target task, by means of ptrace_may_access() with
+access mode.
+
+Restart: the main concern is that we may allow an unprivileged user to
+feed the kernel with random data. To this end, the restart works in a
+way that does not skip the usual security checks. Task credentials,
+i.e. euid, reuid, and LSM security contexts currently come from the
+caller, not the checkpoint image.  As credentials are restored too,
+the ability of a task that calls sys_restore() to setresuid/setresgid
+to those values must be checked.
+
+Keeping the restart procedure to operate within the limits of the
+caller's credentials means that there various scenarios that cannot
+be supported. For instance, a setuid program that opened a protected
+log file and then dropped privileges will fail the restart, because
+the user won't have enough credentials to reopen the file. In these
+cases, we should probably treat restarting like inserting a kernel
+module: surely the user can cause havoc by providing incorrect data,
+but then again we must trust the root account.
+
+So that's why we don't want CAP_SYS_ADMIN required up-front. That way
+we will be forced to more carefully review each of those features.
+However, this can be controlled with a sysctl-variable.
+
+
+Kernel interfaces
+=================
+
+* To checkpoint a vma, the 'struct vm_operations_struct' needs to
+  provide a method ->checkpoint:
+    int checkpoint(struct ckpt_ctx *, struct vma_struct *)
+  Restart requires a matching (exported) restore:
+    int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
+
+* To checkpoint a file, the 'struct file_operations' needs to provide
+  the methods ->checkpoint and ->collect:
+    int checkpoint(struct ckpt_ctx *, struct file *)
+    int collect(struct ckpt_ctx *, struct file *)
+  Restart requires a matching (exported) restore:
+    int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
+  For most file systems, generic_file_{checkpoint,restore}() can be
+  used.
+
+* To checkpoint a socket, the 'struct proto_ops' needs to provide
+  the methods ->checkpoint, ->collect and ->restore:
+    int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+    int collect(struct ckpt_ctx *ctx, struct socket *sock);
+    int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
+
diff --git a/Documentation/checkpoint/self_checkpoint.c b/Documentation/checkpoint/self_checkpoint.c
new file mode 100644
index 0000000..27dba0d
--- /dev/null
+++ b/Documentation/checkpoint/self_checkpoint.c
@@ -0,0 +1,69 @@
+/*
+ *  self_checkpoint.c: demonstrate self-checkpoint
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_checkpoint, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+#define OUTFILE  "/tmp/cr-self.out"
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	FILE *file;
+	int i, ret;
+
+	close(0);
+	close(2);
+
+	unlink(OUTFILE);
+	file = fopen(OUTFILE, "w+");
+	if (!file) {
+		perror("open");
+		exit(1);
+	}
+	if (dup2(0, 2) < 0) {
+		perror("dup2");
+		exit(1);
+	}
+
+	fprintf(file, "hello, world!\n");
+	fflush(file);
+
+	for (i = 0; i < 1000; i++) {
+		sleep(1);
+		fprintf(file, "count %d\n", i);
+		fflush(file);
+
+		if (i != 2)
+			continue;
+		ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+		if (ret < 0) {
+			fprintf(file, "ckpt: %s\n", strerror(errno));
+			exit(2);
+		}
+
+		fprintf(file, "checkpoint ret: %d\n", ret);
+		fflush(file);
+	}
+
+	return 0;
+}
diff --git a/Documentation/checkpoint/self_restart.c b/Documentation/checkpoint/self_restart.c
new file mode 100644
index 0000000..647ce51
--- /dev/null
+++ b/Documentation/checkpoint/self_restart.c
@@ -0,0 +1,40 @@
+/*
+ *  self_restart.c: demonstrate self-restart
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define _GNU_SOURCE        /* or _BSD_SOURCE or _SVID_SOURCE */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int restart(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_restart, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	int ret;
+
+	ret = restart(pid, STDIN_FILENO, RESTART_TASKSELF);
+	if (ret < 0)
+		perror("restart");
+
+	printf("should not reach here !\n");
+
+	return 0;
+}
diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
new file mode 100644
index 0000000..c6fc045
--- /dev/null
+++ b/Documentation/checkpoint/usage.txt
@@ -0,0 +1,247 @@
+
+	      How to use Checkpoint-Restart
+	=========================================
+
+
+API
+===
+
+The API consists of three new system calls:
+
+* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
+
+ Checkpoint a (sub-)container whose root task is identified by @pid,
+ to the open file indicated by @fd. If @logfd isn't -1, it indicates
+ an open file to which error and debug messages are written. @flags
+ may be one or more of:
+   - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
+ (other value are not allowed).
+
+ Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
+ it returns from a restart, and -1 if an error occurs. The ckptid will
+ uniquely identify a checkpoint image, for as long as the checkpoint
+ is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
+ partial checkpoint, residing in kernel memory).
+
+* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Restart a process hierarchy from a checkpoint image that is read from
+ the blob stored in the file indicated by @fd.  If @logfd isn't -1, it
+ indicates an open file to which error and debug messages are written.
+ @flags will have future meaning (must be 0 for now). @pid indicates
+ the root of the hierarchy as seen in the coordinator's pid-namespace,
+ and is expected to be a child of the coordinator. @flags may be one
+ or more of:
+   - RESTART_TASKSELF : (self) restart of a single process
+   - RESTART_FROEZN : processes remain frozen once restart completes
+   - RESTART_GHOST : process is a ghost (placeholder for a pid)
+ (Note that this argument may mean 'ckptid' to identify an in-kernel
+ checkpoint image, with some @flags in the future).
+
+ Returns: -1 if an error occurs, 0 on success when restarting from a
+ "self" checkpoint, and return value of system call at the time of the
+ checkpoint when restarting from an "external" checkpoint.
+
+ (If a process was frozen for checkpoint while in userspace, it will
+ resume running in userspace exactly where it was interrupted. If it
+ was frozen while in kernel doing a syscall, it will return what the
+ syscall returned when interrupted/completed, and proceed from there
+ as if it had only been frozen and then thawed. Finally, if it did a
+ self-checkpoint, it will resume to the first instruction after the
+ call to checkpoint(2), having returned 0, to indicate whether the
+ return is from the checkpoint or a restart).
+
+* int clone_with_pid(unsigned long clone_flags, void *news,
+		     int *parent_tidptr, int *child_tidptr,
+		     struct target_pid_set *pid_set)
+
+  struct target_pid_set {
+	 int num_pids;
+	 pid_t *target_pids;
+  }
+
+ Container restart requires that a task have the same pid it had when
+ it was checkpointed. When containers are nested the tasks within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ clone_with_pids(), intended for use during restart, is similar to
+ clone(), except that it takes a 'target_pid_set' parameter. This
+ parameter lets caller choose specific pid numbers for the child
+ process, in the process's active and ancestor pid namespaces.
+
+ Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for
+ now, to prevent unprivileged processes from misusing this interface.
+
+ If a target-pid is 0, the kernel continues to assign a pid for the
+ process in that namespace. If a requested pid is taken, the system
+ call fails with -EBUSY. If 'pid_set.num_pids' exceeds the current
+ nesting level of pid namespaces, the system call fails with -EINVAL.
+
+
+Sysctl/proc
+===========
+
+/proc/sys/kernel/ckpt_unpriv_allowed		[default = 1]
+  controls whether c/r operation is allowed for unprivileged users
+
+
+Operation
+=========
+
+The granularity of a checkpoint usually is a process hierarchy. The
+'pid' argument is interpreted in the caller's pid namespace. So to
+checkpoint a container whose init task (pid 1 in that pidns) appears
+as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
+pid 1 will attempt to checkpoint the caller's container, and if the
+caller isn't privileged and init is owned by root, it will fail.
+
+Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
+which does not refer to a container's init task, then sys_checkpoint()
+would return -EINVAL.
+
+We assume that during checkpoint and restart the container state is
+quiescent. During checkpoint, this means that all affected tasks are
+frozen (or otherwise stopped). During restart, this means that all
+affected tasks are executing the sys_restart() call. In both cases, if
+there are other tasks possible sharing state with the container, they
+must not modify it during the operation. It is the responsibility of
+the caller to follow this requirement.
+
+If the assumption that all tasks are frozen and that there is no other
+sharing doesn't hold - then the results of the operation are undefined
+(just as, e.g. not calling execve() immediately after vfork() produces
+undefined results). In particular, either checkpoint will fail, or it
+may produce a checkpoint image that can't be restarted, or (unlikely)
+the restart may produce a container whose state does not match that of
+the original container.
+
+
+User tools
+==========
+
+* checkpoint(1): a tool to perform a checkpoint of a container/subtree
+* restart(1): a tool to restart a container/subtree
+* ckptinfo: a tool to examine a checkpoint image
+
+It is best to use the dedicated user tools for checkpoint and restart.
+
+If you insist, then here is a code snippet that illustrates how a
+checkpoint is initiated by a process inside a container - the logic is
+similar to fork():
+	...
+	ckptid = checkpoint(0, ...);
+	switch (crid) {
+	case -1:
+		perror("checkpoint failed");
+		break;
+	default:
+		fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
+		/* proceed with execution after checkpoint */
+		...
+		break;
+	case 0:
+		fprintf(stderr, "returned after restart\n");
+		/* proceed with action required following a restart */
+		...
+		break;
+	}
+	...
+
+And to initiate a restart, the process in an empty container can use
+logic similar to execve():
+	...
+	if (restart(pid, ...) < 0)
+		perror("restart failed");
+	/* only get here if restart failed */
+	...
+
+Note, that the code also supports "self" checkpoint, where a process
+can checkpoint itself. This mode does not capture the relationships of
+the task with other tasks, or any shared resources. It is useful for
+application that wish to be able to save and restore their state.
+They will either not use (or care about) shared resources, or they
+will be aware of the operations and adapt suitably after a restart.
+The code above can also be used for "self" checkpoint.
+
+
+You may find the following sample programs useful:
+
+* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
+* self_checkpoint.c: a simple test program doing self-checkpoint
+* self_restart.c: restarts a (self-) checkpoint image from stdin
+
+See also the utilities 'checkpoint' and 'restart' (from user-cr).
+
+
+"External" checkpoint
+=====================
+
+To do "external" checkpoint, you need to first freeze that other task
+either using the freezer cgroup.
+
+Restart does not preserve the original PID yet, (because we haven't
+solved yet the fork-with-specific-pid issue). In a real scenario, you
+probably want to first create a new names space, and have the init
+task there call 'sys_restart()'.
+
+I tested it this way:
+	$ ./test &
+	[1] 3493
+
+	$ echo 3493 > /cgroup/0/tasks
+	$ echo FROZEN > /cgroup/0/freezer.state
+	$ ./checkpoint 3493 > ckpt.image
+
+	$ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+	$ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+	$ echo THAWED > /cgroup/0/freezer.state
+
+	$ ./self_restart < ckpt.image
+Now compare the output of the two output files.
+
+
+"Self" checkpoint
+================
+
+To do self-checkpoint, you can incorporate the code from
+self_checkpoint.c into your application.
+
+Here is how to test the self-checkpoint:
+	$ ./self_checkpoint > self.image &
+	[1] 3512
+
+	$ sleep 3
+	$ mv /tmp/cr-self.out /tmp/cr-self.out.orig
+	$ cp /tmp/cr-self.out.orig /tmp/cr-self.out
+
+	$ cat /tmp/cr-self.out
+	hello, world!
+	count 0
+	count 1
+	count 2
+	checkpoint ret: 1
+	count 3
+	...
+
+	$ sed -i 's/count/xxxxx/g' /tmp/cr-self.out
+
+	$ ./self_restart < self.image &
+
+Now compare the output of the two output files.
+	$ cat /tmp/cr-self.out
+	hello, world!
+	xxxxx 0
+	xxxxx 1
+	xxxxx 2
+	checkpoint ret: 0
+	count 3
+	...
+
+
+Note how in test.c we close stdin, stdout, stderr - that's because
+currently we only support regular files (not ttys/ptys).
+
+If you check the output of ps, you'll see that "self_restart" changed
+its name to "test" or "self_checkpoint", as expected.
-- 
1.6.3.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH 1/3] ptp: Added a brand new class driver for ptp clocks.
From: Wolfgang Grandegger @ 2010-05-01 14:09 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev
In-Reply-To: <20100501140823.GA2779@riccoc20.at.omicron.at>

Richard Cochran wrote:
> On Fri, Apr 30, 2010 at 07:58:41PM +0200, Wolfgang Grandegger wrote:
>>>  include/linux/Kbuild             |    1 +
>>>  include/linux/ptp_clock.h        |   37 +++++
>> ptp_clock.h should probably be added to "include/linux/Kbuild".
> 
> But it already is, see the two lines above. Or did you mean something
> else?

Oops, sorry for the noise.

Wolfgang.

^ permalink raw reply

* Re: [PATCH 1/3] ptp: Added a brand new class driver for ptp clocks.
From: Richard Cochran @ 2010-05-01 14:08 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: netdev
In-Reply-To: <4BDB1A51.20505@grandegger.com>

On Fri, Apr 30, 2010 at 07:58:41PM +0200, Wolfgang Grandegger wrote:
> >  include/linux/Kbuild             |    1 +
> >  include/linux/ptp_clock.h        |   37 +++++
> 
> ptp_clock.h should probably be added to "include/linux/Kbuild".

But it already is, see the two lines above. Or did you mean something
else?

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: jamal @ 2010-05-01 13:49 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272720125.2230.178.camel@edumazet-laptop>

On Sat, 2010-05-01 at 15:22 +0200, Eric Dumazet wrote:

> You must understand that the whole 'bench' is mostly governed by
> scheduler artifacts. The regression you mention is probably a side
> effect.

likely.

> By slowing down one part, its possible to zap all calls to scheduler and
> go maybe 300% faster (Because consumer threads can avoid 3/4 of the time
> to schedule)
> 
> Reciprocally, optimizing one part of the network stack might make
> threads hitting an empty queue, and need to call more often the
> scheduler.

It is fair to say that what i am seeing is _not_ fatal because it is rps
that is regressing; non-rps is fine. I would consider non-rps to be the
common use scenario and if that was doing badly then it is a problem.
The good news is it is getting better - likely because of some changes
made on behalf of rps ;->
With rps, one could follow some instructions on how to make it better.
I am hoping that some of the system "magic" is documented as Tom
mentioned he will.

> This is why some higly specialized programs never block/schedule and
> perform busy loops instead.

Agreed. My brain cells should learn to accept this fact ;->

cheers,
jamal

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Eric Dumazet @ 2010-05-01 13:22 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272714966.14499.37.camel@bigi>

Le samedi 01 mai 2010 à 07:56 -0400, jamal a écrit :

> 
> [1]i.e with this program rps was getting worse (it was much better
> before say net-next of apr14) and that non-rps has been getting better
> numbers since. The regression is real - but it is likely in another
> subsystem.
> 

You must understand that the whole 'bench' is mostly governed by
scheduler artifacts. The regression you mention is probably a side
effect.

By slowing down one part, its possible to zap all calls to scheduler and
go maybe 300% faster (Because consumer threads can avoid 3/4 of the time
to schedule)

Reciprocally, optimizing one part of the network stack might make
threads hitting an empty queue, and need to call more often the
scheduler.

This is why some higly specialized programs never block/schedule and
perform busy loops instead.

^ permalink raw reply

* [PATCH linux-2.6.34-rc5] drivers/net/phy: micrel phy driver
From: Choi, David @ 2010-04-29 16:12 UTC (permalink / raw)
  To: netdev

To whom it may have concerned:

From: David J. Choi <david.choi@micrel.com>
Body of the explanation: This is the first version of phy driver from Micrel Inc.
Signed-off-by: David J. Choi <david.choi@micrel.com>

---
--- linux-2.6.34-rc5/drivers/net/phy/micrel.c.orig	2010-04-29 08:20:51.000000000 -0700
+++ linux-2.6.34-rc5/drivers/net/phy/micrel.c	2010-04-29 08:52:37.000000000 -0700
@@ -0,0 +1,104 @@
+/*
+ * drivers/net/phy/micrel.c
+ *
+ * Driver for Micrel PHYs
+ *
+ * Author: David J. Choi
+ *
+ * Copyright (c) 2010 Micrel, Inc.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * Support : ksz9021 , vsc8201, ks8001
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/phy.h>
+
+#define	PHY_ID_KSZ9021			0x00221611
+#define	PHY_ID_VSC8201			0x000FC413
+#define	PHY_ID_KS8001			0x0022161A
+
+
+static int kszphy_config_init(struct phy_device *phydev)
+{
+	return 0;
+}
+
+
+static struct phy_driver ks8001_driver = {
+	.phy_id		= PHY_ID_KS8001,
+	.phy_id_mask	= 0x00fffff0,
+	.features	= PHY_BASIC_FEATURES,
+	.flags		= PHY_POLL,
+	.config_init	= kszphy_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.driver		= { .owner = THIS_MODULE,},
+};
+
+static struct phy_driver vsc8201_driver = {
+	.phy_id		= PHY_ID_VSC8201,
+	.name		= "Micrel VSC8201",
+	.phy_id_mask	= 0x00fffff0,
+	.features	= PHY_BASIC_FEATURES,
+	.flags		= PHY_POLL,
+	.config_init	= kszphy_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.driver		= { .owner = THIS_MODULE,},
+};
+
+static struct phy_driver ksz9021_driver = {
+	.phy_id		= PHY_ID_KSZ9021,
+	.phy_id_mask	= 0x000fff10,
+	.name		= "Micrel KSZ9021 Gigabit PHY",
+	.features	= PHY_GBIT_FEATURES | SUPPORTED_Pause,
+	.flags		= PHY_POLL,
+	.config_init	= kszphy_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.driver		= { .owner = THIS_MODULE, },
+};
+
+static int __init ksphy_init(void)
+{
+	int ret;
+
+	ret = phy_driver_register(&ks8001_driver);
+	if (ret)
+		goto err1;
+	ret = phy_driver_register(&vsc8201_driver);
+	if (ret)
+		goto err2;
+
+	ret = phy_driver_register(&ksz9021_driver);
+	if (ret)
+		goto err3;
+	return 0;
+
+err3:
+	phy_driver_unregister(&vsc8201_driver);
+err2:
+	phy_driver_unregister(&ks8001_driver);
+err1:
+	return ret;
+}
+
+static void __exit ksphy_exit(void)
+{
+	phy_driver_unregister(&ks8001_driver);
+	phy_driver_unregister(&vsc8201_driver);
+	phy_driver_unregister(&ksz9021_driver);
+}
+
+module_init(ksphy_init);
+module_exit(ksphy_exit);
+
+MODULE_DESCRIPTION("Micrel PHY driver");
+MODULE_AUTHOR("David J. Choi");
+MODULE_LICENSE("GPL");
--- linux-2.6.34-rc5/drivers/net/phy/Kconfig.orig	2010-04-29 08:21:12.000000000 -0700
+++ linux-2.6.34-rc5/drivers/net/phy/Kconfig	2010-04-29 08:25:18.000000000 -0700
@@ -88,6 +88,11 @@ config LSI_ET1011C_PHY
 	---help---
 	  Supports the LSI ET1011C PHY.
 
+config MICREL_PHY
+	tristate "Driver for Micrel PHYs"
+	---help---
+	  Supports the KSZ9021, VSC8201, KS8001 PHYs.
+
 config FIXED_PHY
 	bool "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs"
 	depends on PHYLIB=y
--- linux-2.6.34-rc5/drivers/net/phy/Makefile.orig	2010-04-29 08:20:25.000000000 -0700
+++ linux-2.6.34-rc5/drivers/net/phy/Makefile	2010-04-29 08:31:13.000000000 -0700
@@ -20,4 +20,5 @@ obj-$(CONFIG_MDIO_BITBANG)	+= mdio-bitba
 obj-$(CONFIG_MDIO_GPIO)		+= mdio-gpio.o
 obj-$(CONFIG_NATIONAL_PHY)	+= national.o
 obj-$(CONFIG_STE10XP)		+= ste10Xp.o
+obj-$(CONFIG_MICREL_PHY)	+= micrel.o
 obj-$(CONFIG_MDIO_OCTEON)	+= mdio-octeon.o

---

^ permalink raw reply

* Re: [net-next-2.6 PATCH 2/2] add ndo_set_port_profile op support for enic dynamic vnics
From: Arnd Bergmann @ 2010-05-01 12:36 UTC (permalink / raw)
  To: Scott Feldman; +Cc: davem, netdev, chrisw, Jens Osterkamp
In-Reply-To: <C8008CCC.2D21E%scofeldm@cisco.com>

On Friday 30 April 2010, Scott Feldman wrote:
> >    ip iov set  port-profile DEVICE [ base BASE-DEVICE ] name PORT-PROFILE
> >                              [ host_uuid HOST_UUID ]
> >                      [ client_name CLIENT_NAME ]
> >                                       [ client_uuid CLIENT_UUID ]
> >    ip iov set  vsi { associate | pre-associate | pre-associate-rr }
> > BASE-DEVICE
> >                                       vsi MGR:VTID:VER
> >                                       mac LLADDR [ vlan VID ]
> >                                       client_uuid CLIENT_UUID
> > 
> >    ip iov del  port_profile DEVICE      [ base BASE-DEVICE ]
> >    ip iov del  vsi          BASE-DEVICE [ mac LLADDR [ vlan VID ] ]
> >        [ client_uuid CLIENT_UUID ]
> > 
> >    ip iov show port_profile DEVICE      [ base BASE-DEVICE ]
> >    ip iov show vsi          BASE-DEVICE [ mac LLADDR [ vlan VID ] ]
> > [ client_uuid CLIENT_UUID ]
> > 
> > You would obvioulsy only implement the kernel support for the port-profile
> > stuff as callbacks, because no driver yet does VDP in the kernel, but we
> > should
> > have a common netlink header that defines both variants.
> > 
> > Chris, any opinion on this interface as opposed to the combined one?
> > Either one should work, but splitting it seems cleaner to me.
> 
> I haven't seen Chris's response, but it seems vger was down for awhile, so
> maybe it's coming.  Assuming we go for the split design, we're still talking
> about using RTM_SETLINK/RTM_GETLINK/RTM_DELLINK for these netlink msgs?  Or
> are you suggesting by your cmd syntax that we return to
> RTM_SETIOV/RTM_GETIOV like in the first iovnl patch?  RTM_SET/GET/DELLINK is
> probably simplier, cleaner patch.

In either case (split or combined), I would prefer the separate IOV
commands. The reason for this is that when support is not in the kernel,
it allows a cleaner separation between what's (always) handled in the
kernel and what's (potentially) done in user space.

	Arnd

^ permalink raw reply

* Re: [PATCH] tcp: SO_TIMESTAMP implementation for TCP
From: Paul LeoNerd Evans @ 2010-05-01 12:06 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: therbert
In-Reply-To: <20100430.164115.257514715.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1455 bytes --]

On Fri, Apr 30, 2010 at 04:41:15PM -0700, David Miller wrote:
> If other people have an opinion about this, now would be the time
> to speak up. :-)

I have to say I agree with David.

The "receive timestamp" for a TCP recv() call is completely meaningless.
Each byte in the stream arguably could have a set of receive timestamps,
being the timestamp of the underlying IPv4 packet containing a fragment
of a TCP segment that covered that byte. One recv() call could cover
many packets, many recv() calls could be required to consume one packet.
We just don't know from userland.

The point about IPv4 fragments in UDP is a reasonable one; that because
of IPv4 fragmentation there are still potentially multiple timestamps
that could be relevant to a single UDP recv() call. But no two recv()
calls can possibly relate to the same IPv4 fragments, so I feel this is
more defined. Plus, of all the IPv4 fragments that go into a single UDP
packet, one of them is special - the first one, the one containing the
UDP header. We could easily say "the timestamp of a UDP recv() call
shall be the time at which its header was received, even if other
fragments arrived before or after it". 

We cannot make any such distinction for some window in a TCP stream. All
TCP segments are indistinct in this manner.

-- 
Paul "LeoNerd" Evans

leonerd@leonerd.org.uk
ICQ# 4135350       |  Registered Linux# 179460
http://www.leonerd.org.uk/

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 190 bytes --]

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: jamal @ 2010-05-01 11:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272714179.2230.151.camel@edumazet-laptop>

On Sat, 2010-05-01 at 13:42 +0200, Eric Dumazet wrote:

> But, whole point of epoll is to not change interest each time you get an
> event.
> 
> Without EV_PERSIST, you need two more syscalls per recvfrom()
> 
> epoll_wait()
>  epoll_ctl(REMOVE)
>  epoll_ctl(ADD)
>  recvfrom()
> 
> Even poll() would be faster in your case
> 
> poll(one fd)
> recvfrom()
> 

This is true - but my goal was/is to replicate the regression i was
seeing[1]. 
I will try with PERSIST next opportunity. If it gets better
then it is something that needs documentation in the doc Tom
promised ;->

> I always thought copybreak was borderline...
> It can help to reduce memory footprint (allocating 128 bytes instead of
> 2048/4096 bytes per frame), but with RPS, it would make sense to perform
> copybreak after RPS, not before.
> 
> Reducing memory footprint also means less changes on
> udp_memory_allocated /tcp_memory_allocate (memory reclaim logic)

Indeed, something that didnt cross my mind in the rush to test - it is
one of those things that need to be mentioned in some doc somewhere.
Tom, are you listening? ;->

cheers,
jamal

[1]i.e with this program rps was getting worse (it was much better
before say net-next of apr14) and that non-rps has been getting better
numbers since. The regression is real - but it is likely in another
subsystem.

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Eric Dumazet @ 2010-05-01 11:42 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272713014.14499.21.camel@bigi>

Le samedi 01 mai 2010 à 07:23 -0400, jamal a écrit :
> On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:
> 
> > I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> > overhead for each packet...)
> 
> Thats a different test case then ;-> You can also get rid of the timer
> (I doubt it will show much difference in results) - I have it in there
> because it i am trying to replicate what i saw causing the regression.
> 
> > RPS off : 220.000 pps 
> > 
> > RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> > 96% of delivered packets
> > 
> 
> That's a very very huge gap. What were the numbers before you changed to
> EV_PERSIST?

But, whole point of epoll is to not change interest each time you get an
event.

Without EV_PERSIST, you need two more syscalls per recvfrom()

epoll_wait()
 epoll_ctl(REMOVE)
 epoll_ctl(ADD)
 recvfrom()

Even poll() would be faster in your case

poll(one fd)
recvfrom()



> Note: i did not add any of your other patches for dst refcnt, sockets
> etc. Were you running with those patches in these tests? I will try the
> next opportunity i get to have latest kernel + those patches. 
> 
> > This is on tg3 adapter, and tg3 has copybreak feature : small packets
> > are copied into skb of the right size.
> 
> Ok, so the driver tuning is also important then (and it shows in the
> profile).

I always thought copybreak was borderline...

It can help to reduce memory footprint (allocating 128 bytes instead of
2048/4096 bytes per frame), but with RPS, it would make sense to perform
copybreak after RPS, not before.

Reducing memory footprint also means less changes on
udp_memory_allocated /tcp_memory_allocate (memory reclaim logic)




^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: jamal @ 2010-05-01 11:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272694442.2230.86.camel@edumazet-laptop>

On Sat, 2010-05-01 at 08:14 +0200, Eric Dumazet wrote:

> BTW, using ee mask, cpu4 is not used at _all_, even for the user
> threads. Scheduler does a bad job IMHO.

I have the opposite frustration ;->
I did notice it got used. My goal was to totally avoid using it, for
simple reason it is an SMT thread that shares same core as cpu0.
In retrospect i should probably set irq affinity then to cpu0 and 4.

> Using fe mask, I get all packets (sent at 733311pps by my pktgen
> machine), and my CPU0 even has idle time !!!

I will try this next time i get the chance.

cheers,
jamal

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: jamal @ 2010-05-01 11:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272693424.2230.75.camel@edumazet-laptop>

On Sat, 2010-05-01 at 07:57 +0200, Eric Dumazet wrote:

> I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> overhead for each packet...)

Thats a different test case then ;-> You can also get rid of the timer
(I doubt it will show much difference in results) - I have it in there
because it i am trying to replicate what i saw causing the regression.

> RPS off : 220.000 pps 
> 
> RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> 96% of delivered packets
> 

That's a very very huge gap. What were the numbers before you changed to
EV_PERSIST?
Note: i did not add any of your other patches for dst refcnt, sockets
etc. Were you running with those patches in these tests? I will try the
next opportunity i get to have latest kernel + those patches. 

> This is on tg3 adapter, and tg3 has copybreak feature : small packets
> are copied into skb of the right size.

Ok, so the driver tuning is also important then (and it shows in the
profile).

cheers,
jamal

^ permalink raw reply

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: Andi Kleen @ 2010-05-01 11:00 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, hadi, xiaosuo, therbert, shemminger, netdev, lenb,
	arjan
In-Reply-To: <20100430.163857.180417789.davem@davemloft.net>

On Fri, Apr 30, 2010 at 04:38:57PM -0700, David Miller wrote:
> From: Andi Kleen <ak@gargoyle.fritz.box>
> Date: Thu, 29 Apr 2010 23:41:44 +0200
> 
> >     Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
> > 
> >     XXX: probably too aggressive, some of these sleeps are not under high load.
> > 
> >     Based on a bug report from Eric Dumazet.
> >     
> >     Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> I like this, except that we probably don't want the delayacct_blkio_*() calls
> these things do.

Yes.

It needs more work, please don't apply it yet, to handle the "long sleep" case.

Still curious if it fixes Eric's test case.

> 
> Probably the rest of what these things do should remain in the io_schedule*()
> functions and the block layer can call it's own versions which add in the
> delayacct_blkio_*() bits.

Good point.

> 
> Or, if the delacct stuff is useful for socket I/O too, then it's interfaces
> names should have the "blk" stripped from them :-)

Good question. I suspect it's actually useful for some cases, but just adding
sockets might confuse some users.

-Andi

^ permalink raw reply

* Re: OFT - reserving CPU's for networking
From: Andi Kleen @ 2010-05-01 10:53 UTC (permalink / raw)
  To: David Miller; +Cc: tglx, shemminger, eric.dumazet, netdev, peterz
In-Reply-To: <20100430.153038.62351857.davem@davemloft.net>

> And we don't want it to, because the decision mechanisms for steering
> that we using now are starting to get into the stateful territory and
> that's verbotton for NIC offload as far as we're concerned.

Huh? I thought full TCP offload was forbidden?[1] Statefull as in NIC 
(or someone else like netfilter) tracking flows is quite common and very far 
from full offload. AFAIK it doesn't have near all the problems full
offload has.

-Andi

[1] although it seems to leak in more and more through the RDMA backdoor.

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Eric Dumazet @ 2010-05-01 10:47 UTC (permalink / raw)
  To: Changli Gao
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <o2u412e6f7f1005010324sfb63393fo86acdff4c97c5be3@mail.gmail.com>

Le samedi 01 mai 2010 à 18:24 +0800, Changli Gao a écrit :
> On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > BTW, using ee mask, cpu4 is not used at _all_, even for the user
> > threads. Scheduler does a bad job IMHO.
> >
> > Using fe mask, I get all packets (sent at 733311pps by my pktgen
> > machine), and my CPU0 even has idle time !!!
> >
> > Limit seems to be around 800.000 pps
> >
> > ------------------------------------------------------------------------------------------------------------------------
> >   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
> > ------------------------------------------------------------------------------------------------------------------------
> >
> 
> Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2%(Am I right?). If we
> can do weighted packet distributing: cpu0's weight is 1, and other
> cpus are 2. maybe we can utilize all the cpu power.
> 

Nope, cpu0 was at 100% in this test, other cpus were about at 50% each.

weigthed would be ok if I wanted to use cpu0 in the 'slave' cpus (RPS
targets). But I know the workload I am interested to, and ability to
resist to DDOS, want to keep cpu0 outside of IP/TCP/UDP stack.


Later, skb_pull() inline in eth_type_trans() permitted to reach 840.000
pps.

top - 12:42:55 up  3:00,  2 users,  load average: 0.44, 0.11, 0.03
Tasks: 126 total,   1 running, 125 sleeping,   0 stopped,   0 zombie
Cpu(s):  2.2%us, 16.5%sy,  0.0%ni, 46.5%id, 11.4%wa,  0.9%hi, 22.5%si,
0.0%st
Mem:   4148112k total,   211152k used,  3936960k free,    15228k buffers
Swap:  4192928k total,        0k used,  4192928k free,   121804k cached

You can see average idle of 46%
So there is probably more optimizations to do to reach maybe 1.300.000
pps ;)




^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Changli Gao @ 2010-05-01 10:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: hadi, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272694442.2230.86.camel@edumazet-laptop>

On Sat, May 1, 2010 at 2:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> BTW, using ee mask, cpu4 is not used at _all_, even for the user
> threads. Scheduler does a bad job IMHO.
>
> Using fe mask, I get all packets (sent at 733311pps by my pktgen
> machine), and my CPU0 even has idle time !!!
>
> Limit seems to be around 800.000 pps
>
> ------------------------------------------------------------------------------------------------------------------------
>   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
> ------------------------------------------------------------------------------------------------------------------------
>

Oh, cpu0 usage is about 100-(100-93.9)*8 = 51.2%(Am I right?). If we
can do weighted packet distributing: cpu0's weight is 1, and other
cpus are 2. maybe we can utilize all the cpu power.

-- 
Regards，
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
From: Eric Dumazet @ 2010-05-01  8:03 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb
In-Reply-To: <1272697367.2230.106.camel@edumazet-laptop>

Le samedi 01 mai 2010 à 09:02 +0200, Eric Dumazet a écrit :
> Le vendredi 30 avril 2010 à 16:35 -0700, David Miller a écrit :
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 29 Apr 2010 23:01:49 +0200
> > 
> > > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> > 
> > So what's the difference between call_rcu() freeing this little waitqueue
> > struct and doing it for the entire socket?
> > 
> > We'll still be doing an RCU call every socket destroy, and now we also have
> > a new memory allocation/free per connection.
> > 
> > This has to show up in things like 'lat_connect' and friends, does it not?
> 
> Before patch :
> 
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds
> 
> After :
> 
> lat_connect -N 10 127.0.0.1
> TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds
> 
> Strange isnt it ?
> 
> (special care should be taken with this bench, as it leave many sockets
> in TIME_WAIT state, so to get consistent numbers we have to wait a while
> before restarting it)


Oops, this was with the other patch (about dst no_refcounting in input
path), sorry.

With the "sock_def_readable() and friends RCU conversion" patch I got :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.6244 microseconds


Anyway, this lat_connect seems very unreliable (lot of variance)

with linux-2.6.31, ~33 us
with linux-2.6.33, ~30 us

David, I also need this RCU thing in order to be able to group all
wakeups at the end of net_rx_action().

Plan was to use RCU, so that I dont need to increase sk_refcnt when
queueing a "wakeup" (and decrease sk_refcnt a long time after)

Previous attempt was a bit hacky,
http://patchwork.ozlabs.org/patch/24179/

I expect 2010 one will be cleaner :)



^ permalink raw reply

* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
From: Eric Dumazet @ 2010-05-01  7:02 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb
In-Reply-To: <20100430.163519.133415203.davem@davemloft.net>

Le vendredi 30 avril 2010 à 16:35 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 29 Apr 2010 23:01:49 +0200
> 
> > [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
> 
> So what's the difference between call_rcu() freeing this little waitqueue
> struct and doing it for the entire socket?
> 
> We'll still be doing an RCU call every socket destroy, and now we also have
> a new memory allocation/free per connection.
> 
> This has to show up in things like 'lat_connect' and friends, does it not?

Before patch :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 27.8872 microseconds

After :

lat_connect -N 10 127.0.0.1
TCP/IP connection cost to 127.0.0.1: 20.7681 microseconds

Strange isnt it ?

(special care should be taken with this bench, as it leave many sockets
in TIME_WAIT state, so to get consistent numbers we have to wait a while
before restarting it)




^ permalink raw reply

* [PATCH net-next-2.6] net: eth_type_trans() should inline skb_pull()
From: Eric Dumazet @ 2010-05-01  6:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tom Herbert, jamal

840.000 pps instead of 800.000 pps on my 'old' machine, using RPS

Before patch, profile of CPU 0 (handling tg3 interrupts)

             2167.00 13.9% __alloc_skb            vmlinux
             1908.00 12.3% eth_type_trans         vmlinux
             1125.00  7.2% __kmalloc_track_caller vmlinux
              981.00  6.3% __netdev_alloc_skb     vmlinux
              925.00  5.9% _raw_spin_lock         vmlinux
              786.00  5.1% kmem_cache_alloc       vmlinux
              757.00  4.9% skb_pull               vmlinux
              698.00  4.5% tg3_read32             vmlinux
              637.00  4.1% __slab_alloc           vmlinux
              620.00  4.0% tg3_poll_work          vmlinux
              576.00  3.7% get_rps_cpu            vmlinux
              448.00  2.9% bnx2_interrupt         vmlinux

After (no more skb_pull, and eth_type_trans() not more expensive)
Predominant cost is memory allocator...

             1625.00 12.4% eth_type_trans         vmlinux
             1468.00 11.2% __alloc_skb            vmlinux
             1004.00  7.6% __kmalloc_track_caller vmlinux
              893.00  6.8% _raw_spin_lock         vmlinux
              738.00  5.6% __netdev_alloc_skb     vmlinux
              665.00  5.1% tg3_read32             vmlinux
              656.00  5.0% kmem_cache_alloc       vmlinux
              655.00  5.0% __slab_alloc           vmlinux
              509.00  3.9% bnx2_interrupt         vmlinux
              483.00  3.7% tg3_poll_work          vmlinux
              455.00  3.5% _raw_spin_lock_irqsave vmlinux
              330.00  2.5% get_rps_cpu            vmlinux
              286.00  2.2% nommu_map_page         vmlinux
              277.00  2.1% enqueue_to_backlog     vmlinux
              235.00  1.8% inet_gro_receive       vmlinux
              232.00  1.8% __copy_to_user_ll      vmlinux
              181.00  1.4% dev_gro_receive        vmlinux
              165.00  1.3% skb_gro_reset_offset   vmlinux

(bnx2_interrupt is called, because irq 16 is shared on this machine on two nics...)

Thanks !

[PATCH net-next-2.6] net: eth_type_trans() should inline skb_pull()

With RPS, this patch can give a 5 % boost in performance.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0c0d272..763524b 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -162,7 +162,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 	skb->dev = dev;
 	skb_reset_mac_header(skb);
-	skb_pull(skb, ETH_HLEN);
+	if (likely(skb->len >= ETH_HLEN))
+		__skb_pull(skb, ETH_HLEN);
 	eth = eth_hdr(skb);
 
 	if (unlikely(is_multicast_ether_addr(eth->h_dest))) {



^ permalink raw reply related

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Eric Dumazet @ 2010-05-01  6:14 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272693424.2230.75.camel@edumazet-laptop>

Le samedi 01 mai 2010 à 07:57 +0200, Eric Dumazet a écrit :
> Le vendredi 30 avril 2010 à 20:06 -0400, jamal a écrit :
> 
> > Yes, Nehalem. 
> > RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
> > same trend on the old hardware?
> > 
> 
> Of course not ! Or else RPS would be useless :(
> 
> I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
> overhead for each packet...)
> 
> RPS off : 220.000 pps 
> 
> RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
> 96% of delivered packets

BTW, using ee mask, cpu4 is not used at _all_, even for the user
threads. Scheduler does a bad job IMHO.

Using fe mask, I get all packets (sent at 733311pps by my pktgen
machine), and my CPU0 even has idle time !!!

Limit seems to be around 800.000 pps

------------------------------------------------------------------------------------------------------------------------
   PerfTop:    5616 irqs/sec  kernel:93.9% [1000Hz cycles],  (all, 8 CPUs)
------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ _______

             3492.00  6.2% __slab_free                 vmlinux
             2334.00  4.2% _raw_spin_lock              vmlinux
             2314.00  4.1% _raw_spin_lock_irqsave      vmlinux
             1807.00  3.2% ip_rcv                      vmlinux
             1605.00  2.9% schedule                    vmlinux
             1474.00  2.6% __netif_receive_skb         vmlinux
             1464.00  2.6% kfree                       vmlinux
             1405.00  2.5% ip_route_input              vmlinux
             1318.00  2.4% __copy_to_user_ll           vmlinux
             1214.00  2.2% __alloc_skb                 vmlinux
             1160.00  2.1% nf_hook_slow                vmlinux
             1020.00  1.8% eth_type_trans              vmlinux
              860.00  1.5% sched_clock_local           vmlinux
              775.00  1.4% read_tsc                    vmlinux
              773.00  1.4% ipt_do_table                vmlinux
              766.00  1.4% _raw_spin_unlock_irqrestore vmlinux
              748.00  1.3% sock_recv_ts_and_drops      vmlinux
              747.00  1.3% ia32_sysenter_target        vmlinux
              740.00  1.3% select_nohz_load_balancer   vmlinux
              644.00  1.2% __kmalloc_track_caller      vmlinux
              596.00  1.1% tg3_read32                  vmlinux
              566.00  1.0% __udp4_lib_lookup           vmlinux





^ permalink raw reply

* Re: [PATCH] tcp: SO_TIMESTAMP implementation for TCP
From: Eric Dumazet @ 2010-05-01  6:00 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Bill Fink, David Miller, netdev
In-Reply-To: <AANLkTimFjDNgXdMGpVpO7Gi38ROlww1Fa7IKA1ASBKOV@mail.gmail.com>

Le vendredi 30 avril 2010 à 22:40 -0700, Tom Herbert a écrit :
> > Not being a kernel hacker, I will naively ask if the kernel tracing
> > facility could somehow be used to provide the desired info (or could
> > be modified to provide it).
> >
> 
> We did consider kernel tracing (more in the context of implementing
> RFC 4898).  In the case of trying get per packet timestamps,
> correlating a ktrace event with an application message is probably too
> high to make it practical.  If it weren't for the cost of
> timestamp'ing every single skb being received, we'd probably have
> SO_TIMESTAMP turned on permanently for many connections.  For now
> we're settling for a percentage of messages for sampling.

Tom, did you tried to reuse existing skb or sk tstamps ?




^ permalink raw reply

* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: Eric Dumazet @ 2010-05-01  5:57 UTC (permalink / raw)
  To: hadi
  Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
	Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272672394.14499.1.camel@bigi>

Le vendredi 30 avril 2010 à 20:06 -0400, jamal a écrit :

> Yes, Nehalem. 
> RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
> same trend on the old hardware?
> 

Of course not ! Or else RPS would be useless :(

I changed your program a bit to use EV_PERSIST, (to avoid epoll_ctl()
overhead for each packet...)

RPS off : 220.000 pps 

RPS on (ee mask) : 700.000 pps  (with a slightly modified tg3 driver)
96% of delivered packets

This is on tg3 adapter, and tg3 has copybreak feature : small packets
are copied into skb of the right size.

define TG3_RX_COPY_THRESHOLD       256 -> 40 ...

We really should disable this feature for RPS workload,
unfortunatly ethtool cannot tweak this.

So profile of cpu 0 (RPS ON) looks like :

------------------------------------------------------------------------------------------------------------------------
   PerfTop:    1001 irqs/sec  kernel:99.7% [1000Hz cycles],  (all, cpu: 0)
------------------------------------------------------------------------------------------------------------------------

             samples  pcnt function               DSO
             _______ _____ ______________________ _______

              819.00 12.6% __alloc_skb            vmlinux
              592.00  9.1% eth_type_trans         vmlinux
              509.00  7.8% _raw_spin_lock         vmlinux
              475.00  7.3% __kmalloc_track_caller vmlinux
              358.00  5.5% tg3_read32             vmlinux
              345.00  5.3% __netdev_alloc_skb     vmlinux
              329.00  5.0% kmem_cache_alloc       vmlinux
              307.00  4.7% _raw_spin_lock_irqsave vmlinux
              284.00  4.4% bnx2_interrupt         vmlinux
              277.00  4.2% skb_pull               vmlinux
              248.00  3.8% tg3_poll_work          vmlinux
              202.00  3.1% __slab_alloc           vmlinux
              197.00  3.0% get_rps_cpu            vmlinux
              106.00  1.6% enqueue_to_backlog     vmlinux
               87.00  1.3% _raw_spin_lock_bh      vmlinux
               80.00  1.2% __copy_to_user_ll      vmlinux
               77.00  1.2% nommu_map_page         vmlinux
               77.00  1.2% __napi_gro_receive     vmlinux
               65.00  1.0% tg3_alloc_rx_skb       vmlinux
               60.00  0.9% skb_gro_reset_offset   vmlinux
               57.00  0.9% skb_put                vmlinux
               57.00  0.9% __slab_free            vmlinux


/*
 *  Usage: udpsnkfrk [ -p baseport] nbports
*/
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <event.h>

struct worker_data {
	struct event *snk_ev;
	struct event_base *base;
	struct timeval t;
	unsigned long pack_count;
	unsigned long bytes_count;
	unsigned long tout;
	int fd;			/* move to avoid hole on 64-bit */
	int pad1;	
	unsigned long _padd[99]; /* avoid false sharing */
};

void usage(int code)
{
	fprintf(stderr, "Usage: udpsink [-p baseport] nbports\n");
	exit(code);
}

void process_recv(int fd, short ev, void *arg)
{
	char buffer[4096];
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	struct worker_data *wdata = (struct worker_data *)arg;
	int lu = 0;


	if (ev == EV_TIMEOUT) {
		wdata->tout++;
		if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
			perror("cb event_add");
			return;
		}
	} else {
		do {
			lu = recvfrom(wdata->fd, buffer, sizeof(buffer), 0,
			      (struct sockaddr *)&addr, &len);
			if (lu > 0) {
				wdata->pack_count++;
				wdata->bytes_count += lu;
			}
		} while (lu > 0);
	}
}

int prep_thread(struct worker_data *wdata)
{
	wdata->t.tv_sec = 1;
	wdata->t.tv_usec = random() % 50000L;

	wdata->base = event_init();
	event_set(wdata->snk_ev, wdata->fd, EV_READ|EV_PERSIST, process_recv, wdata);
	event_base_set(wdata->base, wdata->snk_ev);
	if ((event_add(wdata->snk_ev, &wdata->t)) < 0) {
		perror("event_add");
		return -1;
	}
	return 0;
}

void *worker_func(void *arg)
{
	struct worker_data *wdata = (struct worker_data *)arg;

	return (void *)event_base_loop(wdata->base, 0);
}

int main(int argc, char *argv[])
{
	int c;
	int baseport = 4000;
	int nbthreads;
	struct worker_data *wdata;
	unsigned long ototal = 0;
	int concurrent = 0;
	int verbose = 0;
	int i;
	while ((c = getopt(argc, argv, "cvp:")) != -1) {
		if (c == 'p')
			baseport = atoi(optarg);
		else if (c == 'c')
			concurrent = 1;
		else if (c == 'v')
			verbose++;
		else
			usage(1);
	}
	if (optind == argc)
		usage(1);
	nbthreads = atoi(argv[optind]);
	wdata = calloc(sizeof(struct worker_data), nbthreads);
	if (!wdata) {
		perror("calloc");
		return 1;
	}

	for (i = 0; i < nbthreads; i++) {
		struct sockaddr_in addr;
		pthread_t tid;

		if (i && concurrent) {
			wdata[i].fd = wdata[0].fd;
		} else {
			wdata[i].snk_ev = malloc(sizeof(struct event));
			if (!wdata[i].snk_ev)
				return 1;
			memset(wdata[i].snk_ev, 0, sizeof(struct event));

			wdata[i].fd = socket(PF_INET, SOCK_DGRAM, 0);
			if (wdata[i].fd == -1) {
				free(wdata[i].snk_ev);
				perror("socket");
				return 1;
			}
			memset(&addr, 0, sizeof(addr));
			addr.sin_family = AF_INET;
//                      addr.sin_addr.s_addr = inet_addr(argv[optind]);
			addr.sin_port = htons(baseport + i);
			if (bind
			    (wdata[i].fd, (struct sockaddr *)&addr,
			     sizeof(addr)) < 0) {
				free(wdata[i].snk_ev);
				perror("bind");
				return 1;
			}
                      fcntl(wdata[i].fd, F_SETFL, O_NDELAY);
		}
		if (prep_thread(wdata + i)) {
			printf("failed to allocate thread %d, exit\n", i);
			exit(0);
		}
		pthread_create(&tid, NULL, worker_func, wdata + i);
	}

	for (;;) {
		unsigned long total;
		long delta;

		sleep(1);
		total = 0;
		for (i = 0; i < nbthreads; i++) {
			total += wdata[i].pack_count;
		}
		delta = total - ototal;
		if (delta) {
			printf("%lu pps (%lu", delta, total);
			if (verbose) {
				for (i = 0; i < nbthreads; i++) {
					if (wdata[i].pack_count)
						printf(" %d:%lu", i,
						       wdata[i].pack_count);
				}
			}
			printf(")\n");
		}
		ototal = total;
	}
}




^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox