All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] [RFC] Add c/r support for listening INET sockets
@ 2009-09-28 18:58 Dan Smith
       [not found] ` <1254164330-23072-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Dan Smith @ 2009-09-28 18:58 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: Andrew Morton

This is an incremental step towards supporting checkpoint/restart on
AF_INET sockets.  In this scenario, any sockets that were in TCP_LISTEN
state are restored as they were.  Any that were connected are forced to
TCP_CLOSE.  This should cover a range of use cases that involve
applications that are tolerant of such an interruption.

Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint_hdr.h |   11 ++
 include/net/inet_common.h      |    9 ++
 net/checkpoint.c               |    9 ++
 net/ipv4/Makefile              |    1 +
 net/ipv4/af_inet.c             |    6 +
 net/ipv4/checkpoint.c          |  204 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 240 insertions(+), 0 deletions(-)
 create mode 100644 net/ipv4/checkpoint.c

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 2ed523f..b5ce115 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -15,9 +15,11 @@
 #ifdef __KERNEL__
 #include <linux/socket.h>
 #include <linux/un.h>
+#include <linux/in.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <sys/in.h>
 #endif
 
 /*
@@ -106,6 +108,7 @@ enum {
 	CKPT_HDR_SOCKET_QUEUE,
 	CKPT_HDR_SOCKET_BUFFER,
 	CKPT_HDR_SOCKET_UNIX,
+	CKPT_HDR_SOCKET_INET,
 
 	CKPT_HDR_TAIL = 9001,
 
@@ -470,6 +473,14 @@ struct ckpt_hdr_socket_unix {
 	struct sockaddr_un raddr;
 } __attribute__ ((aligned(8)));
 
+struct ckpt_hdr_socket_inet {
+	struct ckpt_hdr h;
+	__u32 laddr_len;
+	__u32 raddr_len;
+	struct sockaddr_in laddr;
+	struct sockaddr_in raddr;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_socket {
 	struct ckpt_hdr_file common;
 	__s32 sock_objref;
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 18c7732..7ade732 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -45,6 +45,15 @@ extern int			inet_ctl_sock_create(struct sock **sk,
 						     unsigned char protocol,
 						     struct net *net);
 
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+extern int inet_collect(struct ckpt_ctx *ctx, struct socket *sock);
+extern int inet_restore(struct ckpt_ctx *cftx, struct socket *sock,
+			struct ckpt_hdr_socket *h);
+#endif /* CONFIG_CHECKPOINT */
+
 static inline void inet_ctl_sock_destroy(struct sock *sk)
 {
 	sk_release_kernel(sk);
diff --git a/net/checkpoint.c b/net/checkpoint.c
index a11ec7a..6960389 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -711,6 +711,15 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		goto err;
 
+	if ((h->sock_common.family == AF_INET) &&
+	    (h->sock.state != TCP_LISTEN)) {
+		/* Temporary hack to enable restore of TCP_LISTEN sockets
+		 * while forcing anything else to a closed state
+		 */
+		sock->sk->sk_state = TCP_CLOSE;
+		sock->state = SS_UNCONNECTED;
+	}
+
 	ckpt_hdr_put(ctx, h);
 	return sock->sk;
  err:
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87c..c00d8ce 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 566ea6c..7828885 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -857,6 +857,9 @@ const struct proto_ops inet_stream_ops = {
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = tcp_sendpage,
 	.splice_read	   = tcp_splice_read,
+	.checkpoint        = inet_checkpoint,
+	.restore           = inet_restore,
+	.collect           = inet_collect,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
@@ -882,6 +885,9 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = sock_common_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.checkpoint        = inet_checkpoint,
+	.restore           = inet_restore,
+	.collect           = inet_collect,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
new file mode 100644
index 0000000..881e723
--- /dev/null
+++ b/net/ipv4/checkpoint.c
@@ -0,0 +1,204 @@
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author(s): Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/namei.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
+#include <linux/deferqueue.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+struct dq_sock {
+	struct ckpt_ctx *ctx;
+	struct sock *sk;
+};
+
+struct dq_buffers {
+	struct ckpt_ctx *ctx;
+	struct sock *sk;
+};
+
+int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
+{
+	struct ckpt_hdr_socket_inet *in;
+	int ret = -EINVAL;
+
+       in = ckpt_hdr_get_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+       if (!in)
+               goto out;
+
+       ret = ckpt_sock_getnames(ctx, sock,
+				(struct sockaddr *)&in->laddr, &in->laddr_len,
+				(struct sockaddr *)&in->raddr, &in->raddr_len);
+       if (ret)
+	       goto out;
+
+       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
+ out:
+       ckpt_hdr_put(ctx, in);
+
+       return ret;
+}
+
+int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
+{
+       return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
+}
+
+static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int len;
+	int ret;
+	struct sk_buff *skb;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (len < 0) {
+		ret = len;
+		goto out;
+	} else if (len > SKB_MAX_ALLOC) {
+		ckpt_debug("Socket buffer too big (%i > %lu)",
+			   len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ckpt_kread(ctx, skb_put(skb, len), len);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&queue->lock);
+	skb_queue_tail(queue, skb);
+	spin_unlock(&queue->lock);
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	if (ret < 0)
+		kfree_skb(skb);
+
+	return ret;
+}
+
+static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_queue *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	for (i = 0; i < h->skb_count; i++) {
+		ret = inet_read_buffer(ctx, queue);
+		ckpt_debug("read inet buffer %i: %i", i, ret);
+		if (ret < 0)
+			goto out;
+
+		if (ret > h->total_bytes) {
+			ckpt_debug("Buffers exceeded claim");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		h->total_bytes -= ret;
+		ret = 0;
+	}
+
+	ret = h->skb_count;
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int inet_deferred_restore_buffers(void *data)
+{
+	struct dq_buffers *dq = (struct dq_buffers *)data;
+	struct ckpt_ctx *ctx = dq->ctx;
+	struct sock *sk = dq->sk;
+	int ret;
+
+	ret = inet_read_buffers(ctx, &sk->sk_receive_queue);
+	ckpt_debug("(R) inet_read_buffers: %i\n", ret);
+	if (ret < 0)
+		return ret;
+
+	ret = inet_read_buffers(ctx, &sk->sk_write_queue);
+	ckpt_debug("(W) inet_read_buffers: %i\n", ret);
+
+	return ret;
+}
+
+static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct dq_buffers dq;
+
+	dq.ctx = ctx;
+	dq.sk = sk;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      inet_deferred_restore_buffers, NULL);
+}
+
+int inet_restore(struct ckpt_ctx *ctx,
+		 struct socket *sock,
+		 struct ckpt_hdr_socket *h)
+{
+	struct ckpt_hdr_socket_inet *in;
+	int ret = 0;
+
+	in = ckpt_read_obj_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+	if (IS_ERR(in))
+		return PTR_ERR(in);
+
+	/* Listening sockets and those that are closed but have a local
+	 * address need to call bind()
+	 */
+	if ((h->sock.state == TCP_LISTEN) ||
+	    ((h->sock.state == TCP_CLOSE) && (in->laddr_len > 0))) {
+		sock->sk->sk_reuse = 2;
+		inet_sk(sock->sk)->freebind = 1;
+		ret = sock->ops->bind(sock,
+				      (struct sockaddr *)&in->laddr,
+				      in->laddr_len);
+		ckpt_debug("inet bind: %i\n", ret);
+		if (ret < 0)
+			goto out;
+
+		if (h->sock.state == TCP_LISTEN) {
+			ret = sock->ops->listen(sock, h->sock.backlog);
+			ckpt_debug("inet listen: %i\n", ret);
+			if (ret < 0)
+				goto out;
+		}
+	} else {
+		if (!sock_flag(sock->sk, SOCK_DEAD))
+			ret = inet_defer_restore_buffers(ctx, sock->sk);
+	}
+ out:
+	ckpt_hdr_put(ctx, in);
+
+	return ret;
+ }
+
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] [RFC] Add c/r support for listening INET sockets
       [not found] ` <1254164330-23072-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-29 19:50   ` Serge E. Hallyn
       [not found]     ` <20090929195025.GA14956-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-29 21:49   ` Oren Laadan
  2009-09-29 22:16   ` Oren Laadan
  2 siblings, 1 reply; 6+ messages in thread
From: Serge E. Hallyn @ 2009-09-29 19:50 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Andrew Morton

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> This is an incremental step towards supporting checkpoint/restart on
> AF_INET sockets.  In this scenario, any sockets that were in TCP_LISTEN
> state are restored as they were.  Any that were connected are forced to
> TCP_CLOSE.  This should cover a range of use cases that involve
> applications that are tolerant of such an interruption.
> 
> Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
> Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  include/linux/checkpoint_hdr.h |   11 ++
>  include/net/inet_common.h      |    9 ++
>  net/checkpoint.c               |    9 ++
>  net/ipv4/Makefile              |    1 +
>  net/ipv4/af_inet.c             |    6 +
>  net/ipv4/checkpoint.c          |  204 ++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 240 insertions(+), 0 deletions(-)
>  create mode 100644 net/ipv4/checkpoint.c
> 
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 2ed523f..b5ce115 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -15,9 +15,11 @@
>  #ifdef __KERNEL__
>  #include <linux/socket.h>
>  #include <linux/un.h>
> +#include <linux/in.h>
>  #else
>  #include <sys/socket.h>
>  #include <sys/un.h>
> +#include <sys/in.h>
>  #endif
> 
>  /*
> @@ -106,6 +108,7 @@ enum {
>  	CKPT_HDR_SOCKET_QUEUE,
>  	CKPT_HDR_SOCKET_BUFFER,
>  	CKPT_HDR_SOCKET_UNIX,
> +	CKPT_HDR_SOCKET_INET,
> 
>  	CKPT_HDR_TAIL = 9001,
> 
> @@ -470,6 +473,14 @@ struct ckpt_hdr_socket_unix {
>  	struct sockaddr_un raddr;
>  } __attribute__ ((aligned(8)));
> 
> +struct ckpt_hdr_socket_inet {
> +	struct ckpt_hdr h;
> +	__u32 laddr_len;
> +	__u32 raddr_len;
> +	struct sockaddr_in laddr;
> +	struct sockaddr_in raddr;
> +} __attribute__((aligned(8)));
> +
>  struct ckpt_hdr_file_socket {
>  	struct ckpt_hdr_file common;
>  	__s32 sock_objref;
> diff --git a/include/net/inet_common.h b/include/net/inet_common.h
> index 18c7732..7ade732 100644
> --- a/include/net/inet_common.h
> +++ b/include/net/inet_common.h
> @@ -45,6 +45,15 @@ extern int			inet_ctl_sock_create(struct sock **sk,
>  						     unsigned char protocol,
>  						     struct net *net);
> 
> +#ifdef CONFIG_CHECKPOINT
> +struct ckpt_ctx;
> +struct ckpt_hdr_socket;
> +extern int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
> +extern int inet_collect(struct ckpt_ctx *ctx, struct socket *sock);
> +extern int inet_restore(struct ckpt_ctx *cftx, struct socket *sock,
> +			struct ckpt_hdr_socket *h);
> +#endif /* CONFIG_CHECKPOINT */
> +
>  static inline void inet_ctl_sock_destroy(struct sock *sk)
>  {
>  	sk_release_kernel(sk);
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index a11ec7a..6960389 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -711,6 +711,15 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
>  	if (ret < 0)
>  		goto err;
> 
> +	if ((h->sock_common.family == AF_INET) &&
> +	    (h->sock.state != TCP_LISTEN)) {
> +		/* Temporary hack to enable restore of TCP_LISTEN sockets
> +		 * while forcing anything else to a closed state
> +		 */
> +		sock->sk->sk_state = TCP_CLOSE;

I'll admit by this point I'm a bit confused about the order in which
these things are restored.  Does do_sock_restore() run before
inet_restore() below (curious whether its check for TCP_CLOSE will
catch these particular sockets).

> +		sock->state = SS_UNCONNECTED;
> +	}
> +
>  	ckpt_hdr_put(ctx, h);
>  	return sock->sk;
>   err:
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index 80ff87c..c00d8ce 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
>  obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
>  obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
>  obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
> +obj-$(CONFIG_CHECKPOINT) += checkpoint.o
> 
>  obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
>  		      xfrm4_output.o
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 566ea6c..7828885 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -857,6 +857,9 @@ const struct proto_ops inet_stream_ops = {
>  	.mmap		   = sock_no_mmap,
>  	.sendpage	   = tcp_sendpage,
>  	.splice_read	   = tcp_splice_read,
> +	.checkpoint        = inet_checkpoint,
> +	.restore           = inet_restore,
> +	.collect           = inet_collect,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_sock_common_setsockopt,
>  	.compat_getsockopt = compat_sock_common_getsockopt,
> @@ -882,6 +885,9 @@ const struct proto_ops inet_dgram_ops = {
>  	.recvmsg	   = sock_common_recvmsg,
>  	.mmap		   = sock_no_mmap,
>  	.sendpage	   = inet_sendpage,
> +	.checkpoint        = inet_checkpoint,
> +	.restore           = inet_restore,
> +	.collect           = inet_collect,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_sock_common_setsockopt,
>  	.compat_getsockopt = compat_sock_common_getsockopt,
> diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
> new file mode 100644
> index 0000000..881e723
> --- /dev/null
> +++ b/net/ipv4/checkpoint.c
> @@ -0,0 +1,204 @@
> +/*
> + *  Copyright 2009 IBM Corporation
> + *
> + *  Author(s): Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + */
> +
> +#include <linux/namei.h>
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/tcp.h>
> +#include <linux/in.h>
> +#include <linux/deferqueue.h>
> +#include <net/tcp_states.h>
> +#include <net/tcp.h>
> +
> +struct dq_sock {
> +	struct ckpt_ctx *ctx;
> +	struct sock *sk;
> +};
> +
> +struct dq_buffers {
> +	struct ckpt_ctx *ctx;
> +	struct sock *sk;
> +};
> +
> +int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
> +{
> +	struct ckpt_hdr_socket_inet *in;
> +	int ret = -EINVAL;
> +
> +       in = ckpt_hdr_get_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
> +       if (!in)
> +               goto out;

Better to just return...  I don't think this'll cause a crash (it only
makes ckpt_hdr_put() pass ((struct ckpt_hdr *)NULL)->len to
_ckpt_hdr_put which then ignores it and does a safe kfree(NULL)), but
it won't be safe if the implementation of the ckpt_hdr_*() calls end up
changing.

> +
> +       ret = ckpt_sock_getnames(ctx, sock,
> +				(struct sockaddr *)&in->laddr, &in->laddr_len,
> +				(struct sockaddr *)&in->raddr, &in->raddr_len);
> +       if (ret)
> +	       goto out;
> +
> +       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
> + out:
> +       ckpt_hdr_put(ctx, in);
> +
> +       return ret;
> +}
> +
> +int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
> +{
> +       return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
> +}
> +
> +static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int len;
> +	int ret;
> +	struct sk_buff *skb;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> +	if (len < 0) {
> +		ret = len;
> +		goto out;
> +	} else if (len > SKB_MAX_ALLOC) {
> +		ckpt_debug("Socket buffer too big (%i > %lu)",
> +			   len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	skb = alloc_skb(len, GFP_KERNEL);
> +	if (!skb) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = ckpt_kread(ctx, skb_put(skb, len), len);
> +	if (ret < 0)
> +		goto out;
> +
> +	spin_lock(&queue->lock);
> +	skb_queue_tail(queue, skb);
> +	spin_unlock(&queue->lock);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	if (ret < 0)
> +		kfree_skb(skb);
> +
> +	return ret;
> +}
> +
> +static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_queue *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		ret = inet_read_buffer(ctx, queue);
> +		ckpt_debug("read inet buffer %i: %i", i, ret);
> +		if (ret < 0)
> +			goto out;
> +
> +		if (ret > h->total_bytes) {
> +			ckpt_debug("Buffers exceeded claim");
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		h->total_bytes -= ret;
> +		ret = 0;
> +	}
> +
> +	ret = h->skb_count;
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int inet_deferred_restore_buffers(void *data)
> +{
> +	struct dq_buffers *dq = (struct dq_buffers *)data;
> +	struct ckpt_ctx *ctx = dq->ctx;
> +	struct sock *sk = dq->sk;
> +	int ret;
> +
> +	ret = inet_read_buffers(ctx, &sk->sk_receive_queue);
> +	ckpt_debug("(R) inet_read_buffers: %i\n", ret);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = inet_read_buffers(ctx, &sk->sk_write_queue);
> +	ckpt_debug("(W) inet_read_buffers: %i\n", ret);
> +
> +	return ret;
> +}
> +
> +static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct dq_buffers dq;
> +
> +	dq.ctx = ctx;
> +	dq.sk = sk;
> +
> +	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
> +			      inet_deferred_restore_buffers, NULL);
> +}
> +
> +int inet_restore(struct ckpt_ctx *ctx,
> +		 struct socket *sock,
> +		 struct ckpt_hdr_socket *h)
> +{
> +	struct ckpt_hdr_socket_inet *in;
> +	int ret = 0;
> +
> +	in = ckpt_read_obj_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
> +	if (IS_ERR(in))
> +		return PTR_ERR(in);
> +
> +	/* Listening sockets and those that are closed but have a local
> +	 * address need to call bind()
> +	 */
> +	if ((h->sock.state == TCP_LISTEN) ||
> +	    ((h->sock.state == TCP_CLOSE) && (in->laddr_len > 0))) {
> +		sock->sk->sk_reuse = 2;
> +		inet_sk(sock->sk)->freebind = 1;
> +		ret = sock->ops->bind(sock,
> +				      (struct sockaddr *)&in->laddr,
> +				      in->laddr_len);
> +		ckpt_debug("inet bind: %i\n", ret);
> +		if (ret < 0)
> +			goto out;
> +
> +		if (h->sock.state == TCP_LISTEN) {
> +			ret = sock->ops->listen(sock, h->sock.backlog);
> +			ckpt_debug("inet listen: %i\n", ret);
> +			if (ret < 0)
> +				goto out;
> +		}
> +	} else {
> +		if (!sock_flag(sock->sk, SOCK_DEAD))
> +			ret = inet_defer_restore_buffers(ctx, sock->sk);
> +	}
> + out:
> +	ckpt_hdr_put(ctx, in);
> +
> +	return ret;
> + }
> +
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] [RFC] Add c/r support for listening INET sockets
       [not found]     ` <20090929195025.GA14956-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-29 19:56       ` Dan Smith
  0 siblings, 0 replies; 6+ messages in thread
From: Dan Smith @ 2009-09-29 19:56 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Andrew Morton

>> +	if ((h->sock_common.family == AF_INET) &&
>> +	    (h->sock.state != TCP_LISTEN)) {
>> +		/* Temporary hack to enable restore of TCP_LISTEN sockets
>> +		 * while forcing anything else to a closed state
>> +		 */
>> +		sock->sk->sk_state = TCP_CLOSE;

SH> I'll admit by this point I'm a bit confused about the order in
SH> which these things are restored.  Does do_sock_restore() run
SH> before inet_restore() below (curious whether its check for
SH> TCP_CLOSE will catch these particular sockets).

It runs after inet_restore(), which was called above via
sock->ops->restore().  Thus, inet_restore() can largely ignore the
connected sockets and then they're forced closed here (since
inet_restore() runs before the common restore routine, otherwise it
could set them closed itself).

>> +int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>> +{
>> +	struct ckpt_hdr_socket_inet *in;
>> +	int ret = -EINVAL;
>> +
>> +       in = ckpt_hdr_get_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
>> +       if (!in)
>> +               goto out;

SH> Better to just return...  I don't think this'll cause a crash (it
SH> only makes ckpt_hdr_put() pass ((struct ckpt_hdr *)NULL)->len to
SH> _ckpt_hdr_put which then ignores it and does a safe kfree(NULL)),
SH> but it won't be safe if the implementation of the ckpt_hdr_*()
SH> calls end up changing.

Sure.  Also note my broken whitespace that came as a result of me
cutting patches into pieces.  I'll fix that too.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] [RFC] Add c/r support for listening INET sockets
       [not found] ` <1254164330-23072-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-29 19:50   ` Serge E. Hallyn
@ 2009-09-29 21:49   ` Oren Laadan
       [not found]     ` <4AC280D6.3020302-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
  2009-09-29 22:16   ` Oren Laadan
  2 siblings, 1 reply; 6+ messages in thread
From: Oren Laadan @ 2009-09-29 21:49 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Andrew Morton



Dan Smith wrote:
> This is an incremental step towards supporting checkpoint/restart on
> AF_INET sockets.  In this scenario, any sockets that were in TCP_LISTEN
> state are restored as they were.  Any that were connected are forced to
> TCP_CLOSE.  This should cover a range of use cases that involve
> applications that are tolerant of such an interruption.
> 

It's important to mention that pending connections - those that were
received and are awaiting to be accepted on the listening socket are
not handled either (does checkpoint fail or ignore them ?)

Oren.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] [RFC] Add c/r support for listening INET sockets
       [not found] ` <1254164330-23072-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-29 19:50   ` Serge E. Hallyn
  2009-09-29 21:49   ` Oren Laadan
@ 2009-09-29 22:16   ` Oren Laadan
  2 siblings, 0 replies; 6+ messages in thread
From: Oren Laadan @ 2009-09-29 22:16 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Andrew Morton



Dan Smith wrote:
> This is an incremental step towards supporting checkpoint/restart on
> AF_INET sockets.  In this scenario, any sockets that were in TCP_LISTEN
> state are restored as they were.  Any that were connected are forced to
> TCP_CLOSE.  This should cover a range of use cases that involve
> applications that are tolerant of such an interruption.
> 
> Cc: Oren Laadan <orenl-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
> Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---

Good stuff... a few comments below.

[...]

> +
> +static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int len;
> +	int ret;
> +	struct sk_buff *skb;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> +	if (len < 0) {
> +		ret = len;

Note: skb wasn't initialized yet, ret will be negative (more below)...

> +		goto out;
> +	} else if (len > SKB_MAX_ALLOC) {
> +		ckpt_debug("Socket buffer too big (%i > %lu)",
> +			   len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	skb = alloc_skb(len, GFP_KERNEL);
> +	if (!skb) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = ckpt_kread(ctx, skb_put(skb, len), len);
> +	if (ret < 0)
> +		goto out;
> +
> +	spin_lock(&queue->lock);
> +	skb_queue_tail(queue, skb);
> +	spin_unlock(&queue->lock);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	if (ret < 0)
> +		kfree_skb(skb);

skb may be uninitialized at this point (do I sound like gcc ?)


> +
> +	return ret;
> +}
> +
> +static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_queue *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		ret = inet_read_buffer(ctx, queue);
> +		ckpt_debug("read inet buffer %i: %i", i, ret);
> +		if (ret < 0)
> +			goto out;
> +
> +		if (ret > h->total_bytes) {
> +			ckpt_debug("Buffers exceeded claim");
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		h->total_bytes -= ret;
> +		ret = 0;
		^^^^^^^^
Unnecessary ?

> +	}
> +
> +	ret = h->skb_count;
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int inet_deferred_restore_buffers(void *data)
> +{
> +	struct dq_buffers *dq = (struct dq_buffers *)data;
> +	struct ckpt_ctx *ctx = dq->ctx;
> +	struct sock *sk = dq->sk;
> +	int ret;
> +
> +	ret = inet_read_buffers(ctx, &sk->sk_receive_queue);
> +	ckpt_debug("(R) inet_read_buffers: %i\n", ret);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = inet_read_buffers(ctx, &sk->sk_write_queue);
> +	ckpt_debug("(W) inet_read_buffers: %i\n", ret);
> +
> +	return ret;
> +}
> +
> +static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct dq_buffers dq;
> +
> +	dq.ctx = ctx;
> +	dq.sk = sk;
> +
> +	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
> +			      inet_deferred_restore_buffers, NULL);
> +}
> +
> +int inet_restore(struct ckpt_ctx *ctx,
> +		 struct socket *sock,
> +		 struct ckpt_hdr_socket *h)
> +{
> +	struct ckpt_hdr_socket_inet *in;
> +	int ret = 0;
> +
> +	in = ckpt_read_obj_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
> +	if (IS_ERR(in))
> +		return PTR_ERR(in);

I think you need to at least sanitize the in->laddr_len field - the
ops->bind() below assumes its valid.

> +
> +	/* Listening sockets and those that are closed but have a local
> +	 * address need to call bind()
> +	 */
> +	if ((h->sock.state == TCP_LISTEN) ||
> +	    ((h->sock.state == TCP_CLOSE) && (in->laddr_len > 0))) {
> +		sock->sk->sk_reuse = 2;
> +		inet_sk(sock->sk)->freebind = 1;
> +		ret = sock->ops->bind(sock,
> +				      (struct sockaddr *)&in->laddr,
> +				      in->laddr_len);
> +		ckpt_debug("inet bind: %i\n", ret);
> +		if (ret < 0)
> +			goto out;
> +
> +		if (h->sock.state == TCP_LISTEN) {
> +			ret = sock->ops->listen(sock, h->sock.backlog);
> +			ckpt_debug("inet listen: %i\n", ret);
> +			if (ret < 0)
> +				goto out;
> +		}
> +	} else {
> +		if (!sock_flag(sock->sk, SOCK_DEAD))
> +			ret = inet_defer_restore_buffers(ctx, sock->sk);
> +	}
> + out:
> +	ckpt_hdr_put(ctx, in);
> +
> +	return ret;
> + }
> +

Oren.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] [RFC] Add c/r support for listening INET sockets
       [not found]     ` <4AC280D6.3020302-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-30 14:20       ` Dan Smith
  0 siblings, 0 replies; 6+ messages in thread
From: Dan Smith @ 2009-09-30 14:20 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Andrew Morton

OL> It's important to mention that pending connections - those that
OL> were received and are awaiting to be accepted on the listening
OL> socket are not handled either (does checkpoint fail or ignore them
OL> ?)

It ignores them and presumably they'll get an RST after the restart
happens and claims no knowledge of the pending connection.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2009-09-30 14:20 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-28 18:58 [PATCH] [RFC] Add c/r support for listening INET sockets Dan Smith
     [not found] ` <1254164330-23072-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 19:50   ` Serge E. Hallyn
     [not found]     ` <20090929195025.GA14956-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 19:56       ` Dan Smith
2009-09-29 21:49   ` Oren Laadan
     [not found]     ` <4AC280D6.3020302-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-30 14:20       ` Dan Smith
2009-09-29 22:16   ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.