All of lore.kernel.org
 help / color / mirror / Atom feed
* Add support for connected INET sockets
@ 2009-11-17 15:26 Dan Smith
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-17 15:26 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5) Dan Smith
  0 siblings, 2 replies; 11+ messages in thread
From: Dan Smith @ 2009-11-17 15:26 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This is an update of the previous set to apply on top of v19-rc1 and to
address some of the trivial changes suggested on list.  Those are detailed
per-patch.  I will follow up with a patch to change ckpt_debug to ckpt_err
where appropriate, across the whole net/*/checkpoint.c set.  Since this
will apply to the unchanged gso patch as well, I decided to do this as a
follow-on cleanup patch.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2)
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-17 15:26   ` Dan Smith
       [not found]     ` <1258471590-29768-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-17 15:26   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Dan Smith @ 2009-11-17 15:26 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

The INET code often creates socket buffers by attaching fragments instead
of writing to the linear region.  This extends the skb write functions
to write out the linear and fragment regions of an skb, and adds a
function to be used by others wishing to restore an skb in the same way.
This also includes the header-mark-setting bits from a previous patch.

Changes in v2:
 - Change the length variables in ckpt_hdr_socket_buffer to u32
 - Check frag number against MAX_SKB_FRAGS

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint.h     |    1 +
 include/linux/checkpoint_hdr.h |   11 ++
 net/checkpoint.c               |  258 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 247 insertions(+), 23 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 550f6e8..0eff43e 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -104,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct socket *socket,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 667b7aa..787cf89 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -594,8 +594,19 @@ struct ckpt_hdr_socket_queue {
 
 struct ckpt_hdr_socket_buffer {
 	struct ckpt_hdr h;
+	__u32 transport_header;
+	__u32 network_header;
+	__u32 mac_header;
+	__u32 lin_len; /* Length of linear data */
+	__u32 frg_len; /* Length of fragment data */
+	__u32 skb_len; /* Length of skb (adjusted) */
+	__u32 hdr_len; /* Length of skipped header */
+	__u32 mac_len;
 	__s32 sk_objref;
 	__s32 pr_objref;
+	__u16 protocol;
+	__u16 nr_frags;
+	__u8 cb[48];
 };
 
 #define CKPT_UNIX_LINKED 1
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 7e8e137..49d9a2f 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -17,9 +17,11 @@
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/fs_struct.h>
+#include <linux/highmem.h>
 
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
+#include <net/tcp.h>
 
 #include <linux/deferqueue.h>
 #include <linux/checkpoint.h>
@@ -88,6 +90,238 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 	return -EAGAIN;
 }
 
+static void sock_record_header_info(struct sk_buff *skb,
+				    struct ckpt_hdr_socket_buffer *h)
+{
+
+	h->mac_len = skb->mac_len;
+	h->skb_len = skb->len;
+	h->hdr_len = skb->data - skb->head;
+	h->lin_len = (skb->tail - skb->head);
+	h->frg_len = skb->data_len;
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	h->transport_header = skb->transport_hdr;
+	h->network_header = skb->network_header;
+	h->mac_header = skb->mac_header;
+#else
+	h->transport_header = skb->transport_header - skb->head;
+	h->network_header = skb->network_header - skb->head;
+	h->mac_header = skb->mac_header - skb->head;
+#endif
+
+	memcpy(h->cb, skb->cb, sizeof(skb->cb));
+	h->nr_frags = skb_shinfo(skb)->nr_frags;
+}
+
+int sock_restore_header_info(struct sk_buff *skb,
+			     struct ckpt_hdr_socket_buffer *h)
+{
+	if (h->mac_header + h->mac_len != h->network_header) {
+		ckpt_debug("skb mac_header %u+%u != network header %u\n",
+			   h->mac_header, h->mac_len, h->network_header);
+		return -EINVAL;
+	}
+
+	if (h->network_header > h->lin_len) {
+		ckpt_debug("skb network header %u > linear length %u\n",
+			   h->network_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->transport_header > h->lin_len) {
+		ckpt_debug("skb transport header %u > linear length %u\n",
+			   h->transport_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->skb_len > SKB_MAX_ALLOC) {
+		ckpt_debug("skb total length %u larger than max of %lu\n",
+			   h->skb_len, SKB_MAX_ALLOC);
+		return -EINVAL;
+	}
+
+	skb_set_transport_header(skb, h->transport_header);
+	skb_set_network_header(skb, h->network_header);
+	skb_set_mac_header(skb, h->mac_header);
+	skb->mac_len = h->mac_len;
+
+	/* FIXME: This should probably be sanitized per-protocol to
+	 * make sure nothing bad happens if it is hijacked.  For the
+	 * current set of protocols that we restore this way, the data
+	 * contained within is not very risky (flags and sequence
+	 * numbers) but could still be evalutated from a
+	 * could-the-user- have-set-these-flags point of view.
+	 */
+	memcpy(skb->cb, h->cb, sizeof(skb->cb));
+
+	skb->data = skb->head + skb->hdr_len;
+	skb->len = h->skb_len;
+
+	return 0;
+}
+
+static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
+				 struct sk_buff *skb,
+				 int frag_idx)
+{
+	int ret = 0;
+	int fraglen;
+	struct page *page;
+	void *buf;
+
+	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (fraglen < 0)
+		return fraglen;
+
+	if (fraglen > PAGE_SIZE) {
+		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
+		return -EINVAL;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	buf = kmap(page);
+	ret = ckpt_kread(ctx, buf, fraglen);
+	kunmap(page);
+
+	if (ret) {
+		ckpt_debug("failed to read fragment: %i\n", ret);
+		ret = -EINVAL;
+		__free_page(page);
+	} else {
+		ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
+		skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
+	}
+
+	return ret < 0 ? ret : fraglen;
+}
+
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff *skb = NULL;
+	int i;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return (struct sk_buff *)h;
+
+	if (h->lin_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket linear buffer too big (%u > %lu)\n",
+			   h->lin_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	} else if (h->frg_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket frag size too big (%u > %lu\n",
+			   h->frg_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	} else if (h->nr_frags >= MAX_SKB_FRAGS) {
+		ckpt_debug("socket frag count too big (%u > %lu\n",
+			   h->nr_frags, MAX_SKB_FRAGS);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	skb = alloc_skb(h->lin_len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
+				  h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("read linear skb length %u: %i\n", h->lin_len, ret);
+	if (ret < 0) {
+		goto out;
+	}
+
+	for (i = 0; i < h->nr_frags; i++) {
+		ret = sock_restore_skb_frag(ctx, skb, i);
+		ckpt_debug("read skb frag %i/%i: %i\n",
+			   i + 1, h->nr_frags, ret);
+		if (ret < 0)
+			goto out;
+		h->frg_len -= ret;
+	}
+
+	if (h->frg_len != 0) {
+		ckpt_debug("length %u remaining after reading frags\n",
+			   h->frg_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sock_restore_header_info(skb, h);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0) {
+		kfree_skb(skb);
+		skb = ERR_PTR(ret);
+	}
+
+	return skb;
+}
+
+static int __sock_write_skb(struct ckpt_ctx *ctx,
+			    struct sk_buff *skb,
+			    int dst_objref)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (!h)
+		return -ENOMEM;
+
+	if (dst_objref > 0) {
+		BUG_ON(!skb->sk);
+		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto out;
+		h->sk_objref = ret;
+		h->pr_objref = dst_objref;
+	}
+
+	sock_record_header_info(skb, h);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("writing skb linear region %u: %i\n", h->lin_len, ret);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		u8 *vaddr = kmap(frag->page);
+
+		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
+			   i + 1, h->nr_frags, frag->size);
+		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
+					  frag->size, CKPT_HDR_BUFFER);
+		kunmap(frag->page);
+		h->frg_len -= frag->size;
+		if (ret < 0)
+			goto out;
+	}
+
+	WARN_ON(h->frg_len != 0);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
 				int dst_objref)
@@ -95,13 +329,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 	struct sk_buff *skb;
 
 	skb_queue_walk(queue, skb) {
-		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
 		if (UNIXCB(skb).fp) {
 			ckpt_err(ctx, -EBUSY, "%(T)af_unix: pass fd\n");
 			return -EBUSY;
@@ -113,25 +342,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		 * because we don't save out (or restore) the control
 		 * information contained in the skb.
 		 */
-		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
-		if (!h)
-			return -ENOMEM;
-
-		BUG_ON(!skb->sk);
-		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
-		if (ret < 0)
-			goto end;
-		h->sk_objref = ret;
-		h->pr_objref = dst_objref;
-
-		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
-		if (ret < 0)
-			goto end;
 
-		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
-					  CKPT_HDR_BUFFER);
-	end:
-		ckpt_hdr_put(ctx, h);
+		ret = __sock_write_skb(ctx, skb, dst_objref);
 		if (ret < 0)
 			return ret;
 	}
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5)
  2009-11-17 15:26 Add support for connected INET sockets Dan Smith
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-17 15:26 ` Dan Smith
  2009-11-18  4:34   ` Serge E. Hallyn
  1 sibling, 1 reply; 11+ messages in thread
From: Dan Smith @ 2009-11-17 15:26 UTC (permalink / raw)
  To: containers; +Cc: orenl, netdev

This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures is saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping.

We probably need comments from the netdev people about the quality of
sanity checking we do on the values in the ckpt_hdr_socket_inet
structure on restart.

Note that this still doesn't address lingering sockets yet.

Changes in v5:
 - Change ckpt_write_err() to ckpt_err()

Changes in v4:
 - Use the new socket buffer restore functions introduced in the
   previous patch
 - Move listen_sockets list under the restart items in ckpt_ctx
 - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET

Changes in v3:
 - Prevent restart from allowing a bind on a <1024 port unless the
   user is granted that capability
 - Add some sanity checking in the inet_precheck() function to make sure
   the values read from the checkpoint image are within acceptable ranges
 - Check the result of sock_restore_header_info() and fail if needed

Changes in v2:
 - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
   structure instead of saving them separately
 - Fix 'sock' naming in sock_cptrst()
 - Don't take the queue lock before skb_queue_tail() since it is
   done for us
 - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
   flag is specified on sys_restart()
 - Pull the implementation of the list of listening sockets back into
   this patch
 - Fix dangling printk
 - Add some comments around the parent/child restore logic

Cc: netdev@vger.kernel.org
Acked-by: Oren Laadan <orenl@librato.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint.h       |    5 +-
 include/linux/checkpoint_hdr.h   |   95 +++++++++
 include/linux/checkpoint_types.h |    1 +
 net/checkpoint.c                 |   27 ++--
 net/ipv4/checkpoint.c            |  391 ++++++++++++++++++++++++++++++++++----
 6 files changed, 473 insertions(+), 50 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 9f9e825..baed891 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -244,6 +244,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	kfree(ctx->pids_arr);
 
+	sock_listening_list_free(&ctx->listen_sockets);
+
 	kfree(ctx);
 }
 
@@ -274,6 +276,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 
 	mutex_init(&ctx->msg_mutex);
 
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 0eff43e..ddc9aa0 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -20,6 +20,7 @@
 #define RESTART_FROZEN		0x2
 #define RESTART_GHOST		0x4
 #define RESTART_KEEP_LSM	0x8
+#define RESTART_CONN_RESET      0x10
 
 /* misc user visible */
 #define CHECKPOINT_FD_NONE	-1
@@ -53,7 +54,8 @@
 	(RESTART_TASKSELF | \
 	 RESTART_FROZEN | \
 	 RESTART_KEEP_LSM | \
-	 RESTART_GHOST)
+	 RESTART_GHOST | \
+	 RESTART_CONN_RESET)
 #define CKPT_LSM_INFO_LEN 200
 #define CKPT_LSM_STRING_MAX 1024
 
@@ -105,6 +107,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
 struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
+void sock_listening_list_free(struct list_head *head);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 787cf89..d1a93e3 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -15,6 +15,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/types.h>
 #include <linux/types.h>
@@ -625,6 +626,100 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
 	struct ckpt_hdr h;
+	__u32 daddr;
+	__u32 rcv_saddr;
+	__u32 saddr;
+	__u16 dport;
+	__u16 num;
+	__u16 sport;
+	__s16 uc_ttl;
+	__u16 cmsg_flags;
+
+	struct {
+		__u64 timeout;
+		__u32 ato;
+		__u32 lrcvtime;
+		__u16 last_seg_size;
+		__u16 rcv_mss;
+		__u8 pending;
+		__u8 quick;
+		__u8 pingpong;
+		__u8 blocked;
+	} icsk_ack __attribute__ ((aligned(8)));
+
+	/* FIXME: Skipped opt, tos, multicast, cork settings */
+
+	struct {
+		__u32 rcv_nxt;
+		__u32 copied_seq;
+		__u32 rcv_wup;
+		__u32 snd_nxt;
+		__u32 snd_una;
+		__u32 snd_sml;
+		__u32 rcv_tstamp;
+		__u32 lsndtime;
+
+		__u32 snd_wl1;
+		__u32 snd_wnd;
+		__u32 max_window;
+		__u32 mss_cache;
+		__u32 window_clamp;
+		__u32 rcv_ssthresh;
+		__u32 frto_highmark;
+
+		__u32 srtt;
+		__u32 mdev;
+		__u32 mdev_max;
+		__u32 rttvar;
+		__u32 rtt_seq;
+
+		__u32 packets_out;
+		__u32 retrans_out;
+
+		__u32 snd_up;
+		__u32 rcv_wnd;
+		__u32 write_seq;
+		__u32 pushed_seq;
+		__u32 lost_out;
+		__u32 sacked_out;
+		__u32 fackets_out;
+		__u32 tso_deferred;
+		__u32 bytes_acked;
+
+		__s32 lost_cnt_hint;
+		__u32 retransmit_high;
+
+		__u32 lost_retrans_low;
+
+		__u32 prior_ssthresh;
+		__u32 high_seq;
+
+		__u32 retrans_stamp;
+		__u32 undo_marker;
+		__s32 undo_retrans;
+		__u32 total_retrans;
+
+		__u32 urg_seq;
+		__u32 keepalive_time;
+		__u32 keepalive_intvl;
+
+		__u16 urg_data;
+		__u16 advmss;
+		__u8 frto_counter;
+		__u8 nonagle;
+
+		__u8 ecn_flags;
+		__u8 reordering;
+
+		__u8 keepalive_probes;
+	} tcp __attribute__ ((aligned(8)));
+
+	struct {
+		struct in6_addr saddr;
+		struct in6_addr rcv_saddr;
+		struct in6_addr daddr;
+	} inet6 __attribute__ ((aligned(8)));
+
 	__u32 laddr_len;
 	__u32 raddr_len;
 	struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 77f8592..79c9c09 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -82,6 +82,7 @@ struct ckpt_ctx {
 	wait_queue_head_t waitq;	/* waitqueue for restarting tasks */
 	wait_queue_head_t ghostq;	/* waitqueue for ghost tasks */
 	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
+	struct list_head listen_sockets;/* listening parent sockets */
 
 	struct ckpt_stats stats;	/* statistics */
 
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 49d9a2f..aba1497 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -324,6 +324,7 @@ static int __sock_write_skb(struct ckpt_ctx *ctx,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
+				uint16_t family,
 				int dst_objref)
 {
 	struct sk_buff *skb;
@@ -336,11 +337,11 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 			return -EBUSY;
 		}
 
-		/* The other ancillary messages are always present
-		 * unlike descriptors.  Even though we can't detect
-		 * them and fail the checkpoint, we're not at risk
-		 * because we don't save out (or restore) the control
-		 * information contained in the skb.
+		/* The other ancillary messages UNIX are always
+		 * present unlike descriptors.  Even though we can't
+		 * detect them and fail the checkpoint, we're not at
+		 * risk because we don't restore the control
+		 * information in the UNIX code.
 		 */
 
 		ret = __sock_write_skb(ctx, skb, dst_objref);
@@ -353,6 +354,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
 			      struct sk_buff_head *queue,
+			      uint16_t family,
 			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
@@ -372,7 +374,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -394,12 +396,14 @@ int sock_deferred_write_buffers(void *data)
 		return dst_objref;
 	}
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write recv buffers: %i\n", ret);
 	if (ret < 0)
 		return ret;
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write send buffers: %i\n", ret);
 
 	return ret;
@@ -924,10 +928,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 		goto err;
 
 	if ((h->sock_common.family == AF_INET) &&
-	    (h->sock.state != TCP_LISTEN)) {
-		/* Temporary hack to enable restore of TCP_LISTEN sockets
-		 * while forcing anything else to a closed state
-		 */
+	    (h->sock.state != TCP_LISTEN) &&
+	    (ctx->uflags & RESTART_CONN_RESET)) {
+		ckpt_debug("Forcing open socket closed\n");
 		sock->sk->sk_state = TCP_CLOSE;
 		sock->state = SS_UNCONNECTED;
 	}
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..3e20cc9 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
 	struct ckpt_ctx *ctx;
@@ -28,6 +29,236 @@ struct dq_buffers {
 	struct sock *sk;
 };
 
+struct listen_item {
+	struct sock *sk;
+	struct list_head list;
+};
+
+void sock_listening_list_free(struct list_head *head)
+{
+	struct listen_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->sk = sk;
+	list_add(&item->list, &ctx->listen_sockets);
+
+	return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	list_for_each_entry(item, &ctx->listen_sockets, list) {
+		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
+			return item->sk;
+	}
+
+	return NULL;
+}
+
+static int sock_hash_parent(void *data)
+{
+	struct dq_sock *dq = (struct dq_sock *)data;
+	struct sock *parent;
+
+	ckpt_debug("INET post-restart hash\n");
+
+	dq->sk->sk_prot->hash(dq->sk);
+
+	/* If there is a listening socket with the same source port,
+	 * then become a child of that socket [we are the result of an
+	 * accept()].  Otherwise hash ourselves directly in [we are
+	 * the result of a connect()]
+	 */
+
+	parent = sock_get_parent(dq->ctx, dq->sk);
+	if (parent) {
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+		local_bh_disable();
+		__inet_inherit_port(parent, dq->sk);
+		local_bh_enable();
+	} else {
+		inet_sk(dq->sk)->num = 0;
+		inet_hash_connect(&tcp_death_row, dq->sk);
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+	}
+
+	return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct dq_sock dq;
+
+	dq.sk = sock;
+	dq.ctx = ctx;
+
+	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+			      sock_hash_parent, NULL);
+}
+
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+				struct tcp_sock *sk,
+				struct ckpt_hdr_socket_inet *hh,
+				int op)
+{
+	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+	if (!skb_queue_empty(&sk->ucopy.prequeue))
+		printk("PREQUEUE!\n");
+
+	return 0;
+}
+
+static int sock_inet_restore_addrs(struct inet_sock *inet,
+				   struct ckpt_hdr_socket_inet *hh)
+{
+	inet->daddr = hh->raddr.sin_addr.s_addr;
+	inet->saddr = hh->laddr.sin_addr.s_addr;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = hh->raddr.sin_port;
+	inet->sport = hh->laddr.sin_port;
+
+	return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+			    struct sock *sk,
+			    struct ckpt_hdr_socket_inet *hh,
+			    int op)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret;
+
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, hh->daddr, inet->daddr);
+		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
+		CKPT_COPY(op, hh->dport, inet->dport);
+		CKPT_COPY(op, hh->saddr, inet->saddr);
+		CKPT_COPY(op, hh->sport, inet->sport);
+	} else {
+		ret = sock_inet_restore_addrs(inet, hh);
+		if (ret)
+			return ret;
+	}
+
+	CKPT_COPY(op, hh->num, inet->num);
+	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
+	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
+
+	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+	CKPT_COPY(op,
+		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
+	else if (sk->sk_protocol == IPPROTO_UDP)
+		ret = 0;
+	else {
+		ret = -EINVAL;
+		ckpt_err(ctx, ret, "unknown socket protocol %d",
+			 sk->sk_protocol);
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *inet6 = inet6_sk(sk);
+		if (op == CKPT_CPT) {
+			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+		} else {
+			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+		}
+	}
+
+	return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct ckpt_hdr_socket_inet *in;
@@ -43,6 +274,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	if (ret)
 		goto out;
 
+	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+	if (ret < 0)
+		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
 	ckpt_hdr_put(ctx, in);
@@ -55,51 +290,22 @@ int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
 	return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
 }
 
-static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int inet_read_buffer(struct ckpt_ctx *ctx,
+			    struct sk_buff_head *queue)
 {
-	struct ckpt_hdr_socket_buffer *h;
-	int len;
-	int ret;
 	struct sk_buff *skb = NULL;
 
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
-	if (len < 0) {
-		ret = len;
-		goto out;
-	} else if (len > SKB_MAX_ALLOC) {
-		ckpt_debug("Socket buffer too big (%i > %lu)",
-			   len, SKB_MAX_ALLOC);
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	skb = alloc_skb(len, GFP_KERNEL);
-	if (!skb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ckpt_kread(ctx, skb_put(skb, len), len);
-	if (ret < 0)
-		goto out;
+	skb = sock_restore_skb(ctx);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
 
-	spin_lock(&queue->lock);
 	skb_queue_tail(queue, skb);
-	spin_unlock(&queue->lock);
- out:
-	ckpt_hdr_put(ctx, h);
-
-	if ((ret < 0) && skb)
-		kfree_skb(skb);
 
-	return ret;
+	return skb->len;
 }
 
-static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int inet_read_buffers(struct ckpt_ctx *ctx,
+			     struct sk_buff_head *queue)
 {
 	struct ckpt_hdr_socket_queue *h;
 	int ret = 0;
@@ -162,6 +368,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
 
 static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 {
+	__u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
+		ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
+	__u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
+	__u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
+	__u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
+
+	if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
+	    !capable(CAP_NET_BIND_SERVICE)) {
+		ckpt_debug("unable to bind to port %hu\n",
+			   htons(in->laddr.sin_port));
+		return -EINVAL;
+	}
+
 	if (in->laddr_len > sizeof(struct sockaddr_in)) {
 		ckpt_debug("laddr_len is too big\n");
 		return -EINVAL;
@@ -172,6 +391,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 		return -EINVAL;
 	}
 
+	/* Set ato to the default */
+	in->icsk_ack.ato = TCP_ATO_MIN;
+
+	/* No quick acks are scheduled after a restart */
+	in->icsk_ack.quick = 0;
+
+	if (in->icsk_ack.pending & ~icsk_ack_mask) {
+		ckpt_debug("invalid pending flags 0x%x\n",
+			   in->icsk_ack.pending & ~icsk_ack_mask);
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.pingpong > 1) {
+		ckpt_debug("invalid icsk_ack.pingpong value\n");
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.blocked > 1) {
+		ckpt_debug("invalid icsk_ack.blocked value\n");
+		return -EINVAL;
+	}
+
+	/* do_tcp_setsockopt() quietly makes this coercion */
+	if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
+		in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
+	else if (in->tcp.window_clamp > 65535U) {
+		ckpt_debug("invalid window_clamp value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
+		in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
+
+	/* These will all be recalculated on the next call to
+	 * tcp_rtt_estimator()
+	 */
+	in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
+	in->tcp.rttvar = in->tcp.rtt_seq = 0;
+
+	/* Might want to set packets_out to zero ? */
+
+	if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
+		in->tcp.rcv_wnd = MAX_TCP_WINDOW;
+
+	if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
+		ckpt_debug("keepalive_intvl %i out of range\n",
+			   in->tcp.keepalive_intvl);
+		return -EINVAL;
+	}
+
+	if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
+		ckpt_debug("Invalid keepalive_probes value %i\n",
+			   in->tcp.keepalive_probes);
+		return -EINVAL;
+	}
+
+	if (in->tcp.urg_data & ~urg_mask) {
+		ckpt_debug("Invalid urg_data value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.nonagle & ~nonagle_mask) {
+		ckpt_debug("Invalid nonagle value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.ecn_flags & ~ecn_mask) {
+		ckpt_debug("Invalid ecn_flags value\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -209,8 +499,35 @@ int inet_restore(struct ckpt_ctx *ctx,
 			ckpt_debug("inet listen: %i\n", ret);
 			if (ret < 0)
 				goto out;
+
+			/* We are a listening socket, so add ourselves
+			 * to the list of parent sockets.  This will
+			 * allow our children to find us later and
+			 * link up
+			 */
+
+			ret = sock_listening_list_add(ctx, sock->sk);
+			if (ret < 0)
+				goto out;
 		}
 	} else {
+		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+		if (ret)
+			goto out;
+
+		if ((h->sock.state == TCP_ESTABLISHED) &&
+		    (h->sock.protocol == IPPROTO_TCP)) {
+			/* A connected socket that was spawned from an
+			 * accept() needs to be hashed with its parent
+			 * listening socket in order to receive
+			 * traffic on the original port.  Since we may
+			 * not have restarted the parent yet, we defer
+			 * this until later when we know we have all
+			 * the listening sockets accounted for.
+			 */
+			ret = sock_defer_hash(ctx, sock->sk);
+		}
+
 		if (!sock_flag(sock->sk, SOCK_DEAD))
 			ret = inet_defer_restore_buffers(ctx, sock->sk);
 	}
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-17 15:26   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2) Dan Smith
@ 2009-11-17 15:26   ` Dan Smith
       [not found]     ` <1258471590-29768-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-17 15:26   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
  2009-11-25 18:51   ` Add support for connected INET sockets Oren Laadan
  3 siblings, 1 reply; 11+ messages in thread
From: Dan Smith @ 2009-11-17 15:26 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 net/unix/checkpoint.c |   54 ++++++++++++++++++++++++++----------------------
 1 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
index 749673d..c6a1b27 100644
--- a/net/unix/checkpoint.c
+++ b/net/unix/checkpoint.c
@@ -201,7 +201,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 	uint8_t peer_shutdown = 0;
 	void *buf = NULL;
 	int sndbuf;
-	int len;
 	int ret = 0;
 
 	memset(&msg, 0, sizeof(msg));
@@ -210,14 +209,30 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 
-	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
-	if (len < 0) {
-		ret = len;
+	if (h->lin_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket buffer too big (%u > %lu)\n",
+			   h->lin_len, SKB_MAX_ALLOC);
+		ret = -EINVAL;
+		goto out;
+	} else if (h->nr_frags != 0) {
+		ckpt_debug("unix socket claims to have fragments\n");
+		ret = -EINVAL;
 		goto out;
-	} else if (len > SKB_MAX_ALLOC) {
-		ckpt_debug("Socket buffer too big (%i > %lu)",
-			   len, SKB_MAX_ALLOC);
-		ret = -ENOSPC;
+	}
+
+	buf = kmalloc(h->lin_len, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	kvec.iov_len = h->lin_len;
+	kvec.iov_base = buf;
+	ret = _ckpt_read_obj_type(ctx, kvec.iov_base,
+				  h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("read unix socket buffer %u: %i\n", h->lin_len, ret);
+	if (ret < h->lin_len) {
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -246,18 +261,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 		}
 	}
 
-	kvec.iov_len = len;
-	buf = kmalloc(len, GFP_KERNEL);
-	kvec.iov_base = buf;
-	if (!buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ckpt_kread(ctx, kvec.iov_base, len);
-	if (ret < 0)
-		goto out;
-
 	msg.msg_name = addr;
 	msg.msg_namelen = addrlen;
 
@@ -273,15 +276,16 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 
 	/* Make sure there's room in the send buffer */
 	sndbuf = sk->sk_sndbuf;
-	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
+	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
 	    capable(CAP_NET_ADMIN))
-		sk->sk_sndbuf += len;
+		sk->sk_sndbuf += h->lin_len;
 	else
 		sk->sk_sndbuf = sysctl_wmem_max;
 
-	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, len);
-	ckpt_debug("kernel_sendmsg(%i,%i): %i\n", h->sk_objref, len, ret);
-	if ((ret > 0) && (ret != len))
+	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, h->lin_len);
+	ckpt_debug("kernel_sendmsg(%i,%u): %i\n",
+		   h->sk_objref, h->lin_len, ret);
+	if ((ret > 0) && (ret != h->lin_len))
 		ret = -ENOMEM;
 
 	sk->sk_sndbuf = sndbuf;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/4] Add some content to the readme.txt for socket c/r
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-17 15:26   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2) Dan Smith
  2009-11-17 15:26   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
@ 2009-11-17 15:26   ` Dan Smith
  2009-11-25 18:51   ` Add support for connected INET sockets Oren Laadan
  3 siblings, 0 replies; 11+ messages in thread
From: Dan Smith @ 2009-11-17 15:26 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 Documentation/checkpoint/readme.txt |   21 +++++++++++++++++++++
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
index 0553d90..d74c144 100644
--- a/Documentation/checkpoint/readme.txt
+++ b/Documentation/checkpoint/readme.txt
@@ -365,6 +365,27 @@ It should always be safe to take a checkpoint of an application
 under LSM_1, and restart it without the RESTART_KEEP_LSM flag
 under LSM_2.
 
+Sockets
+=======
+
+For AF_UNIX sockets, both endpoints must be within the checkpointed
+task set to maintain a connected state after restart.  UNIX sockets
+that are in the process of passing a descriptor will cause the
+checkpoint to fail with -EBUSY indicating a transient state that
+cannot be checkpointed.  Listening sockets with an unaccepted peer
+will also cause an -EBUSY result.
+
+AF_INET sockets with endpoints outside the checkpointed task set may
+remain open if care is taken to avoid TCP timeouts and resets.
+Careful use of a virtual IP address can help avoid emission of an RST
+to the non-checkpointed endpoint.  If desired, the
+RESTART_SOCK_LISTENONLY flag may be passed to the restart syscall
+which will cause all connected AF_INET sockets to be closed during the
+restore process.  Listening sockets will still be restored to their
+original state, which makes this mode a candidate for something like
+an HTTP server.
+
+
 Kernel interfaces
 =================
 
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2)
       [not found]     ` <1258471590-29768-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-18  4:34       ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-11-18  4:34 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> The INET code often creates socket buffers by attaching fragments instead
> of writing to the linear region.  This extends the skb write functions
> to write out the linear and fragment regions of an skb, and adds a
> function to be used by others wishing to restore an skb in the same way.
> This also includes the header-mark-setting bits from a previous patch.
> 
> Changes in v2:
>  - Change the length variables in ckpt_hdr_socket_buffer to u32
>  - Check frag number against MAX_SKB_FRAGS
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

> ---
>  include/linux/checkpoint.h     |    1 +
>  include/linux/checkpoint_hdr.h |   11 ++
>  net/checkpoint.c               |  258 ++++++++++++++++++++++++++++++++++++----
>  3 files changed, 247 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 550f6e8..0eff43e 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -104,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>  			      struct socket *socket,
>  			      struct sockaddr *loc, unsigned *loc_len,
>  			      struct sockaddr *rem, unsigned *rem_len);
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
> 
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 667b7aa..787cf89 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -594,8 +594,19 @@ struct ckpt_hdr_socket_queue {
> 
>  struct ckpt_hdr_socket_buffer {
>  	struct ckpt_hdr h;
> +	__u32 transport_header;
> +	__u32 network_header;
> +	__u32 mac_header;
> +	__u32 lin_len; /* Length of linear data */
> +	__u32 frg_len; /* Length of fragment data */
> +	__u32 skb_len; /* Length of skb (adjusted) */
> +	__u32 hdr_len; /* Length of skipped header */
> +	__u32 mac_len;
>  	__s32 sk_objref;
>  	__s32 pr_objref;
> +	__u16 protocol;
> +	__u16 nr_frags;
> +	__u8 cb[48];
>  };
> 
>  #define CKPT_UNIX_LINKED 1
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index 7e8e137..49d9a2f 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -17,9 +17,11 @@
>  #include <linux/syscalls.h>
>  #include <linux/sched.h>
>  #include <linux/fs_struct.h>
> +#include <linux/highmem.h>
> 
>  #include <net/af_unix.h>
>  #include <net/tcp_states.h>
> +#include <net/tcp.h>
> 
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
> @@ -88,6 +90,238 @@ static int sock_copy_buffers(struct sk_buff_head *from,
>  	return -EAGAIN;
>  }
> 
> +static void sock_record_header_info(struct sk_buff *skb,
> +				    struct ckpt_hdr_socket_buffer *h)
> +{
> +
> +	h->mac_len = skb->mac_len;
> +	h->skb_len = skb->len;
> +	h->hdr_len = skb->data - skb->head;
> +	h->lin_len = (skb->tail - skb->head);
> +	h->frg_len = skb->data_len;
> +
> +#ifdef NET_SKBUFF_DATA_USES_OFFSET
> +	h->transport_header = skb->transport_hdr;
> +	h->network_header = skb->network_header;
> +	h->mac_header = skb->mac_header;
> +#else
> +	h->transport_header = skb->transport_header - skb->head;
> +	h->network_header = skb->network_header - skb->head;
> +	h->mac_header = skb->mac_header - skb->head;
> +#endif
> +
> +	memcpy(h->cb, skb->cb, sizeof(skb->cb));
> +	h->nr_frags = skb_shinfo(skb)->nr_frags;
> +}
> +
> +int sock_restore_header_info(struct sk_buff *skb,
> +			     struct ckpt_hdr_socket_buffer *h)
> +{
> +	if (h->mac_header + h->mac_len != h->network_header) {
> +		ckpt_debug("skb mac_header %u+%u != network header %u\n",
> +			   h->mac_header, h->mac_len, h->network_header);
> +		return -EINVAL;
> +	}
> +
> +	if (h->network_header > h->lin_len) {
> +		ckpt_debug("skb network header %u > linear length %u\n",
> +			   h->network_header, h->lin_len);
> +		return -EINVAL;
> +	}
> +
> +	if (h->transport_header > h->lin_len) {
> +		ckpt_debug("skb transport header %u > linear length %u\n",
> +			   h->transport_header, h->lin_len);
> +		return -EINVAL;
> +	}
> +
> +	if (h->skb_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("skb total length %u larger than max of %lu\n",
> +			   h->skb_len, SKB_MAX_ALLOC);
> +		return -EINVAL;
> +	}
> +
> +	skb_set_transport_header(skb, h->transport_header);
> +	skb_set_network_header(skb, h->network_header);
> +	skb_set_mac_header(skb, h->mac_header);
> +	skb->mac_len = h->mac_len;
> +
> +	/* FIXME: This should probably be sanitized per-protocol to
> +	 * make sure nothing bad happens if it is hijacked.  For the
> +	 * current set of protocols that we restore this way, the data
> +	 * contained within is not very risky (flags and sequence
> +	 * numbers) but could still be evalutated from a
> +	 * could-the-user- have-set-these-flags point of view.
> +	 */
> +	memcpy(skb->cb, h->cb, sizeof(skb->cb));
> +
> +	skb->data = skb->head + skb->hdr_len;
> +	skb->len = h->skb_len;
> +
> +	return 0;
> +}
> +
> +static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
> +				 struct sk_buff *skb,
> +				 int frag_idx)
> +{
> +	int ret = 0;
> +	int fraglen;
> +	struct page *page;
> +	void *buf;
> +
> +	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> +	if (fraglen < 0)
> +		return fraglen;
> +
> +	if (fraglen > PAGE_SIZE) {
> +		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
> +		return -EINVAL;
> +	}
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	buf = kmap(page);
> +	ret = ckpt_kread(ctx, buf, fraglen);
> +	kunmap(page);
> +
> +	if (ret) {
> +		ckpt_debug("failed to read fragment: %i\n", ret);
> +		ret = -EINVAL;
> +		__free_page(page);
> +	} else {
> +		ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
> +		skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
> +	}
> +
> +	return ret < 0 ? ret : fraglen;
> +}
> +
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff *skb = NULL;
> +	int i;
> +	int ret = 0;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return (struct sk_buff *)h;
> +
> +	if (h->lin_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket linear buffer too big (%u > %lu)\n",
> +			   h->lin_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	} else if (h->frg_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket frag size too big (%u > %lu\n",
> +			   h->frg_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	} else if (h->nr_frags >= MAX_SKB_FRAGS) {
> +		ckpt_debug("socket frag count too big (%u > %lu\n",
> +			   h->nr_frags, MAX_SKB_FRAGS);
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	skb = alloc_skb(h->lin_len, GFP_KERNEL);
> +	if (!skb) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
> +				  h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("read linear skb length %u: %i\n", h->lin_len, ret);
> +	if (ret < 0) {
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->nr_frags; i++) {
> +		ret = sock_restore_skb_frag(ctx, skb, i);
> +		ckpt_debug("read skb frag %i/%i: %i\n",
> +			   i + 1, h->nr_frags, ret);
> +		if (ret < 0)
> +			goto out;
> +		h->frg_len -= ret;
> +	}
> +
> +	if (h->frg_len != 0) {
> +		ckpt_debug("length %u remaining after reading frags\n",
> +			   h->frg_len);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	sock_restore_header_info(skb, h);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	if (ret < 0) {
> +		kfree_skb(skb);
> +		skb = ERR_PTR(ret);
> +	}
> +
> +	return skb;
> +}
> +
> +static int __sock_write_skb(struct ckpt_ctx *ctx,
> +			    struct sk_buff *skb,
> +			    int dst_objref)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	if (dst_objref > 0) {
> +		BUG_ON(!skb->sk);
> +		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
> +		if (ret < 0)
> +			goto out;
> +		h->sk_objref = ret;
> +		h->pr_objref = dst_objref;
> +	}
> +
> +	sock_record_header_info(skb, h);
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("writing skb linear region %u: %i\n", h->lin_len, ret);
> +	if (ret < 0)
> +		goto out;
> +
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +		u8 *vaddr = kmap(frag->page);
> +
> +		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
> +			   i + 1, h->nr_frags, frag->size);
> +		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
> +					  frag->size, CKPT_HDR_BUFFER);
> +		kunmap(frag->page);
> +		h->frg_len -= frag->size;
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	WARN_ON(h->frg_len != 0);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
>  static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  				struct sk_buff_head *queue,
>  				int dst_objref)
> @@ -95,13 +329,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  	struct sk_buff *skb;
> 
>  	skb_queue_walk(queue, skb) {
> -		struct ckpt_hdr_socket_buffer *h;
>  		int ret = 0;
> 
> -		/* FIXME: This could be a false positive for non-unix
> -		 *        buffers, so add a type check here in the
> -		 *        future
> -		 */
>  		if (UNIXCB(skb).fp) {
>  			ckpt_err(ctx, -EBUSY, "%(T)af_unix: pass fd\n");
>  			return -EBUSY;
> @@ -113,25 +342,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  		 * because we don't save out (or restore) the control
>  		 * information contained in the skb.
>  		 */
> -		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> -		if (!h)
> -			return -ENOMEM;
> -
> -		BUG_ON(!skb->sk);
> -		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
> -		if (ret < 0)
> -			goto end;
> -		h->sk_objref = ret;
> -		h->pr_objref = dst_objref;
> -
> -		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> -		if (ret < 0)
> -			goto end;
> 
> -		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> -					  CKPT_HDR_BUFFER);
> -	end:
> -		ckpt_hdr_put(ctx, h);
> +		ret = __sock_write_skb(ctx, skb, dst_objref);
>  		if (ret < 0)
>  			return ret;
>  	}
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5)
  2009-11-17 15:26 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5) Dan Smith
@ 2009-11-18  4:34   ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-11-18  4:34 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers, netdev

Quoting Dan Smith (danms@us.ibm.com):
> This patch adds basic support for C/R of open INET sockets.  I think that
> all the important bits of the TCP and ICSK socket structures is saved,
> but I think there is still some additional IPv6 stuff that needs to be
> handled.
> 
> With this patch applied, the following script can be used to demonstrate
> the functionality:
> 
>   https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html
> 
> It shows that this enables migration of a sendmail process with open
> connections from one machine to another without dropping.
> 
> We probably need comments from the netdev people about the quality of
> sanity checking we do on the values in the ckpt_hdr_socket_inet
> structure on restart.
> 
> Note that this still doesn't address lingering sockets yet.
> 
> Changes in v5:
>  - Change ckpt_write_err() to ckpt_err()
> 
> Changes in v4:
>  - Use the new socket buffer restore functions introduced in the
>    previous patch
>  - Move listen_sockets list under the restart items in ckpt_ctx
>  - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET
> 
> Changes in v3:
>  - Prevent restart from allowing a bind on a <1024 port unless the
>    user is granted that capability
>  - Add some sanity checking in the inet_precheck() function to make sure
>    the values read from the checkpoint image are within acceptable ranges
>  - Check the result of sock_restore_header_info() and fail if needed
> 
> Changes in v2:
>  - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
>    structure instead of saving them separately
>  - Fix 'sock' naming in sock_cptrst()
>  - Don't take the queue lock before skb_queue_tail() since it is
>    done for us
>  - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
>    flag is specified on sys_restart()
>  - Pull the implementation of the list of listening sockets back into
>    this patch
>  - Fix dangling printk
>  - Add some comments around the parent/child restore logic
> 
> Cc: netdev@vger.kernel.org
> Acked-by: Oren Laadan <orenl@librato.com>
> Signed-off-by: Dan Smith <danms@us.ibm.com>

Acked-by: Serge Hallyn <serue@us.ibm.com>

> ---
>  checkpoint/sys.c                 |    4 +
>  include/linux/checkpoint.h       |    5 +-
>  include/linux/checkpoint_hdr.h   |   95 +++++++++
>  include/linux/checkpoint_types.h |    1 +
>  net/checkpoint.c                 |   27 ++--
>  net/ipv4/checkpoint.c            |  391 ++++++++++++++++++++++++++++++++++----
>  6 files changed, 473 insertions(+), 50 deletions(-)
> 
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9f9e825..baed891 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -244,6 +244,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
> 
>  	kfree(ctx->pids_arr);
> 
> +	sock_listening_list_free(&ctx->listen_sockets);
> +
>  	kfree(ctx);
>  }
> 
> @@ -274,6 +276,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
> 
>  	mutex_init(&ctx->msg_mutex);
> 
> +	INIT_LIST_HEAD(&ctx->listen_sockets);
> +
>  	err = -EBADF;
>  	ctx->file = fget(fd);
>  	if (!ctx->file)
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 0eff43e..ddc9aa0 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -20,6 +20,7 @@
>  #define RESTART_FROZEN		0x2
>  #define RESTART_GHOST		0x4
>  #define RESTART_KEEP_LSM	0x8
> +#define RESTART_CONN_RESET      0x10
> 
>  /* misc user visible */
>  #define CHECKPOINT_FD_NONE	-1
> @@ -53,7 +54,8 @@
>  	(RESTART_TASKSELF | \
>  	 RESTART_FROZEN | \
>  	 RESTART_KEEP_LSM | \
> -	 RESTART_GHOST)
> +	 RESTART_GHOST | \
> +	 RESTART_CONN_RESET)
>  #define CKPT_LSM_INFO_LEN 200
>  #define CKPT_LSM_STRING_MAX 1024
> 
> @@ -105,6 +107,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>  			      struct sockaddr *loc, unsigned *loc_len,
>  			      struct sockaddr *rem, unsigned *rem_len);
>  struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
> +void sock_listening_list_free(struct list_head *head);
> 
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 787cf89..d1a93e3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -15,6 +15,7 @@
>  #include <linux/socket.h>
>  #include <linux/un.h>
>  #include <linux/in.h>
> +#include <linux/in6.h>
>  #else
>  #include <sys/types.h>
>  #include <linux/types.h>
> @@ -625,6 +626,100 @@ struct ckpt_hdr_socket_unix {
> 
>  struct ckpt_hdr_socket_inet {
>  	struct ckpt_hdr h;
> +	__u32 daddr;
> +	__u32 rcv_saddr;
> +	__u32 saddr;
> +	__u16 dport;
> +	__u16 num;
> +	__u16 sport;
> +	__s16 uc_ttl;
> +	__u16 cmsg_flags;
> +
> +	struct {
> +		__u64 timeout;
> +		__u32 ato;
> +		__u32 lrcvtime;
> +		__u16 last_seg_size;
> +		__u16 rcv_mss;
> +		__u8 pending;
> +		__u8 quick;
> +		__u8 pingpong;
> +		__u8 blocked;
> +	} icsk_ack __attribute__ ((aligned(8)));
> +
> +	/* FIXME: Skipped opt, tos, multicast, cork settings */
> +
> +	struct {
> +		__u32 rcv_nxt;
> +		__u32 copied_seq;
> +		__u32 rcv_wup;
> +		__u32 snd_nxt;
> +		__u32 snd_una;
> +		__u32 snd_sml;
> +		__u32 rcv_tstamp;
> +		__u32 lsndtime;
> +
> +		__u32 snd_wl1;
> +		__u32 snd_wnd;
> +		__u32 max_window;
> +		__u32 mss_cache;
> +		__u32 window_clamp;
> +		__u32 rcv_ssthresh;
> +		__u32 frto_highmark;
> +
> +		__u32 srtt;
> +		__u32 mdev;
> +		__u32 mdev_max;
> +		__u32 rttvar;
> +		__u32 rtt_seq;
> +
> +		__u32 packets_out;
> +		__u32 retrans_out;
> +
> +		__u32 snd_up;
> +		__u32 rcv_wnd;
> +		__u32 write_seq;
> +		__u32 pushed_seq;
> +		__u32 lost_out;
> +		__u32 sacked_out;
> +		__u32 fackets_out;
> +		__u32 tso_deferred;
> +		__u32 bytes_acked;
> +
> +		__s32 lost_cnt_hint;
> +		__u32 retransmit_high;
> +
> +		__u32 lost_retrans_low;
> +
> +		__u32 prior_ssthresh;
> +		__u32 high_seq;
> +
> +		__u32 retrans_stamp;
> +		__u32 undo_marker;
> +		__s32 undo_retrans;
> +		__u32 total_retrans;
> +
> +		__u32 urg_seq;
> +		__u32 keepalive_time;
> +		__u32 keepalive_intvl;
> +
> +		__u16 urg_data;
> +		__u16 advmss;
> +		__u8 frto_counter;
> +		__u8 nonagle;
> +
> +		__u8 ecn_flags;
> +		__u8 reordering;
> +
> +		__u8 keepalive_probes;
> +	} tcp __attribute__ ((aligned(8)));
> +
> +	struct {
> +		struct in6_addr saddr;
> +		struct in6_addr rcv_saddr;
> +		struct in6_addr daddr;
> +	} inet6 __attribute__ ((aligned(8)));
> +
>  	__u32 laddr_len;
>  	__u32 raddr_len;
>  	struct sockaddr_in laddr;
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index 77f8592..79c9c09 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -82,6 +82,7 @@ struct ckpt_ctx {
>  	wait_queue_head_t waitq;	/* waitqueue for restarting tasks */
>  	wait_queue_head_t ghostq;	/* waitqueue for ghost tasks */
>  	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
> +	struct list_head listen_sockets;/* listening parent sockets */
> 
>  	struct ckpt_stats stats;	/* statistics */
> 
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index 49d9a2f..aba1497 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -324,6 +324,7 @@ static int __sock_write_skb(struct ckpt_ctx *ctx,
> 
>  static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  				struct sk_buff_head *queue,
> +				uint16_t family,
>  				int dst_objref)
>  {
>  	struct sk_buff *skb;
> @@ -336,11 +337,11 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  			return -EBUSY;
>  		}
> 
> -		/* The other ancillary messages are always present
> -		 * unlike descriptors.  Even though we can't detect
> -		 * them and fail the checkpoint, we're not at risk
> -		 * because we don't save out (or restore) the control
> -		 * information contained in the skb.
> +		/* The other ancillary messages UNIX are always
> +		 * present unlike descriptors.  Even though we can't
> +		 * detect them and fail the checkpoint, we're not at
> +		 * risk because we don't restore the control
> +		 * information in the UNIX code.
>  		 */
> 
>  		ret = __sock_write_skb(ctx, skb, dst_objref);
> @@ -353,6 +354,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
> 
>  static int sock_write_buffers(struct ckpt_ctx *ctx,
>  			      struct sk_buff_head *queue,
> +			      uint16_t family,
>  			      int dst_objref)
>  {
>  	struct ckpt_hdr_socket_queue *h;
> @@ -372,7 +374,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
>  	h->skb_count = ret;
>  	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
>  	if (!ret)
> -		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
> +		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
> 
>   out:
>  	ckpt_hdr_put(ctx, h);
> @@ -394,12 +396,14 @@ int sock_deferred_write_buffers(void *data)
>  		return dst_objref;
>  	}
> 
> -	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
> +				 dq->sk->sk_family, dst_objref);
>  	ckpt_debug("write recv buffers: %i\n", ret);
>  	if (ret < 0)
>  		return ret;
> 
> -	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
> +				 dq->sk->sk_family, dst_objref);
>  	ckpt_debug("write send buffers: %i\n", ret);
> 
>  	return ret;
> @@ -924,10 +928,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
>  		goto err;
> 
>  	if ((h->sock_common.family == AF_INET) &&
> -	    (h->sock.state != TCP_LISTEN)) {
> -		/* Temporary hack to enable restore of TCP_LISTEN sockets
> -		 * while forcing anything else to a closed state
> -		 */
> +	    (h->sock.state != TCP_LISTEN) &&
> +	    (ctx->uflags & RESTART_CONN_RESET)) {
> +		ckpt_debug("Forcing open socket closed\n");
>  		sock->sk->sk_state = TCP_CLOSE;
>  		sock->state = SS_UNCONNECTED;
>  	}
> diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
> index 9cbbf5e..3e20cc9 100644
> --- a/net/ipv4/checkpoint.c
> +++ b/net/ipv4/checkpoint.c
> @@ -17,6 +17,7 @@
>  #include <linux/deferqueue.h>
>  #include <net/tcp_states.h>
>  #include <net/tcp.h>
> +#include <net/ipv6.h>
> 
>  struct dq_sock {
>  	struct ckpt_ctx *ctx;
> @@ -28,6 +29,236 @@ struct dq_buffers {
>  	struct sock *sk;
>  };
> 
> +struct listen_item {
> +	struct sock *sk;
> +	struct list_head list;
> +};
> +
> +void sock_listening_list_free(struct list_head *head)
> +{
> +	struct listen_item *item, *tmp;
> +
> +	list_for_each_entry_safe(item, tmp, head, list) {
> +		list_del(&item->list);
> +		kfree(item);
> +	}
> +}
> +
> +static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct listen_item *item;
> +
> +	item = kmalloc(sizeof(*item), GFP_KERNEL);
> +	if (!item)
> +		return -ENOMEM;
> +
> +	item->sk = sk;
> +	list_add(&item->list, &ctx->listen_sockets);
> +
> +	return 0;
> +}
> +
> +static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct listen_item *item;
> +
> +	list_for_each_entry(item, &ctx->listen_sockets, list) {
> +		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
> +			return item->sk;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sock_hash_parent(void *data)
> +{
> +	struct dq_sock *dq = (struct dq_sock *)data;
> +	struct sock *parent;
> +
> +	ckpt_debug("INET post-restart hash\n");
> +
> +	dq->sk->sk_prot->hash(dq->sk);
> +
> +	/* If there is a listening socket with the same source port,
> +	 * then become a child of that socket [we are the result of an
> +	 * accept()].  Otherwise hash ourselves directly in [we are
> +	 * the result of a connect()]
> +	 */
> +
> +	parent = sock_get_parent(dq->ctx, dq->sk);
> +	if (parent) {
> +		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +		local_bh_disable();
> +		__inet_inherit_port(parent, dq->sk);
> +		local_bh_enable();
> +	} else {
> +		inet_sk(dq->sk)->num = 0;
> +		inet_hash_connect(&tcp_death_row, dq->sk);
> +		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
> +{
> +	struct dq_sock dq;
> +
> +	dq.sk = sock;
> +	dq.ctx = ctx;
> +
> +	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
> +			      sock_hash_parent, NULL);
> +}
> +
> +static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
> +				struct tcp_sock *sk,
> +				struct ckpt_hdr_socket_inet *hh,
> +				int op)
> +{
> +	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
> +	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
> +	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
> +	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
> +	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
> +	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
> +	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
> +	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
> +
> +	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
> +	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
> +	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
> +	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
> +	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
> +	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
> +	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
> +	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
> +	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
> +	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
> +
> +	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
> +	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
> +	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
> +	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
> +	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
> +
> +	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
> +	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
> +
> +	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
> +	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
> +	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
> +	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
> +
> +	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
> +
> +	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
> +	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
> +	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
> +	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
> +	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
> +	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
> +	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
> +	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
> +
> +	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
> +	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
> +
> +	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
> +
> +	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
> +	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
> +
> +	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
> +	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
> +	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
> +	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
> +
> +	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
> +	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
> +	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
> +
> +	if (!skb_queue_empty(&sk->ucopy.prequeue))
> +		printk("PREQUEUE!\n");
> +
> +	return 0;
> +}
> +
> +static int sock_inet_restore_addrs(struct inet_sock *inet,
> +				   struct ckpt_hdr_socket_inet *hh)
> +{
> +	inet->daddr = hh->raddr.sin_addr.s_addr;
> +	inet->saddr = hh->laddr.sin_addr.s_addr;
> +	inet->rcv_saddr = inet->saddr;
> +
> +	inet->dport = hh->raddr.sin_port;
> +	inet->sport = hh->laddr.sin_port;
> +
> +	return 0;
> +}
> +
> +static int sock_inet_cptrst(struct ckpt_ctx *ctx,
> +			    struct sock *sk,
> +			    struct ckpt_hdr_socket_inet *hh,
> +			    int op)
> +{
> +	struct inet_sock *inet = inet_sk(sk);
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +	int ret;
> +
> +	if (op == CKPT_CPT) {
> +		CKPT_COPY(op, hh->daddr, inet->daddr);
> +		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
> +		CKPT_COPY(op, hh->dport, inet->dport);
> +		CKPT_COPY(op, hh->saddr, inet->saddr);
> +		CKPT_COPY(op, hh->sport, inet->sport);
> +	} else {
> +		ret = sock_inet_restore_addrs(inet, hh);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	CKPT_COPY(op, hh->num, inet->num);
> +	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
> +	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
> +
> +	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
> +	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
> +	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
> +	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
> +	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
> +	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
> +	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
> +	CKPT_COPY(op,
> +		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
> +	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
> +
> +	if (sk->sk_protocol == IPPROTO_TCP)
> +		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
> +	else if (sk->sk_protocol == IPPROTO_UDP)
> +		ret = 0;
> +	else {
> +		ret = -EINVAL;
> +		ckpt_err(ctx, ret, "unknown socket protocol %d",
> +			 sk->sk_protocol);
> +	}
> +
> +	if (sk->sk_family == AF_INET6) {
> +		struct ipv6_pinfo *inet6 = inet6_sk(sk);
> +		if (op == CKPT_CPT) {
> +			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
> +			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
> +			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
> +		} else {
> +			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
> +			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
> +			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
> +		}
> +	}
> +
> +	return ret;
> +}
> +
>  int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>  {
>  	struct ckpt_hdr_socket_inet *in;
> @@ -43,6 +274,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>  	if (ret)
>  		goto out;
> 
> +	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
> +	if (ret < 0)
> +		goto out;
> +
>  	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
>   out:
>  	ckpt_hdr_put(ctx, in);
> @@ -55,51 +290,22 @@ int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
>  	return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
>  }
> 
> -static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +static int inet_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sk_buff_head *queue)
>  {
> -	struct ckpt_hdr_socket_buffer *h;
> -	int len;
> -	int ret;
>  	struct sk_buff *skb = NULL;
> 
> -	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> -	if (IS_ERR(h))
> -		return PTR_ERR(h);
> -
> -	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> -	if (len < 0) {
> -		ret = len;
> -		goto out;
> -	} else if (len > SKB_MAX_ALLOC) {
> -		ckpt_debug("Socket buffer too big (%i > %lu)",
> -			   len, SKB_MAX_ALLOC);
> -		ret = -ENOSPC;
> -		goto out;
> -	}
> -
> -	skb = alloc_skb(len, GFP_KERNEL);
> -	if (!skb) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -
> -	ret = ckpt_kread(ctx, skb_put(skb, len), len);
> -	if (ret < 0)
> -		goto out;
> +	skb = sock_restore_skb(ctx);
> +	if (IS_ERR(skb))
> +		return PTR_ERR(skb);
> 
> -	spin_lock(&queue->lock);
>  	skb_queue_tail(queue, skb);
> -	spin_unlock(&queue->lock);
> - out:
> -	ckpt_hdr_put(ctx, h);
> -
> -	if ((ret < 0) && skb)
> -		kfree_skb(skb);
> 
> -	return ret;
> +	return skb->len;
>  }
> 
> -static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +static int inet_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sk_buff_head *queue)
>  {
>  	struct ckpt_hdr_socket_queue *h;
>  	int ret = 0;
> @@ -162,6 +368,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> 
>  static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
>  {
> +	__u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
> +		ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
> +	__u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
> +	__u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
> +	__u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
> +
> +	if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
> +	    !capable(CAP_NET_BIND_SERVICE)) {
> +		ckpt_debug("unable to bind to port %hu\n",
> +			   htons(in->laddr.sin_port));
> +		return -EINVAL;
> +	}
> +
>  	if (in->laddr_len > sizeof(struct sockaddr_in)) {
>  		ckpt_debug("laddr_len is too big\n");
>  		return -EINVAL;
> @@ -172,6 +391,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
>  		return -EINVAL;
>  	}
> 
> +	/* Set ato to the default */
> +	in->icsk_ack.ato = TCP_ATO_MIN;
> +
> +	/* No quick acks are scheduled after a restart */
> +	in->icsk_ack.quick = 0;
> +
> +	if (in->icsk_ack.pending & ~icsk_ack_mask) {
> +		ckpt_debug("invalid pending flags 0x%x\n",
> +			   in->icsk_ack.pending & ~icsk_ack_mask);
> +		return -EINVAL;
> +	}
> +
> +	if (in->icsk_ack.pingpong > 1) {
> +		ckpt_debug("invalid icsk_ack.pingpong value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->icsk_ack.blocked > 1) {
> +		ckpt_debug("invalid icsk_ack.blocked value\n");
> +		return -EINVAL;
> +	}
> +
> +	/* do_tcp_setsockopt() quietly makes this coercion */
> +	if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
> +		in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
> +	else if (in->tcp.window_clamp > 65535U) {
> +		ckpt_debug("invalid window_clamp value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
> +		in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
> +
> +	/* These will all be recalculated on the next call to
> +	 * tcp_rtt_estimator()
> +	 */
> +	in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
> +	in->tcp.rttvar = in->tcp.rtt_seq = 0;
> +
> +	/* Might want to set packets_out to zero ? */
> +
> +	if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
> +		in->tcp.rcv_wnd = MAX_TCP_WINDOW;
> +
> +	if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
> +		ckpt_debug("keepalive_intvl %i out of range\n",
> +			   in->tcp.keepalive_intvl);
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
> +		ckpt_debug("Invalid keepalive_probes value %i\n",
> +			   in->tcp.keepalive_probes);
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.urg_data & ~urg_mask) {
> +		ckpt_debug("Invalid urg_data value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.nonagle & ~nonagle_mask) {
> +		ckpt_debug("Invalid nonagle value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.ecn_flags & ~ecn_mask) {
> +		ckpt_debug("Invalid ecn_flags value\n");
> +		return -EINVAL;
> +	}
> +
>  	return 0;
>  }
> 
> @@ -209,8 +499,35 @@ int inet_restore(struct ckpt_ctx *ctx,
>  			ckpt_debug("inet listen: %i\n", ret);
>  			if (ret < 0)
>  				goto out;
> +
> +			/* We are a listening socket, so add ourselves
> +			 * to the list of parent sockets.  This will
> +			 * allow our children to find us later and
> +			 * link up
> +			 */
> +
> +			ret = sock_listening_list_add(ctx, sock->sk);
> +			if (ret < 0)
> +				goto out;
>  		}
>  	} else {
> +		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
> +		if (ret)
> +			goto out;
> +
> +		if ((h->sock.state == TCP_ESTABLISHED) &&
> +		    (h->sock.protocol == IPPROTO_TCP)) {
> +			/* A connected socket that was spawned from an
> +			 * accept() needs to be hashed with its parent
> +			 * listening socket in order to receive
> +			 * traffic on the original port.  Since we may
> +			 * not have restarted the parent yet, we defer
> +			 * this until later when we know we have all
> +			 * the listening sockets accounted for.
> +			 */
> +			ret = sock_defer_hash(ctx, sock->sk);
> +		}
> +
>  		if (!sock_flag(sock->sk, SOCK_DEAD))
>  			ret = inet_defer_restore_buffers(ctx, sock->sk);
>  	}
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]     ` <1258471590-29768-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-18  4:39       ` Serge E. Hallyn
       [not found]         ` <20091118043920.GD19841-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Serge E. Hallyn @ 2009-11-18  4:39 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  net/unix/checkpoint.c |   54 ++++++++++++++++++++++++++----------------------
>  1 files changed, 29 insertions(+), 25 deletions(-)
> 
> diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
> index 749673d..c6a1b27 100644
> --- a/net/unix/checkpoint.c
> +++ b/net/unix/checkpoint.c
> @@ -201,7 +201,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  	uint8_t peer_shutdown = 0;
>  	void *buf = NULL;
>  	int sndbuf;
> -	int len;
>  	int ret = 0;
> 
>  	memset(&msg, 0, sizeof(msg));
> @@ -210,14 +209,30 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  	if (IS_ERR(h))
>  		return PTR_ERR(h);
> 
> -	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> -	if (len < 0) {
> -		ret = len;
> +	if (h->lin_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket buffer too big (%u > %lu)\n",
> +			   h->lin_len, SKB_MAX_ALLOC);
> +		ret = -EINVAL;
> +		goto out;
> +	} else if (h->nr_frags != 0) {
> +		ckpt_debug("unix socket claims to have fragments\n");
> +		ret = -EINVAL;
>  		goto out;
> -	} else if (len > SKB_MAX_ALLOC) {
> -		ckpt_debug("Socket buffer too big (%i > %lu)",
> -			   len, SKB_MAX_ALLOC);
> -		ret = -ENOSPC;
> +	}
> +
> +	buf = kmalloc(h->lin_len, GFP_KERNEL);
> +	if (!buf) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	kvec.iov_len = h->lin_len;
> +	kvec.iov_base = buf;
> +	ret = _ckpt_read_obj_type(ctx, kvec.iov_base,
> +				  h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("read unix socket buffer %u: %i\n", h->lin_len, ret);
> +	if (ret < h->lin_len) {
> +		ret = -EINVAL;
>  		goto out;
>  	}
> 
> @@ -246,18 +261,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  		}
>  	}
> 
> -	kvec.iov_len = len;
> -	buf = kmalloc(len, GFP_KERNEL);
> -	kvec.iov_base = buf;
> -	if (!buf) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -
> -	ret = ckpt_kread(ctx, kvec.iov_base, len);
> -	if (ret < 0)
> -		goto out;
> -
>  	msg.msg_name = addr;
>  	msg.msg_namelen = addrlen;
> 
> @@ -273,15 +276,16 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
> 
>  	/* Make sure there's room in the send buffer */
>  	sndbuf = sk->sk_sndbuf;
> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
>  	    capable(CAP_NET_ADMIN))
> -		sk->sk_sndbuf += len;
> +		sk->sk_sndbuf += h->lin_len;
>  	else
>  		sk->sk_sndbuf = sysctl_wmem_max;

Again, should you check whether h->len_len < sysctl_wmem_max before doing
the capable(CAP_NET_ADMIN)?  The goal is to prevent setting PF_SUPERPRIV
in task->flags if it wasn't absolutely necessary.

> 
> -	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, len);
> -	ckpt_debug("kernel_sendmsg(%i,%i): %i\n", h->sk_objref, len, ret);
> -	if ((ret > 0) && (ret != len))
> +	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, h->lin_len);
> +	ckpt_debug("kernel_sendmsg(%i,%u): %i\n",
> +		   h->sk_objref, h->lin_len, ret);
> +	if ((ret > 0) && (ret != h->lin_len))
>  		ret = -ENOMEM;
> 
>  	sk->sk_sndbuf = sndbuf;
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]         ` <20091118043920.GD19841-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-18 14:28           ` Dan Smith
       [not found]             ` <876397vrf1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Dan Smith @ 2009-11-18 14:28 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

sk-> sk_sndbuf = sysctl_wmem_max;

SH> Again, should you check whether h->len_len < sysctl_wmem_max
SH> before doing the capable(CAP_NET_ADMIN)?  The goal is to prevent
SH> setting PF_SUPERPRIV in task->flags if it wasn't absolutely
SH> necessary.

Yes, I think we agreed that we should, I just wasn't going to fold
that into these other unrelated changes.   I'll spin another small
patch for that right away.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]             ` <876397vrf1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-11-18 15:16               ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-11-18 15:16 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> sk-> sk_sndbuf = sysctl_wmem_max;
> 
> SH> Again, should you check whether h->len_len < sysctl_wmem_max
> SH> before doing the capable(CAP_NET_ADMIN)?  The goal is to prevent
> SH> setting PF_SUPERPRIV in task->flags if it wasn't absolutely
> SH> necessary.
> 
> Yes, I think we agreed that we should, I just wasn't going to fold
> that into these other unrelated changes.   I'll spin another small
> patch for that right away.

Ok, then

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

to the combination

thanks,
-serge

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: Add support for connected INET sockets
       [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-11-17 15:26   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
@ 2009-11-25 18:51   ` Oren Laadan
  3 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2009-11-25 18:51 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg


Great !  will add to v19-rc2.

Oren.

Dan Smith wrote:
> This is an update of the previous set to apply on top of v19-rc1 and to
> address some of the trivial changes suggested on list.  Those are detailed
> per-patch.  I will follow up with a patch to change ckpt_debug to ckpt_err
> where appropriate, across the whole net/*/checkpoint.c set.  Since this
> will apply to the unchanged gso patch as well, I decided to do this as a
> follow-on cleanup patch.
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-11-25 18:51 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-17 15:26 Add support for connected INET sockets Dan Smith
     [not found] ` <1258471590-29768-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-17 15:26   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2) Dan Smith
     [not found]     ` <1258471590-29768-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-18  4:34       ` Serge E. Hallyn
2009-11-17 15:26   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
     [not found]     ` <1258471590-29768-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-18  4:39       ` Serge E. Hallyn
     [not found]         ` <20091118043920.GD19841-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-18 14:28           ` Dan Smith
     [not found]             ` <876397vrf1.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-11-18 15:16               ` Serge E. Hallyn
2009-11-17 15:26   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
2009-11-25 18:51   ` Add support for connected INET sockets Oren Laadan
2009-11-17 15:26 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5) Dan Smith
2009-11-18  4:34   ` Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.