All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] Sockets as proper objects and buffers with owners
@ 2009-09-02 18:22 Dan Smith
       [not found] ` <1251915760-20118-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-02 18:22 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This is an update of my previous set to make sockets proper objhash citizens
and use that functionality to attach owners to socket buffers for proper
recording of the sender post-restart.

I still haven't done the collect() operation, but am sending this out to get
feedback on the updated last patch which changes the sequence of events
required to restore a socket.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found] ` <1251915760-20118-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-02 18:22   ` Dan Smith
       [not found]     ` <1251915760-20118-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-02 18:22   ` [PATCH 2/3] Add post-file deferqueue Dan Smith
  2009-09-02 18:22   ` [PATCH 3/3] [RFC] Track socket buffer owners Dan Smith
  2 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-02 18:22 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This changes the checkpoint/restart procedure for sockets a bit.  The
socket file header is now checkpointed separately from the socket itself,
which allows us to checkpoint a socket without arriving at it from a
file descriptor.  Thus, most sockets will be checkpointed as a result
of processing the file table, calling sock_file_checkpoint(fd), which
in turn calls checkpoint_obj(socket).

However, we may arrive at some sockets while checkpointing other objects,
such as the other end of an AF_UNIX socket with buffers in flight.  This
patch just opens that door, which is utilized by the next patch.

Changes in v2:
 - If we attempt to checkpoint an orphan socket, create a struct socket
   to adopt it for the purposes of the checkpoint

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/objhash.c           |    2 +
 include/linux/checkpoint_hdr.h |    6 +-
 include/net/sock.h             |    2 +
 net/checkpoint.c               |  140 +++++++++++++++++++++++++++++++--------
 net/unix/checkpoint.c          |    3 +-
 5 files changed, 120 insertions(+), 33 deletions(-)

diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index a9a10d1..a410346 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -381,6 +381,8 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.obj_type = CKPT_OBJ_SOCK,
 		.ref_drop = obj_sock_drop,
 		.ref_grab = obj_sock_grab,
+		.checkpoint = checkpoint_sock,
+		.restore = restore_sock,
 	},
 };
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 06bc6e2..b75562c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -70,6 +70,7 @@ enum {
 	CKPT_HDR_USER,
 	CKPT_HDR_GROUPINFO,
 	CKPT_HDR_TASK_CREDS,
+	CKPT_HDR_SOCKET,
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -368,7 +369,8 @@ struct ckpt_hdr_file_pipe {
 } __attribute__((aligned(8)));
 
 /* socket */
-struct ckpt_socket {
+struct ckpt_hdr_socket {
+	struct ckpt_hdr h;
 	struct { /* struct socket */
 		__u64 flags;
 		__u8 state;
@@ -428,7 +430,7 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_file_socket {
 	struct ckpt_hdr_file common;
-	struct ckpt_socket socket;
+	__s32 sock_objref;
 } __attribute__((aligned(8)));
 
 struct ckpt_hdr_utsns {
diff --git a/include/net/sock.h b/include/net/sock.h
index 8e3b050..0db1ca3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1644,6 +1644,8 @@ extern __u32 sysctl_rmem_default;
 /* Checkpoint/Restart Functions */
 struct ckpt_ctx;
 struct ckpt_hdr_file;
+extern int checkpoint_sock(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_sock(struct ckpt_ctx *ctx);
 extern int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
 extern struct file *sock_file_restore(struct ckpt_ctx *ctx,
 				      struct ckpt_hdr_file *h);
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 2541e81..42a8853 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -428,31 +428,26 @@ static int sock_cptrst(struct ckpt_ctx *ctx, struct sock *sk,
 		return 0;
 }
 
-int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+static int __do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
 {
-	struct ckpt_hdr_file_socket *h;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
 	int ret;
+	struct socket *sock = sk->sk_socket;
+	struct ckpt_hdr_socket *h;
 
 	if (!sock->ops->checkpoint) {
 		ckpt_write_err(ctx, "socket (proto_ops: %pS)", sock->ops);
 		return -ENOSYS;
 	}
 
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
 	if (!h)
 		return -ENOMEM;
 
-	h->common.f_type = CKPT_FILE_SOCKET;
-
 	/* part I: common to all sockets */
-	ret = sock_cptrst(ctx, sk, &h->socket, CKPT_CPT);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_file_common(ctx, file, &h->common);
+	ret = sock_cptrst(ctx, sk, h, CKPT_CPT);
 	if (ret < 0)
 		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (ret < 0)
 		goto out;
@@ -463,12 +458,71 @@ int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
 		goto out;
 
 	/* part III: socket buffers */
-	if (sk->sk_state != TCP_LISTEN) {
+	if ((sk->sk_state != TCP_LISTEN) && (!sock_flag(sk, SOCK_DEAD))) {
 		ret = sock_write_buffers(ctx, &sk->sk_receive_queue);
 		if (ret)
 			goto out;
 		ret = sock_write_buffers(ctx, &sk->sk_write_queue);
 	}
+
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct socket *sock;
+	int ret;
+
+	if (sk->sk_socket)
+		return __do_sock_checkpoint(ctx, sk);
+
+	/* Temporarily adopt this orphan socket */
+	ret = sock_create(sk->sk_family, sk->sk_type, 0, &sock);
+	if (ret < 0)
+		return ret;
+	sock_graft(sk, sock);
+
+	ret = __do_sock_checkpoint(ctx, sk);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+	sock_release(sock);
+
+	return ret;
+}
+
+int checkpoint_sock(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_sock_checkpoint(ctx, (struct sock *)ptr);
+}
+
+int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_socket *h;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_SOCKET;
+
+	h->sock_objref = checkpoint_obj(ctx, sk, CKPT_OBJ_SOCK);
+	if (h->sock_objref < 0) {
+		ret = h->sock_objref;
+		goto out;
+	}
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
@@ -525,27 +579,31 @@ static struct file *sock_alloc_attach_fd(struct socket *sock)
 		file = ERR_PTR(err);
 	}
 
+	/* Since objhash assumes the initial reference for a socket,
+	 * we bump it here for this descriptor, unlike other places in the
+	 * socket code which assume the descriptor is the owner.
+	 */
+	sock_hold(sock->sk);
+
 	return file;
 }
 
 struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
 {
-	struct ckpt_hdr_file_socket *hh = (struct ckpt_hdr_file_socket *) ptr;
-	struct ckpt_socket *h = &hh->socket;
+	struct ckpt_hdr_socket *h;
 	struct socket *sock;
-	struct file *file;
 	int ret;
 
-	if (ptr->h.type != CKPT_HDR_FILE  ||
-	    ptr->h.len != sizeof(*hh) || ptr->f_type != CKPT_FILE_SOCKET)
-		return ERR_PTR(-EINVAL);
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
 
 	/* silently clear flags, e.g. SOCK_NONBLOCK or SOCK_CLOEXEC */
 	h->sock.type &= SOCK_TYPE_MASK;
 
 	ret = sock_create(h->sock_common.family, h->sock.type, 0, &sock);
 	if (ret < 0)
-		return ERR_PTR(ret);
+		goto err;
 
 	if (!sock->ops->restore) {
 		ckpt_debug("proto_ops lacks checkpoint: %pS\n", sock->ops);
@@ -566,21 +624,45 @@ struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
 	if (ret < 0)
 		goto err;
 
-	file = sock_alloc_attach_fd(sock);
-	if (IS_ERR(file)) {
-		ret = PTR_ERR(file);
-		goto err;
-	}
+	ckpt_hdr_put(ctx, h);
+
+	return sock->sk;
+ err:
+	ckpt_hdr_put(ctx, h);
+	sock_release(sock);
+
+	return ERR_PTR(ret);
+}
+
+void *restore_sock(struct ckpt_ctx *ctx)
+{
+	return do_sock_restore(ctx);
+}
+
+struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+{
+	struct ckpt_hdr_file_socket *h = (struct ckpt_hdr_file_socket *)ptr;
+	struct sock *sk;
+	struct file *file;
+	int ret;
+
+	if (ptr->h.type != CKPT_HDR_FILE || ptr->f_type != CKPT_FILE_SOCKET)
+		return ERR_PTR(-EINVAL);
+
+	sk = ckpt_obj_fetch(ctx, h->sock_objref, CKPT_OBJ_SOCK);
+	if (IS_ERR(sk))
+		return ERR_PTR(PTR_ERR(sk));
+
+	file = sock_alloc_attach_fd(sk->sk_socket);
+	if (IS_ERR(file))
+		return file;
 
 	ret = restore_file_common(ctx, file, ptr);
 	if (ret < 0) {
 		fput(file);
-		file = ERR_PTR(ret);
+		return ERR_PTR(ret);
 	}
-	return file;
 
- err:
-	sock_release(sock);
-	return ERR_PTR(ret);
+	return file;
 }
 
diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
index 08e664b..f4905db 100644
--- a/net/unix/checkpoint.c
+++ b/net/unix/checkpoint.c
@@ -57,7 +57,6 @@ static int unix_write_cwd(struct ckpt_ctx *ctx,
 int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct unix_sock *sk = unix_sk(sock->sk);
-	struct unix_sock *pr = unix_sk(sk->peer);
 	struct ckpt_hdr_socket_unix *un;
 	int new;
 	int ret = -ENOMEM;
@@ -86,7 +85,7 @@ int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 		goto out;
 
 	if (sk->peer)
-		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+		un->peer = checkpoint_obj(ctx, sk->peer, CKPT_OBJ_SOCK);
 	else
 		un->peer = 0;
 
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/3] Add post-file deferqueue
       [not found] ` <1251915760-20118-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-02 18:22   ` [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2) Dan Smith
@ 2009-09-02 18:22   ` Dan Smith
       [not found]     ` <1251915760-20118-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-02 18:22   ` [PATCH 3/3] [RFC] Track socket buffer owners Dan Smith
  2 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-02 18:22 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This is the deferq bit of Matt's patch, which is needed by the subsequent
socket patch.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Cc: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c               |   28 ++++++++++++++++++++++++++++
 include/linux/checkpoint_types.h |    1 +
 2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/checkpoint/files.c b/checkpoint/files.c
index 204055b..784793c 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
 #include <net/sock.h>
 
 
@@ -289,11 +290,25 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
 		goto out;
 
 	ckpt_debug("nfds %d\n", nfds);
+	ctx->files_deferq = deferqueue_create();
+	if (!ctx->files_deferq) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	for (n = 0; n < nfds; n++) {
 		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
 		if (ret < 0)
 			break;
 	}
+	if (!ret) {
+		ret = deferqueue_run(ctx->files_deferq);
+		if (ret > 0) {
+			pr_warning("c/r: files deferqueue had %d entries\n",
+				   ret);
+			ret = 0;
+		}
+	}
+	deferqueue_destroy(ctx->files_deferq);
  out:
 	kfree(fdtable);
 	return ret;
@@ -692,11 +707,24 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		goto out;
 
+	ret = -ENOMEM;
+	ctx->files_deferq = deferqueue_create();
+	if (!ctx->files_deferq)
+		goto out;
 	for (i = 0; i < h->fdt_nfds; i++) {
 		ret = restore_file_desc(ctx);
 		if (ret < 0)
 			break;
 	}
+	if (!ret) {
+		ret = deferqueue_run(ctx->files_deferq);
+		if (ret > 0) {
+			pr_warning("c/r: files deferqueue had %d entries\n",
+				   ret);
+			ret = 0;
+		}
+	}
+	deferqueue_destroy(ctx->files_deferq);
  out:
 	ckpt_hdr_put(ctx, h);
 	if (!ret) {
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index a18846f..b6f130c 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -48,6 +48,7 @@ struct ckpt_ctx {
 
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
 	struct deferqueue_head *deferqueue;	/* queue of deferred work */
+	struct deferqueue_head *files_deferq;
 
 	struct path fs_mnt;     /* container root (FIXME) */
 
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/3] [RFC] Track socket buffer owners
       [not found] ` <1251915760-20118-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-02 18:22   ` [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2) Dan Smith
  2009-09-02 18:22   ` [PATCH 2/3] Add post-file deferqueue Dan Smith
@ 2009-09-02 18:22   ` Dan Smith
       [not found]     ` <1251915760-20118-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-02 18:22 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This patch is a superset of the previous attempt to store socket buffers
with their owners.  It defers the writing and reading of the socket buffers
until after the end of the file phase to avoid a recursive nose-dive of
checkpointing socket owners.

This also moves the join logic to the deferqueue as well, since that too
can lead us down a deep hole.  The buffer restore logic may perform a join
if it decides that the join is inevitable (but not yet performed) and
necessary.

Note that I've been staring at this for too long, so I'm sending it as an
RFC with hopes that it's not too much of a mess.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint.h     |    3 -
 include/linux/checkpoint_hdr.h |    6 +
 include/linux/net.h            |    4 +-
 include/net/af_unix.h          |    4 +-
 net/checkpoint.c               |  131 ++++++++++------
 net/unix/checkpoint.c          |  339 ++++++++++++++++++++++++++--------------
 6 files changed, 312 insertions(+), 175 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 761cad5..88861a1 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -84,9 +84,6 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct socket *socket,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
-extern struct ckpt_hdr_socket_queue *
-ckpt_sock_read_buffer_hdr(struct ckpt_ctx *ctx,
-			  uint32_t *bufsize);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b75562c..6b74a51 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -414,6 +414,12 @@ struct ckpt_hdr_socket_queue {
 	__u32 total_bytes;
 } __attribute__ ((aligned(8)));
 
+struct ckpt_hdr_socket_buffer {
+	struct ckpt_hdr h;
+	__s32 src_objref;
+	__s32 dst_objref;
+};
+
 #define CKPT_UNIX_LINKED 1
 struct ckpt_hdr_socket_unix {
 	struct ckpt_hdr h;
diff --git a/include/linux/net.h b/include/linux/net.h
index 27187a4..96c7e22 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -148,7 +148,7 @@ struct msghdr;
 struct module;
 
 struct ckpt_ctx;
-struct ckpt_socket;
+struct ckpt_hdr_socket;
 
 struct proto_ops {
 	int		family;
@@ -197,7 +197,7 @@ struct proto_ops {
 	int		(*checkpoint)(struct ckpt_ctx *ctx,
 				      struct socket *sock);
 	int		(*restore)(struct ckpt_ctx *ctx, struct socket *sock,
-				   struct ckpt_socket *h);
+				   struct ckpt_hdr_socket *h);
 };
 
 struct net_proto_family {
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1a1fd20..61f666b 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -71,10 +71,10 @@ static inline void unix_sysctl_unregister(struct net *net) {}
 
 #ifdef CONFIG_CHECKPOINT
 struct ckpt_ctx;
-struct ckpt_socket;
+struct ckpt_hdr_socket;
 extern int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
 extern int unix_restore(struct ckpt_ctx *ctx, struct socket *sock,
-			struct ckpt_socket *h);
+			struct ckpt_hdr_socket *h);
 #else
 #define unix_checkpoint NULL
 #define unix_restore NULL
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 42a8853..cb37ef4 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -23,6 +23,12 @@
 
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+
+struct dq_buffers {
+	struct ckpt_ctx *ctx;
+	struct sock *sk;
+};
 
 static int sock_copy_buffers(struct sk_buff_head *from,
 			     struct sk_buff_head *to,
@@ -58,6 +64,7 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 			break; /* The queue changed as we read it */
 
 		skb_morph(skbs[i], skb);
+		skbs[i]->sk = skb->sk;
 		skb_queue_tail(to, skbs[i]);
 
 		*total_bytes += skb->len;
@@ -82,12 +89,15 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 }
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
-				struct sk_buff_head *queue)
+				struct sk_buff_head *queue,
+				int dst_objref)
 {
 	struct sk_buff *skb;
-	int ret = 0;
 
 	skb_queue_walk(queue, skb) {
+		struct ckpt_hdr_socket_buffer *h;
+		int ret = 0;
+
 		/* FIXME: This could be a false positive for non-unix
 		 *        buffers, so add a type check here in the
 		 *        future
@@ -104,16 +114,35 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		 * information contained in the skb.
 		 */
 
+		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+		if (!h)
+			return -ENOMEM;
+
+		BUG_ON(!skb->sk);
+		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto end;
+		h->src_objref = ret;
+		h->dst_objref = dst_objref;
+
+		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+		if (ret < 0)
+			goto end;
+
 		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
-					  CKPT_HDR_SOCKET_BUFFER);
-		if (ret)
+					  CKPT_HDR_BUFFER);
+	end:
+		ckpt_hdr_put(ctx, h);
+		if (ret < 0)
 			return ret;
 	}
 
 	return 0;
 }
 
-static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int sock_write_buffers(struct ckpt_ctx *ctx,
+			      struct sk_buff_head *queue,
+			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
 	struct sk_buff_head tmpq;
@@ -132,7 +161,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq);
+		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -141,6 +170,44 @@ static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
 	return ret;
 }
 
+int sock_deferred_write_buffers(void *data)
+{
+	struct dq_buffers *dq = (struct dq_buffers *)data;
+	struct ckpt_ctx *ctx = dq->ctx;
+	int ret;
+	int dst_objref;
+
+	dst_objref = ckpt_obj_lookup(ctx, dq->sk, CKPT_OBJ_SOCK);
+	if (dst_objref < 0) {
+		ckpt_write_err(ctx,
+			       "Unable to get objref of owner socket: %i\n",
+			       dst_objref);
+		return dst_objref;
+	}
+
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ckpt_debug("write recv buffers: %i\n", ret);
+	if (ret < 0)
+		return ret;
+
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ckpt_debug("write send buffers: %i\n", ret);
+
+	return ret;
+}
+
+int sock_defer_write_buffers(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct dq_buffers dq;
+
+	dq.ctx = ctx;
+	dq.sk = sk;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      sock_deferred_write_buffers,
+			      sock_deferred_write_buffers);
+}
+
 int ckpt_sock_getnames(struct ckpt_ctx *ctx, struct socket *sock,
 		       struct sockaddr *loc, unsigned *loc_len,
 		       struct sockaddr *rem, unsigned *rem_len)
@@ -166,7 +233,7 @@ int ckpt_sock_getnames(struct ckpt_ctx *ctx, struct socket *sock,
 	return 0;
 }
 
-static int sock_cptrst_verify(struct ckpt_socket *h)
+static int sock_cptrst_verify(struct ckpt_hdr_socket *h)
 {
 	uint8_t userlocks_mask = SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK |
 		                 SOCK_BINDADDR_LOCK | SOCK_BINDPORT_LOCK;
@@ -204,7 +271,7 @@ static int sock_cptrst_opt(int op, struct socket *sock,
 	sock_cptrst_opt(op, sk->sk_socket, name, (char *)opt, sizeof(*opt))
 
 static int sock_cptrst_bufopts(int op, struct sock *sk,
-			       struct ckpt_socket *h)
+			       struct ckpt_hdr_socket *h)
 
 {
 	if (CKPT_COPY_SOPT(op, sk, SO_RCVBUF, &h->sock.rcvbuf))
@@ -270,7 +337,7 @@ static int sock_restore_flag(struct socket *sock,
 
 
 static int sock_restore_flags(struct socket *sock,
-                             struct ckpt_socket *h)
+                             struct ckpt_hdr_socket *h)
 {
        int ret;
        int i;
@@ -309,6 +376,9 @@ static int sock_restore_flags(struct socket *sock,
                return -ENOSYS;
        }
 
+       if (test_and_clear_bit(SOCK_DEAD, &sk_flags))
+	       sock_flag(sock->sk, SOCK_DEAD);
+
        /* Anything that is still set in the flags that isn't part of
         * our protocol's default set, indicates an error
         */
@@ -339,7 +409,7 @@ static int sock_copy_timeval(int op, struct sock *sk,
 }
 
 static int sock_cptrst(struct ckpt_ctx *ctx, struct sock *sk,
-		       struct ckpt_socket *h, int op)
+		       struct ckpt_hdr_socket *h, int op)
 {
 	if (sk->sk_socket) {
 		CKPT_COPY(op, h->socket.state, sk->sk_socket->state);
@@ -459,10 +529,9 @@ static int __do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
 
 	/* part III: socket buffers */
 	if ((sk->sk_state != TCP_LISTEN) && (!sock_flag(sk, SOCK_DEAD))) {
-		ret = sock_write_buffers(ctx, &sk->sk_receive_queue);
+		ret = sock_defer_write_buffers(ctx, sk);
 		if (ret)
 			goto out;
-		ret = sock_write_buffers(ctx, &sk->sk_write_queue);
 	}
 
  out:
@@ -528,42 +597,6 @@ int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
 	return ret;
 }
 
-struct ckpt_hdr_socket_queue *ckpt_sock_read_buffer_hdr(struct ckpt_ctx *ctx,
-							uint32_t *bufsize)
-{
-	struct ckpt_hdr_socket_queue *h;
-	int err = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
-	if (IS_ERR(h))
-		return h;
-
-	if (!bufsize) {
-		if (h->total_bytes != 0) {
-			ckpt_debug("Expected empty buffer, got %u\n",
-				   h->total_bytes);
-			err = -EINVAL;
-		}
-	} else if (h->total_bytes > *bufsize) {
-		/* NB: We let CAP_NET_ADMIN override the system buffer limit
-		 *     as setsockopt() does
-		 */
-		if (capable(CAP_NET_ADMIN))
-			*bufsize = h->total_bytes;
-		else {
-			ckpt_debug("Buffer total %u exceeds limit %u\n",
-			   h->total_bytes, *bufsize);
-			err = -EINVAL;
-		}
-	}
-
-	if (err) {
-		ckpt_hdr_put(ctx, h);
-		return ERR_PTR(err);
-	} else
-		return h;
-}
-
 static struct file *sock_alloc_attach_fd(struct socket *sock)
 {
 	struct file *file;
@@ -588,7 +621,7 @@ static struct file *sock_alloc_attach_fd(struct socket *sock)
 	return file;
 }
 
-struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_socket *h;
 	struct socket *sock;
diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
index f4905db..a6b17d1 100644
--- a/net/unix/checkpoint.c
+++ b/net/unix/checkpoint.c
@@ -4,11 +4,23 @@
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 #include <linux/user.h>
+#include <linux/deferqueue.h>
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
 
 #define UNIX_ADDR_EMPTY(a) (a <= sizeof(short))
 
+struct dq_join {
+	struct ckpt_ctx *ctx;
+	int src_ref;
+	int dst_ref;
+};
+
+struct dq_buffers {
+	struct ckpt_ctx *ctx;
+	int sk_ref; /* objref of the socket these buffers belong to */
+};
+
 static inline int unix_need_cwd(struct sockaddr_un *addr, unsigned long len)
 {
 	return (!UNIX_ADDR_EMPTY(len)) &&
@@ -16,6 +28,54 @@ static inline int unix_need_cwd(struct sockaddr_un *addr, unsigned long len)
 		(addr->sun_path[0] != '/');
 }
 
+static int unix_join(struct sock *src, struct sock *dst)
+{
+	if (unix_sk(src)->peer != NULL)
+		return 0; /* We're second */
+
+	sock_hold(dst);
+	unix_sk(src)->peer = dst;
+
+	return 0;
+
+}
+
+static int unix_deferred_join(void *data)
+{
+	struct dq_join *dq = (struct dq_join *)data;
+	struct ckpt_ctx *ctx = dq->ctx;
+	struct sock *src;
+	struct sock *dst;
+
+	src = ckpt_obj_fetch(ctx, dq->src_ref, CKPT_OBJ_SOCK);
+	if (!src) {
+		ckpt_debug("Missing src sock ref %i\n", dq->src_ref);
+		return -EINVAL;
+	}
+
+	dst = ckpt_obj_fetch(ctx, dq->dst_ref, CKPT_OBJ_SOCK);
+	if (!src) {
+		ckpt_debug("Missing dst sock ref %i\n", dq->dst_ref);
+		return -EINVAL;
+	}
+
+	return unix_join(src, dst);
+}
+
+static int unix_defer_join(struct ckpt_ctx *ctx,
+			   int src_ref,
+			   int dst_ref)
+{
+	struct dq_join dq;
+
+	dq.ctx = ctx;
+	dq.src_ref = src_ref;
+	dq.dst_ref = dst_ref;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      unix_deferred_join, unix_deferred_join);
+}
+
 static int unix_write_cwd(struct ckpt_ctx *ctx,
 			  struct sock *sk, const char *sockpath)
 {
@@ -109,24 +169,63 @@ int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	return ret;
 }
 
-static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx, struct sock *sk)
+static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
+				    struct sockaddr *addr,
+				    unsigned int addrlen)
 {
 	struct msghdr msg;
 	struct kvec kvec;
+	struct ckpt_hdr_socket_buffer *h;
+	struct sock *sk;
+	uint8_t sock_shutdown;
+	uint8_t peer_shutdown = 0;
 	void *buf;
 	int ret = 0;
 	int len;
+	int sndbuf;
 
 	memset(&msg, 0, sizeof(msg));
 
-	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_SOCKET_BUFFER);
-	if (len < 0)
-		return len;
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (len < 0) {
+		ret = len;
+		goto out;
+	}
 
 	if (len > SKB_MAX_ALLOC) {
 		ckpt_debug("Socket buffer too big (%i > %lu)",
 			   len, SKB_MAX_ALLOC);
-		return -ENOSPC;
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	sk = ckpt_obj_fetch(ctx, h->src_objref, CKPT_OBJ_SOCK);
+	if (IS_ERR(sk)) {
+		ret = PTR_ERR(sk);
+		goto out;
+	}
+
+	/* If we don't have a destination or a peer and we know the
+	 * destination of this skb, then we must need to join with our
+	 * peer
+	 */
+	if (!addrlen && !unix_sk(sk)->peer && (h->dst_objref != 0)) {
+		struct sock *pr;
+		pr = ckpt_obj_fetch(ctx, h->dst_objref, CKPT_OBJ_SOCK);
+		if (IS_ERR(pr)) {
+			ckpt_debug("Failed to get our peer: %li\n", PTR_ERR(pr));
+			ret = PTR_ERR(pr);
+			goto out;
+		}
+		ret = unix_join(sk, pr);
+		if (ret < 0) {
+			ckpt_debug("Failed to join: %i\n", ret);
+			goto out;
+		}
 	}
 
 	kvec.iov_len = len;
@@ -139,54 +238,123 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx, struct sock *sk)
 	if (ret < 0)
 		goto out;
 
+	msg.msg_name = addr;
+	msg.msg_namelen = addrlen;
+
+	/* If peer is shutdown, unshutdown it for this process */
+	sock_shutdown = sk->sk_shutdown;
+	sk->sk_shutdown &= ~SHUTDOWN_MASK;
+
+	/* Unshutdown peer too, if necessary */
+	if (unix_sk(sk)->peer) {
+		peer_shutdown = unix_sk(sk)->peer->sk_shutdown;
+		unix_sk(sk)->peer->sk_shutdown &= ~SHUTDOWN_MASK;
+	}
+
+	/* Make sure there's room in the send buffer */
+	sndbuf = sk->sk_sndbuf;
+	if (capable(CAP_NET_ADMIN) &&
+	    ((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len))
+		sk->sk_sndbuf += len;
+	else
+		sk->sk_sndbuf = sysctl_wmem_max;
+
 	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, len);
-	ckpt_debug("kernel_sendmsg(%i): %i\n", len, ret);
+	ckpt_debug("kernel_sendmsg(%i,%i): %i\n", h->src_objref, len, ret);
 	if ((ret > 0) && (ret != len))
 		ret = -ENOMEM;
+
+	sk->sk_sndbuf = sndbuf;
+	sk->sk_shutdown = sock_shutdown;
+	if (peer_shutdown)
+		unix_sk(sk)->peer->sk_shutdown = peer_shutdown;
  out:
+	ckpt_hdr_put(ctx, h);
 	kfree(buf);
 
 	return ret;
 }
 
 static int unix_read_buffers(struct ckpt_ctx *ctx,
-			     struct sock *sk, uint32_t *bufsize)
+			     struct sockaddr *addr,
+			     unsigned int addrlen)
 {
-	uint8_t sock_shutdown;
 	struct ckpt_hdr_socket_queue *h;
 	int ret = 0;
 	int i;
 
-	h = ckpt_sock_read_buffer_hdr(ctx, bufsize);
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_QUEUE);
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 
-	/* If peer is shutdown, unshutdown it for this process */
-	sock_shutdown = sk->sk_shutdown;
-	sk->sk_shutdown &= ~SHUTDOWN_MASK;
-
 	for (i = 0; i < h->skb_count; i++) {
-		ret = sock_read_buffer_sendmsg(ctx, sk);
+		ret = sock_read_buffer_sendmsg(ctx, addr, addrlen);
 		ckpt_debug("read_buffer_sendmsg(%i): %i\n", i, ret);
 		if (ret < 0)
-			break;
+			goto out;
 
 		if (ret > h->total_bytes) {
 			ckpt_debug("Buffers exceeded claim");
 			ret = -EINVAL;
-			break;
+			goto out;
 		}
 
 		h->total_bytes -= ret;
 		ret = 0;
 	}
 
-	sk->sk_shutdown = sock_shutdown;
+	ret = h->skb_count;
+ out:
 	ckpt_hdr_put(ctx, h);
 
 	return ret;
 }
 
+static int unix_deferred_restore_buffers(void *data)
+{
+	struct dq_buffers *dq = (struct dq_buffers *)data;
+	struct ckpt_ctx *ctx = dq->ctx;
+	struct sock *sk;
+	struct sockaddr *addr = NULL;
+	unsigned int addrlen = 0;
+	int ret;
+
+	sk = ckpt_obj_fetch(ctx, dq->sk_ref, CKPT_OBJ_SOCK);
+	if (!sk) {
+		ckpt_debug("Missing sock ref %i\n", dq->sk_ref);
+		return -EINVAL;
+	}
+
+	if ((sk->sk_type == SOCK_DGRAM) && (unix_sk(sk)->addr != NULL)) {
+		addr = (struct sockaddr *)&unix_sk(sk)->addr->name;
+		addrlen = unix_sk(sk)->addr->len;
+	}
+
+	ret = unix_read_buffers(ctx, addr, addrlen);
+	ckpt_debug("read recv buffers: %i\n", ret);
+	if (ret < 0)
+		return ret;
+
+	ret = unix_read_buffers(ctx, addr, addrlen);
+	ckpt_debug("read send buffers: %i\n", ret);
+	if (ret != 0)
+		ret = -EINVAL; /* No send buffers for UNIX sockets */
+
+	return ret;
+}
+
+static int unix_defer_restore_buffers(struct ckpt_ctx *ctx, int sk_ref)
+{
+	struct dq_buffers dq;
+
+	dq.ctx = ctx;
+	dq.sk_ref = sk_ref;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      unix_deferred_restore_buffers,
+			      unix_deferred_restore_buffers);
+}
+
 static struct unix_address *unix_makeaddr(struct sockaddr_un *sun_addr,
 					  unsigned len)
 {
@@ -206,91 +374,33 @@ static struct unix_address *unix_makeaddr(struct sockaddr_un *sun_addr,
 	return addr;
 }
 
-static int unix_join(struct ckpt_ctx *ctx,
-		     struct sock *a, struct sock *b,
-		     struct ckpt_hdr_socket_unix *un)
-{
-	struct unix_address *addr = NULL;
-
-	/* FIXME: Do we need to call some security hooks here? */
-
-	sock_hold(a);
-	sock_hold(b);
-
-	unix_sk(a)->peer = b;
-	unix_sk(b)->peer = a;
-
-	if (!UNIX_ADDR_EMPTY(un->raddr_len))
-		addr = unix_makeaddr(&un->raddr, un->raddr_len);
-	else if (!UNIX_ADDR_EMPTY(un->laddr_len))
-		addr = unix_makeaddr(&un->laddr, un->laddr_len);
-
-	if (IS_ERR(addr))
-		return PTR_ERR(addr);
-	else if (addr) {
-		atomic_inc(&addr->refcnt); /* Held by both ends */
-		unix_sk(a)->addr = unix_sk(b)->addr = addr;
-	}
-
-	return 0;
-}
-
 static int unix_restore_connected(struct ckpt_ctx *ctx,
-				  struct ckpt_socket *h,
+				  struct ckpt_hdr_socket *h,
 				  struct ckpt_hdr_socket_unix *un,
 				  struct socket *sock)
 {
-	struct sock *this = ckpt_obj_fetch(ctx, un->this, CKPT_OBJ_SOCK);
-	struct sock *peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
-	struct socket *tmp = NULL;
+	struct sock *sk = sock->sk;
+	struct sockaddr *addr = NULL;
+	unsigned int addrlen = 0;
 	int ret;
-
-	if (!IS_ERR(this) && !IS_ERR(peer)) {
-		/* We're last */
-		struct socket *old = this->sk_socket;
-
-		old->sk = NULL;
-		sock_release(old);
-		sock_graft(this, sock);
-
-	} else if ((PTR_ERR(this) == -EINVAL) && (PTR_ERR(peer) == -EINVAL)) {
-		/* We're first */
-		int family = sock->sk->sk_family;
-		int type = sock->sk->sk_type;
-
-		ret = sock_create(family, type, 0, &tmp);
-		ckpt_debug("sock_create: %i\n", ret);
-		if (ret)
-			goto out;
-
-		this = sock->sk;
-		peer = tmp->sk;
-
-		ret = ckpt_obj_insert(ctx, this, un->this, CKPT_OBJ_SOCK);
-		if (ret < 0)
-			goto out;
-
-		ret = ckpt_obj_insert(ctx, peer, un->peer, CKPT_OBJ_SOCK);
-		if (ret < 0)
-			goto out;
-
-		ret = unix_join(ctx, this, peer, un);
-		ckpt_debug("unix_join: %i\n", ret);
-		if (ret)
-			goto out;
-
-	} else {
-		ckpt_debug("Order Error\n");
-		ret = PTR_ERR(this);
-		goto out;
+	unsigned long flags = h->sock.flags;
+	int dead = test_bit(SOCK_DEAD, &flags);
+
+	if (un->peer == 0) {
+		/* These get propagated to the msghdr, so only set them
+		 * if we're not connected to a peer, else we'll get an error
+		 * when we sendmsg()
+		 */
+		addr = (struct sockaddr *)&un->laddr;
+		addrlen = un->laddr_len;
 	}
 
-	this->sk_peercred.pid = task_tgid_vnr(current);
+	sk->sk_peercred.pid = task_tgid_vnr(current);
 
 	if (may_setuid(ctx->realcred->user->user_ns, un->peercred_uid) &&
 	    may_setgid(un->peercred_gid)) {
-		this->sk_peercred.uid = un->peercred_uid;
-		this->sk_peercred.gid = un->peercred_gid;
+		sk->sk_peercred.uid = un->peercred_uid;
+		sk->sk_peercred.gid = un->peercred_gid;
 	} else {
 		ckpt_debug("peercred %i:%i would require setuid",
 			   un->peercred_uid, un->peercred_gid);
@@ -298,30 +408,16 @@ static int unix_restore_connected(struct ckpt_ctx *ctx,
 		goto out;
 	}
 
-	/* Prime the socket's buffer limit with the maximum.  These will be
-	 * overwritten with the values in the checkpoint stream in a later
-	 * phase.
-	 */
-	peer->sk_userlocks |= SOCK_SNDBUF_LOCK;
-	peer->sk_sndbuf = sysctl_wmem_max;
-
-	/* Read my buffers and sendmsg() them back to me via my peer */
-
-	/* TODO: handle the unconnected case, as well, as the case
-	 *       where sendto() has been used on some of the buffers
-	 */
-
-	ret = unix_read_buffers(ctx, peer, &peer->sk_sndbuf);
-	ckpt_debug("unix_read_buffers: %i\n", ret);
-	if (ret)
-		goto out;
+	if (!dead && (un->peer > 0)) {
+		ret = unix_defer_join(ctx, un->this, un->peer);
+		ckpt_debug("unix_defer_join: %i\n", ret);
+		if (ret)
+			goto out;
+	}
 
-	/* Read peer's buffers and expect 0 */
-	ret = unix_read_buffers(ctx, peer, NULL);
+	if (!dead)
+		ret = unix_defer_restore_buffers(ctx, un->this);
  out:
-	if (tmp && ret)
-		sock_release(tmp);
-
 	return ret;
 }
 
@@ -422,15 +518,19 @@ static int unix_fakebind(struct socket *sock,
 	return 0;
 }
 
-static int unix_restore_bind(struct ckpt_socket *h,
+static int unix_restore_bind(struct ckpt_hdr_socket *h,
 			     struct ckpt_hdr_socket_unix *un,
 			     struct socket *sock,
 			     const char *path)
 {
 	struct sockaddr *addr = (struct sockaddr *)&un->laddr;
 	unsigned long len = un->laddr_len;
+	unsigned long flags = h->sock.flags;
+	int dead = test_bit(SOCK_DEAD, &flags);
 
-	if (!un->laddr.sun_path[0])
+	if (dead)
+		return unix_fakebind(sock, &un->laddr, len);
+	else if (!un->laddr.sun_path[0])
 		return sock_bind(sock, addr, len);
 	else if (!(un->flags & CKPT_UNIX_LINKED))
 		return unix_fakebind(sock, &un->laddr, len);
@@ -439,9 +539,10 @@ static int unix_restore_bind(struct ckpt_socket *h,
 }
 
 /* Some easy pre-flight checks before we get underway */
-static int unix_precheck(struct socket *sock, struct ckpt_socket *h)
+static int unix_precheck(struct socket *sock, struct ckpt_hdr_socket *h)
 {
 	struct net *net = sock_net(sock->sk);
+	unsigned long sk_flags = h->sock.flags;
 
 	if ((h->socket.state == SS_CONNECTING) ||
 	    (h->socket.state == SS_DISCONNECTING) ||
@@ -461,7 +562,7 @@ static int unix_precheck(struct socket *sock, struct ckpt_socket *h)
 		return -EINVAL;
 	}
 
-	if (h->sock.flags & SOCK_USE_WRITE_QUEUE) {
+	if (test_bit(SOCK_USE_WRITE_QUEUE, &sk_flags)) {
 		ckpt_debug("AF_UNIX socket has SOCK_USE_WRITE_QUEUE set");
 		return -EINVAL;
 	}
@@ -470,7 +571,7 @@ static int unix_precheck(struct socket *sock, struct ckpt_socket *h)
 }
 
 int unix_restore(struct ckpt_ctx *ctx, struct socket *sock,
-		      struct ckpt_socket *h)
+		      struct ckpt_hdr_socket *h)
 
 {
 	struct ckpt_hdr_socket_unix *un;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/3] Add post-file deferqueue
       [not found]     ` <1251915760-20118-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-03  0:04       ` Oren Laadan
       [not found]         ` <4A9F0804.7080704-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Oren Laadan @ 2009-09-03  0:04 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> This is the deferq bit of Matt's patch, which is needed by the subsequent
> socket patch.

Hmm... I guess you missed my comments to his post - see below:

> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> Cc: Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/files.c               |   28 ++++++++++++++++++++++++++++
>  include/linux/checkpoint_types.h |    1 +
>  2 files changed, 29 insertions(+), 0 deletions(-)
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 204055b..784793c 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
>  #include <net/sock.h>
>  
>  
> @@ -289,11 +290,25 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
>  		goto out;
>  
>  	ckpt_debug("nfds %d\n", nfds);
> +	ctx->files_deferq = deferqueue_create();
> +	if (!ctx->files_deferq) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}

You can create/destroy the deferqueue once, in ckpt_ctx_alloc()
and ckpt_ctx_free(), respectively. It will also simplify the logic
here.

>  	for (n = 0; n < nfds; n++) {
>  		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
>  		if (ret < 0)
>  			break;
>  	}
> +	if (!ret) {
> +		ret = deferqueue_run(ctx->files_deferq);
> +		if (ret > 0) {
> +			pr_warning("c/r: files deferqueue had %d entries\n",
> +				   ret);
> +			ret = 0;
> +		}
> +	}
> +	deferqueue_destroy(ctx->files_deferq);
>   out:
>  	kfree(fdtable);
>  	return ret;
> @@ -692,11 +707,24 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
>  	if (ret < 0)
>  		goto out;
>  
> +	ret = -ENOMEM;
> +	ctx->files_deferq = deferqueue_create();
> +	if (!ctx->files_deferq)
> +		goto out;

Ditto for alloc/free of ctx->files_deferq.

Oren.

>  	for (i = 0; i < h->fdt_nfds; i++) {
>  		ret = restore_file_desc(ctx);
>  		if (ret < 0)
>  			break;
>  	}
> +	if (!ret) {
> +		ret = deferqueue_run(ctx->files_deferq);
> +		if (ret > 0) {
> +			pr_warning("c/r: files deferqueue had %d entries\n",
> +				   ret);
> +			ret = 0;
> +		}
> +	}
> +	deferqueue_destroy(ctx->files_deferq);
>   out:
>  	ckpt_hdr_put(ctx, h);
>  	if (!ret) {
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index a18846f..b6f130c 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -48,6 +48,7 @@ struct ckpt_ctx {
>  
>  	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
>  	struct deferqueue_head *deferqueue;	/* queue of deferred work */
> +	struct deferqueue_head *files_deferq;
>  
>  	struct path fs_mnt;     /* container root (FIXME) */
>  

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]     ` <1251915760-20118-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-03  0:32       ` Oren Laadan
       [not found]         ` <4A9F0E96.70106-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
  2009-09-03  5:40       ` Serge E. Hallyn
  1 sibling, 1 reply; 20+ messages in thread
From: Oren Laadan @ 2009-09-03  0:32 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> This changes the checkpoint/restart procedure for sockets a bit.  The
> socket file header is now checkpointed separately from the socket itself,
> which allows us to checkpoint a socket without arriving at it from a
> file descriptor.  Thus, most sockets will be checkpointed as a result
> of processing the file table, calling sock_file_checkpoint(fd), which
> in turn calls checkpoint_obj(socket).
> 
> However, we may arrive at some sockets while checkpointing other objects,
> such as the other end of an AF_UNIX socket with buffers in flight.  This
> patch just opens that door, which is utilized by the next patch.
> 
> Changes in v2:
>  - If we attempt to checkpoint an orphan socket, create a struct socket
>    to adopt it for the purposes of the checkpoint
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Nice. See comments inline.

> ---
>  checkpoint/objhash.c           |    2 +
>  include/linux/checkpoint_hdr.h |    6 +-
>  include/net/sock.h             |    2 +
>  net/checkpoint.c               |  140 +++++++++++++++++++++++++++++++--------
>  net/unix/checkpoint.c          |    3 +-
>  5 files changed, 120 insertions(+), 33 deletions(-)
> 

[...]

> -int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +static int __do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
>  {

> +static int do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct socket *sock;
> +	int ret;
> +
> +	if (sk->sk_socket)
> +		return __do_sock_checkpoint(ctx, sk);
> +

I wonder if temporarily grafting the @sk is better than explicitly
testing for sk->sk_socket everywhere else ?  (in some places, the test
already exists, so if we keep this approach, it should be removed).

As it is, it simplifies the code. However, how can we be certain that
grafting a dead socket doesn't break any assumptions elsewhere ?
For instance, unix_sock_destructor() will spit a warning if it finds
sk->sk_socket != NULL. There may be other places (per protocol ?).

The alternative is to always pass around an 'sk' instead of sometimes
a 'sock', and to skip parts of the checkpoint (and restart) entirely
for dead sockets.

[...]

Oren.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] [RFC] Track socket buffer owners
       [not found]     ` <1251915760-20118-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-03  1:52       ` Oren Laadan
       [not found]         ` <4A9F2158.4010405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Oren Laadan @ 2009-09-03  1:52 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> This patch is a superset of the previous attempt to store socket buffers
> with their owners.  It defers the writing and reading of the socket buffers
> until after the end of the file phase to avoid a recursive nose-dive of
> checkpointing socket owners.
> 
> This also moves the join logic to the deferqueue as well, since that too
> can lead us down a deep hole.  The buffer restore logic may perform a join
> if it decides that the join is inevitable (but not yet performed) and
> necessary.
> 
> Note that I've been staring at this for too long, so I'm sending it as an
> RFC with hopes that it's not too much of a mess.
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Nice, so this solves the chicken-and-egg of socket/peer and the recursion
of dgram socket "chains".

IIUC, sockets discovered while writing the receive buffers (being the
source of an skb) are checkpointed on the fly. In particular, they are
then added to the defer queue.

IOW, they are added to a list that is being traversed (executing the
deferred tasks). I think it's OK: deferqueue_add() uses list_add_tail()
and deferqueue_run user() uses list_for_each_entry_safe().

This probably deserves a comment ?

[...]

>  
> +struct ckpt_hdr_socket_buffer {
> +	struct ckpt_hdr h;
> +	__s32 src_objref;
> +	__s32 dst_objref;
> +};

Super nit: perhaps use 'sk_objref' for socket, and 'peer_objref' for
peer. The latter won't be used in inet sockets. Also, while effectively
a no-op, dst_objref is "source" for send buffers :p

[...]

 > +int sock_deferred_write_buffers(void *data)
> +{
> +	struct dq_buffers *dq = (struct dq_buffers *)data;
> +	struct ckpt_ctx *ctx = dq->ctx;
> +	int ret;
> +	int dst_objref;
> +
> +	dst_objref = ckpt_obj_lookup(ctx, dq->sk, CKPT_OBJ_SOCK);
> +	if (dst_objref < 0) {
> +		ckpt_write_err(ctx,
> +			       "Unable to get objref of owner socket: %i\n",
> +			       dst_objref);
> +		return dst_objref;
> +	}
> +
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
> +	ckpt_debug("write recv buffers: %i\n", ret);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
								^^^^^^^ :p

> +	ckpt_debug("write send buffers: %i\n", ret);
> +
> +	return ret;
> +}
> +
> +int sock_defer_write_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct dq_buffers dq;
> +
> +	dq.ctx = ctx;
> +	dq.sk = sk;
> +
> +	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
> +			      sock_deferred_write_buffers,
> +			      sock_deferred_write_buffers);

The dtor should be NULL ?

[...]

 >  static int sock_restore_flags(struct socket *sock,
> -                             struct ckpt_socket *h)
> +                             struct ckpt_hdr_socket *h)
>  {
>         int ret;
>         int i;
> @@ -309,6 +376,9 @@ static int sock_restore_flags(struct socket *sock,
>                 return -ENOSYS;
>         }
>  
> +       if (test_and_clear_bit(SOCK_DEAD, &sk_flags))
> +	       sock_flag(sock->sk, SOCK_DEAD);
> +

I think this snippet belongs to the 1st patch.

[...]

> +struct dq_join {
> +	struct ckpt_ctx *ctx;
> +	int src_ref;
> +	int dst_ref;
> +};
> +
> +struct dq_buffers {
> +	struct ckpt_ctx *ctx;
> +	int sk_ref; /* objref of the socket these buffers belong to */
> +};

Another nit:  s/ref/objref/  (consistency...)

[...]

> +static int unix_defer_join(struct ckpt_ctx *ctx,
> +			   int src_ref,
> +			   int dst_ref)
> +{
> +	struct dq_join dq;
> +
> +	dq.ctx = ctx;
> +	dq.src_ref = src_ref;
> +	dq.dst_ref = dst_ref;
> +
> +	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
> +			      unix_deferred_join, unix_deferred_join);

Here, too, dtor should be NULL ?

[...]

> +
> +	/* If we don't have a destination or a peer and we know the
> +	 * destination of this skb, then we must need to join with our
> +	 * peer
> +	 */
> +	if (!addrlen && !unix_sk(sk)->peer && (h->dst_objref != 0)) {

IIUC, h->dst_objref should always be > 0, no ?

[...]

> +static int unix_deferred_restore_buffers(void *data)
> +{
> +	struct dq_buffers *dq = (struct dq_buffers *)data;
> +	struct ckpt_ctx *ctx = dq->ctx;
> +	struct sock *sk;
> +	struct sockaddr *addr = NULL;
> +	unsigned int addrlen = 0;
> +	int ret;
> +
> +	sk = ckpt_obj_fetch(ctx, dq->sk_ref, CKPT_OBJ_SOCK);
> +	if (!sk) {
> +		ckpt_debug("Missing sock ref %i\n", dq->sk_ref);
> +		return -EINVAL;
> +	}
> +
> +	if ((sk->sk_type == SOCK_DGRAM) && (unix_sk(sk)->addr != NULL)) {
> +		addr = (struct sockaddr *)&unix_sk(sk)->addr->name;
> +		addrlen = unix_sk(sk)->addr->len;
> +	}
> +
> +	ret = unix_read_buffers(ctx, addr, addrlen);
> +	ckpt_debug("read recv buffers: %i\n", ret);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = unix_read_buffers(ctx, addr, addrlen);
> +	ckpt_debug("read send buffers: %i\n", ret);
> +	if (ret != 0)
> +		ret = -EINVAL; /* No send buffers for UNIX sockets */

Change to "if (ret > 0)", otherwise you may mask out a real error.

> +
> +	return ret;
> +}
> +
> +static int unix_defer_restore_buffers(struct ckpt_ctx *ctx, int sk_ref)
> +{
> +	struct dq_buffers dq;
> +
> +	dq.ctx = ctx;
> +	dq.sk_ref = sk_ref;
> +
> +	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
> +			      unix_deferred_restore_buffers,
> +			      unix_deferred_restore_buffers);

... dtor should be NULL.

[...]

Oren.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]     ` <1251915760-20118-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-03  0:32       ` Oren Laadan
@ 2009-09-03  5:40       ` Serge E. Hallyn
       [not found]         ` <20090903054058.GA7189-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 20+ messages in thread
From: Serge E. Hallyn @ 2009-09-03  5:40 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> +static int do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct socket *sock;
> +	int ret;
> +
> +	if (sk->sk_socket)
> +		return __do_sock_checkpoint(ctx, sk);
> +
> +	/* Temporarily adopt this orphan socket */
> +	ret = sock_create(sk->sk_family, sk->sk_type, 0, &sock);
> +	if (ret < 0)
> +		return ret;
> +	sock_graft(sk, sock);
> +
> +	ret = __do_sock_checkpoint(ctx, sk);

I'm sure I sound like an idiot, but... at restore, a socket will
be created for sk now.  Is that a problem?  I don't see where
sk_free() will cause that sock to be freed, and you are not
attaching it do a file whose close would cause it to be released...

-serge

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]         ` <20090903054058.GA7189-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-03 14:03           ` Dan Smith
       [not found]             ` <87eiqoyvkz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-03 14:03 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> I'm sure I sound like an idiot, but... at restore, a socket will
SH> be created for sk now.  Is that a problem?  I don't see where
SH> sk_free() will cause that sock to be freed, and you are not
SH> attaching it do a file whose close would cause it to be
SH> released...

In the regular socket code, the reference that is taken during
allocation is assumed to be for the owning object (the file).  So,
they don't take another ref when they actually attach it to a file.

The objhash assumes that for all objects on restore, the allocation
routine for the object has already incremented the reference count for
it as the owner.  If you look at restore_obj() it drops the ref count
right after stuffing the object in the hash because the act of
insertion grabs a reference.  On checkpoint, this is the reference for
the hash.  On restore, it would be redundant because it is the first
owner (not a file, etc).

My code takes *another* reference when it attaches to a file, if
appropriate.  In the case of the adopted socket, only the objhash
holds a reference to it, so unless it was joined to a peer, it will be
freed when the objhash does its final obj_ref_drop() at tear-down
time.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]             ` <87eiqoyvkz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-09-03 14:43               ` Serge E. Hallyn
       [not found]                 ` <20090903144341.GA13182-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-09-03 14:46               ` Oren Laadan
  1 sibling, 1 reply; 20+ messages in thread
From: Serge E. Hallyn @ 2009-09-03 14:43 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> SH> I'm sure I sound like an idiot, but... at restore, a socket will
> SH> be created for sk now.  Is that a problem?  I don't see where
> SH> sk_free() will cause that sock to be freed, and you are not
> SH> attaching it do a file whose close would cause it to be
> SH> released...
> 
> In the regular socket code, the reference that is taken during
> allocation is assumed to be for the owning object (the file).  So,
> they don't take another ref when they actually attach it to a file.
> 
> The objhash assumes that for all objects on restore, the allocation
> routine for the object has already incremented the reference count for
> it as the owner.  If you look at restore_obj() it drops the ref count
> right after stuffing the object in the hash because the act of
> insertion grabs a reference.  On checkpoint, this is the reference for
> the hash.  On restore, it would be redundant because it is the first
> owner (not a file, etc).
> 
> My code takes *another* reference when it attaches to a file, if
> appropriate.  In the case of the adopted socket, only the objhash
> holds a reference to it, so unless it was joined to a peer, it will be
> freed when the objhash does its final obj_ref_drop() at tear-down
> time.

Yes.  On the struct sock.  But what will drop the ref on the struct
socket?  Or has one of your later patches, not yet in ckpt-v17-dev,
added that?  Or, am I just missing a place where sock_put() will
actually sock_release(sk->sk_socket)?

-serge

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]             ` <87eiqoyvkz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-09-03 14:43               ` Serge E. Hallyn
@ 2009-09-03 14:46               ` Oren Laadan
  1 sibling, 0 replies; 20+ messages in thread
From: Oren Laadan @ 2009-09-03 14:46 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Thu, 3 Sep 2009, Dan Smith wrote:

> SH> I'm sure I sound like an idiot, but... at restore, a socket will
> SH> be created for sk now.  Is that a problem?  I don't see where
> SH> sk_free() will cause that sock to be freed, and you are not
> SH> attaching it do a file whose close would cause it to be
> SH> released...
> 
> In the regular socket code, the reference that is taken during
> allocation is assumed to be for the owning object (the file).  So,
> they don't take another ref when they actually attach it to a file.
> 
> The objhash assumes that for all objects on restore, the allocation
> routine for the object has already incremented the reference count for
> it as the owner.  If you look at restore_obj() it drops the ref count
> right after stuffing the object in the hash because the act of
> insertion grabs a reference.  On checkpoint, this is the reference for
> the hash.  On restore, it would be redundant because it is the first
> owner (not a file, etc).
> 
> My code takes *another* reference when it attaches to a file, if
> appropriate.  In the case of the adopted socket, only the objhash
> holds a reference to it, so unless it was joined to a peer, it will be
> freed when the objhash does its final obj_ref_drop() at tear-down
> time.

Hmm... one little detail is missing: if this is a dead socket,
then the reason we checkpoint it is either:

1) because it is a peer of some other socket: on restart we will
'join' them and bump the reference. This reference will be removed
when the other socket is closed - and the (dead) sk will be freed.

2) because it is the source of an skb sent to another socket. The
unix socket code will take an extra reference to the (dead) sk
in the skb that is placed in the other socket's receive queue. 
This reference will be dropped when the skb is consumed and the
(dead) sk will be freed.

(In both cases, the extra reference keeps the dead sk around after
the objhash is cleaned up).

Oren.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]                 ` <20090903144341.GA13182-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-03 15:08                   ` Dan Smith
  0 siblings, 0 replies; 20+ messages in thread
From: Dan Smith @ 2009-09-03 15:08 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> Yes.  On the struct sock.  But what will drop the ref on the
SH> struct socket?  Or has one of your later patches, not yet in
SH> ckpt-v17-dev, added that?  Or, am I just missing a place where
SH> sock_put() will actually sock_release(sk->sk_socket)?

Hmm, I see what you mean.  I can't find any path where sock_put() will
release the struct socket.  What's weird is that there is a WARN_ON()
in af_unix.c:354 that should get tripped if we call sk_free() when we
still have a socket.  I don't see that, but now I'm not sure why.

Perhaps what we should do is orphan the struct sock before we add it
to the hash and then graft it onto a new struct socket before
attaching it to a struct file?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/3] Add post-file deferqueue
       [not found]         ` <4A9F0804.7080704-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-03 15:19           ` Dan Smith
  0 siblings, 0 replies; 20+ messages in thread
From: Dan Smith @ 2009-09-03 15:19 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> Hmm... I guess you missed my comments to his post - see below:

Yep, I missed them, sorry.  I was just including Matt's code almost
verbatim here.  I'll make those changes.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]         ` <4A9F0E96.70106-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-03 15:21           ` Dan Smith
  0 siblings, 0 replies; 20+ messages in thread
From: Dan Smith @ 2009-09-03 15:21 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> As it is, it simplifies the code. However, how can we be certain
OL> that grafting a dead socket doesn't break any assumptions
OL> elsewhere ?  For instance, unix_sock_destructor() will spit a
OL> warning if it finds
sk-> sk_socket != NULL. There may be other places (per protocol ?).

I think that in most cases the grafted struct socket is short-lived
and therefore shouldn't be a problem with something like the
destructor.

I definitely prefer assuming we have an sk->sk_socket elsewhere and
will be happy to remove the other checks, unless there's a compelling
reason not to.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] [RFC] Track socket buffer owners
       [not found]         ` <4A9F2158.4010405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-03 15:24           ` Dan Smith
       [not found]             ` <87skf4xd9a.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-03 15:24 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> Super nit: perhaps use 'sk_objref' for socket, and 'peer_objref' for
OL> peer.

Can I use sk_objref and pr_objref?  It will annoy me to no end if the
underscores don't line up :)

I'll make the other suggested changes, thanks :)

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] [RFC] Track socket buffer owners
       [not found]             ` <87skf4xd9a.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-09-03 15:46               ` Oren Laadan
  0 siblings, 0 replies; 20+ messages in thread
From: Oren Laadan @ 2009-09-03 15:46 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Thu, 3 Sep 2009, Dan Smith wrote:

> OL> Super nit: perhaps use 'sk_objref' for socket, and 'peer_objref' for
> OL> peer.
> 
> Can I use sk_objref and pr_objref?  It will annoy me to no end if the
> underscores don't line up :)

:D

(I coulnd't have said it better...)

> 
> I'll make the other suggested changes, thanks :)
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found] ` <1252508756-4278-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-09 15:05   ` Dan Smith
       [not found]     ` <1252508756-4278-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-09 15:05 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This changes the checkpoint/restart procedure for sockets a bit.  The
socket file header is now checkpointed separately from the socket itself,
which allows us to checkpoint a socket without arriving at it from a
file descriptor.  Thus, most sockets will be checkpointed as a result
of processing the file table, calling sock_file_checkpoint(fd), which
in turn calls checkpoint_obj(socket).

However, we may arrive at some sockets while checkpointing other objects,
such as the other end of an AF_UNIX socket with buffers in flight.  This
patch just opens that door, which is utilized by the next patch.

Changes in v2:
 - If we attempt to checkpoint an orphan socket, create a struct socket
   to adopt it for the purposes of the checkpoint

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/objhash.c           |    2 +
 include/linux/checkpoint_hdr.h |    6 +-
 include/linux/net.h            |    4 +-
 include/net/af_unix.h          |    4 +-
 include/net/sock.h             |    2 +
 net/checkpoint.c               |  153 +++++++++++++++++++++++++++++++---------
 net/unix/checkpoint.c          |   11 ++--
 7 files changed, 136 insertions(+), 46 deletions(-)

diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index a9a10d1..a410346 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -381,6 +381,8 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.obj_type = CKPT_OBJ_SOCK,
 		.ref_drop = obj_sock_drop,
 		.ref_grab = obj_sock_grab,
+		.checkpoint = checkpoint_sock,
+		.restore = restore_sock,
 	},
 };
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 06bc6e2..b75562c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -70,6 +70,7 @@ enum {
 	CKPT_HDR_USER,
 	CKPT_HDR_GROUPINFO,
 	CKPT_HDR_TASK_CREDS,
+	CKPT_HDR_SOCKET,
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -368,7 +369,8 @@ struct ckpt_hdr_file_pipe {
 } __attribute__((aligned(8)));
 
 /* socket */
-struct ckpt_socket {
+struct ckpt_hdr_socket {
+	struct ckpt_hdr h;
 	struct { /* struct socket */
 		__u64 flags;
 		__u8 state;
@@ -428,7 +430,7 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_file_socket {
 	struct ckpt_hdr_file common;
-	struct ckpt_socket socket;
+	__s32 sock_objref;
 } __attribute__((aligned(8)));
 
 struct ckpt_hdr_utsns {
diff --git a/include/linux/net.h b/include/linux/net.h
index 27187a4..96c7e22 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -148,7 +148,7 @@ struct msghdr;
 struct module;
 
 struct ckpt_ctx;
-struct ckpt_socket;
+struct ckpt_hdr_socket;
 
 struct proto_ops {
 	int		family;
@@ -197,7 +197,7 @@ struct proto_ops {
 	int		(*checkpoint)(struct ckpt_ctx *ctx,
 				      struct socket *sock);
 	int		(*restore)(struct ckpt_ctx *ctx, struct socket *sock,
-				   struct ckpt_socket *h);
+				   struct ckpt_hdr_socket *h);
 };
 
 struct net_proto_family {
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1a1fd20..61f666b 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -71,10 +71,10 @@ static inline void unix_sysctl_unregister(struct net *net) {}
 
 #ifdef CONFIG_CHECKPOINT
 struct ckpt_ctx;
-struct ckpt_socket;
+struct ckpt_hdr_socket;
 extern int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
 extern int unix_restore(struct ckpt_ctx *ctx, struct socket *sock,
-			struct ckpt_socket *h);
+			struct ckpt_hdr_socket *h);
 #else
 #define unix_checkpoint NULL
 #define unix_restore NULL
diff --git a/include/net/sock.h b/include/net/sock.h
index 8e3b050..0db1ca3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1644,6 +1644,8 @@ extern __u32 sysctl_rmem_default;
 /* Checkpoint/Restart Functions */
 struct ckpt_ctx;
 struct ckpt_hdr_file;
+extern int checkpoint_sock(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_sock(struct ckpt_ctx *ctx);
 extern int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
 extern struct file *sock_file_restore(struct ckpt_ctx *ctx,
 				      struct ckpt_hdr_file *h);
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 2541e81..d52389a 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -166,7 +166,7 @@ int ckpt_sock_getnames(struct ckpt_ctx *ctx, struct socket *sock,
 	return 0;
 }
 
-static int sock_cptrst_verify(struct ckpt_socket *h)
+static int sock_cptrst_verify(struct ckpt_hdr_socket *h)
 {
 	uint8_t userlocks_mask = SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK |
 		                 SOCK_BINDADDR_LOCK | SOCK_BINDPORT_LOCK;
@@ -204,7 +204,7 @@ static int sock_cptrst_opt(int op, struct socket *sock,
 	sock_cptrst_opt(op, sk->sk_socket, name, (char *)opt, sizeof(*opt))
 
 static int sock_cptrst_bufopts(int op, struct sock *sk,
-			       struct ckpt_socket *h)
+			       struct ckpt_hdr_socket *h)
 
 {
 	if (CKPT_COPY_SOPT(op, sk, SO_RCVBUF, &h->sock.rcvbuf))
@@ -270,7 +270,7 @@ static int sock_restore_flag(struct socket *sock,
 
 
 static int sock_restore_flags(struct socket *sock,
-                             struct ckpt_socket *h)
+                             struct ckpt_hdr_socket *h)
 {
        int ret;
        int i;
@@ -309,6 +309,9 @@ static int sock_restore_flags(struct socket *sock,
                return -ENOSYS;
        }
 
+       if (test_and_clear_bit(SOCK_DEAD, &sk_flags))
+	       sock_flag(sock->sk, SOCK_DEAD);
+
        /* Anything that is still set in the flags that isn't part of
         * our protocol's default set, indicates an error
         */
@@ -339,7 +342,7 @@ static int sock_copy_timeval(int op, struct sock *sk,
 }
 
 static int sock_cptrst(struct ckpt_ctx *ctx, struct sock *sk,
-		       struct ckpt_socket *h, int op)
+		       struct ckpt_hdr_socket *h, int op)
 {
 	if (sk->sk_socket) {
 		CKPT_COPY(op, h->socket.state, sk->sk_socket->state);
@@ -428,31 +431,26 @@ static int sock_cptrst(struct ckpt_ctx *ctx, struct sock *sk,
 		return 0;
 }
 
-int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+static int __do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
 {
-	struct ckpt_hdr_file_socket *h;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
 	int ret;
+	struct socket *sock = sk->sk_socket;
+	struct ckpt_hdr_socket *h;
 
 	if (!sock->ops->checkpoint) {
 		ckpt_write_err(ctx, "socket (proto_ops: %pS)", sock->ops);
 		return -ENOSYS;
 	}
 
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
 	if (!h)
 		return -ENOMEM;
 
-	h->common.f_type = CKPT_FILE_SOCKET;
-
 	/* part I: common to all sockets */
-	ret = sock_cptrst(ctx, sk, &h->socket, CKPT_CPT);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_file_common(ctx, file, &h->common);
+	ret = sock_cptrst(ctx, sk, h, CKPT_CPT);
 	if (ret < 0)
 		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (ret < 0)
 		goto out;
@@ -463,12 +461,71 @@ int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
 		goto out;
 
 	/* part III: socket buffers */
-	if (sk->sk_state != TCP_LISTEN) {
+	if ((sk->sk_state != TCP_LISTEN) && (!sock_flag(sk, SOCK_DEAD))) {
 		ret = sock_write_buffers(ctx, &sk->sk_receive_queue);
 		if (ret)
 			goto out;
 		ret = sock_write_buffers(ctx, &sk->sk_write_queue);
 	}
+
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int do_sock_checkpoint(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct socket *sock;
+	int ret;
+
+	if (sk->sk_socket)
+		return __do_sock_checkpoint(ctx, sk);
+
+	/* Temporarily adopt this orphan socket */
+	ret = sock_create(sk->sk_family, sk->sk_type, 0, &sock);
+	if (ret < 0)
+		return ret;
+	sock_graft(sk, sock);
+
+	ret = __do_sock_checkpoint(ctx, sk);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+	sock_release(sock);
+
+	return ret;
+}
+
+int checkpoint_sock(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_sock_checkpoint(ctx, (struct sock *)ptr);
+}
+
+int sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_socket *h;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_SOCKET;
+
+	h->sock_objref = checkpoint_obj(ctx, sk, CKPT_OBJ_SOCK);
+	if (h->sock_objref < 0) {
+		ret = h->sock_objref;
+		goto out;
+	}
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
@@ -525,27 +582,31 @@ static struct file *sock_alloc_attach_fd(struct socket *sock)
 		file = ERR_PTR(err);
 	}
 
+	/* Since objhash assumes the initial reference for a socket,
+	 * we bump it here for this descriptor, unlike other places in the
+	 * socket code which assume the descriptor is the owner.
+	 */
+	sock_hold(sock->sk);
+
 	return file;
 }
 
-struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 {
-	struct ckpt_hdr_file_socket *hh = (struct ckpt_hdr_file_socket *) ptr;
-	struct ckpt_socket *h = &hh->socket;
+	struct ckpt_hdr_socket *h;
 	struct socket *sock;
-	struct file *file;
 	int ret;
 
-	if (ptr->h.type != CKPT_HDR_FILE  ||
-	    ptr->h.len != sizeof(*hh) || ptr->f_type != CKPT_FILE_SOCKET)
-		return ERR_PTR(-EINVAL);
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
 
 	/* silently clear flags, e.g. SOCK_NONBLOCK or SOCK_CLOEXEC */
 	h->sock.type &= SOCK_TYPE_MASK;
 
 	ret = sock_create(h->sock_common.family, h->sock.type, 0, &sock);
 	if (ret < 0)
-		return ERR_PTR(ret);
+		goto err;
 
 	if (!sock->ops->restore) {
 		ckpt_debug("proto_ops lacks checkpoint: %pS\n", sock->ops);
@@ -566,21 +627,45 @@ struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
 	if (ret < 0)
 		goto err;
 
-	file = sock_alloc_attach_fd(sock);
-	if (IS_ERR(file)) {
-		ret = PTR_ERR(file);
-		goto err;
-	}
+	ckpt_hdr_put(ctx, h);
+
+	return sock->sk;
+ err:
+	ckpt_hdr_put(ctx, h);
+	sock_release(sock);
+
+	return ERR_PTR(ret);
+}
+
+void *restore_sock(struct ckpt_ctx *ctx)
+{
+	return do_sock_restore(ctx);
+}
+
+struct file *sock_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+{
+	struct ckpt_hdr_file_socket *h = (struct ckpt_hdr_file_socket *)ptr;
+	struct sock *sk;
+	struct file *file;
+	int ret;
+
+	if (ptr->h.type != CKPT_HDR_FILE || ptr->f_type != CKPT_FILE_SOCKET)
+		return ERR_PTR(-EINVAL);
+
+	sk = ckpt_obj_fetch(ctx, h->sock_objref, CKPT_OBJ_SOCK);
+	if (IS_ERR(sk))
+		return ERR_PTR(PTR_ERR(sk));
+
+	file = sock_alloc_attach_fd(sk->sk_socket);
+	if (IS_ERR(file))
+		return file;
 
 	ret = restore_file_common(ctx, file, ptr);
 	if (ret < 0) {
 		fput(file);
-		file = ERR_PTR(ret);
+		return ERR_PTR(ret);
 	}
-	return file;
 
- err:
-	sock_release(sock);
-	return ERR_PTR(ret);
+	return file;
 }
 
diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
index 08e664b..395f6fd 100644
--- a/net/unix/checkpoint.c
+++ b/net/unix/checkpoint.c
@@ -57,7 +57,6 @@ static int unix_write_cwd(struct ckpt_ctx *ctx,
 int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct unix_sock *sk = unix_sk(sock->sk);
-	struct unix_sock *pr = unix_sk(sk->peer);
 	struct ckpt_hdr_socket_unix *un;
 	int new;
 	int ret = -ENOMEM;
@@ -86,7 +85,7 @@ int unix_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 		goto out;
 
 	if (sk->peer)
-		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+		un->peer = checkpoint_obj(ctx, sk->peer, CKPT_OBJ_SOCK);
 	else
 		un->peer = 0;
 
@@ -237,7 +236,7 @@ static int unix_join(struct ckpt_ctx *ctx,
 }
 
 static int unix_restore_connected(struct ckpt_ctx *ctx,
-				  struct ckpt_socket *h,
+				  struct ckpt_hdr_socket *h,
 				  struct ckpt_hdr_socket_unix *un,
 				  struct socket *sock)
 {
@@ -423,7 +422,7 @@ static int unix_fakebind(struct socket *sock,
 	return 0;
 }
 
-static int unix_restore_bind(struct ckpt_socket *h,
+static int unix_restore_bind(struct ckpt_hdr_socket *h,
 			     struct ckpt_hdr_socket_unix *un,
 			     struct socket *sock,
 			     const char *path)
@@ -440,7 +439,7 @@ static int unix_restore_bind(struct ckpt_socket *h,
 }
 
 /* Some easy pre-flight checks before we get underway */
-static int unix_precheck(struct socket *sock, struct ckpt_socket *h)
+static int unix_precheck(struct socket *sock, struct ckpt_hdr_socket *h)
 {
 	struct net *net = sock_net(sock->sk);
 
@@ -471,7 +470,7 @@ static int unix_precheck(struct socket *sock, struct ckpt_socket *h)
 }
 
 int unix_restore(struct ckpt_ctx *ctx, struct socket *sock,
-		      struct ckpt_socket *h)
+		      struct ckpt_hdr_socket *h)
 
 {
 	struct ckpt_hdr_socket_unix *un;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]     ` <1252508756-4278-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-09-09 23:02       ` Oren Laadan
       [not found]         ` <4AA833F5.3040706-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Oren Laadan @ 2009-09-09 23:02 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> This changes the checkpoint/restart procedure for sockets a bit.  The
> socket file header is now checkpointed separately from the socket itself,
> which allows us to checkpoint a socket without arriving at it from a
> file descriptor.  Thus, most sockets will be checkpointed as a result
> of processing the file table, calling sock_file_checkpoint(fd), which
> in turn calls checkpoint_obj(socket).
> 
> However, we may arrive at some sockets while checkpointing other objects,
> such as the other end of an AF_UNIX socket with buffers in flight.  This
> patch just opens that door, which is utilized by the next patch.
> 
> Changes in v2:
>  - If we attempt to checkpoint an orphan socket, create a struct socket
>    to adopt it for the purposes of the checkpoint
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Looks good !

Did you also address this ?
https://lists.linux-foundation.org/pipermail/containers/2009-September/020385.html

-----
SH> Yes.  On the struct sock.  But what will drop the ref on the
SH> struct socket?  Or has one of your later patches, not yet in
SH> ckpt-v17-dev, added that?  Or, am I just missing a place where
SH> sock_put() will actually sock_release(sk->sk_socket)?

Hmm, I see what you mean.  I can't find any path where sock_put() will
release the struct socket.  What's weird is that there is a WARN_ON()
in af_unix.c:354 that should get tripped if we call sk_free() when we
still have a socket.  I don't see that, but now I'm not sure why.

Perhaps what we should do is orphan the struct sock before we add it
to the hash and then graft it onto a new struct socket before
attaching it to a struct file?
-----

[The reason sk_free() expects no sk_socket is because the socket at
this point must have been released already via proto_ops->release().
The callback is assumed to orphan the socket].

Oren.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]         ` <4AA833F5.3040706-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
@ 2009-09-11 14:31           ` Dan Smith
       [not found]             ` <87d45xlfhz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Dan Smith @ 2009-09-11 14:31 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> Did you also address this ?

Sorry, I meant to put something in the intro mail about this.

OL> [The reason sk_free() expects no sk_socket is because the socket
OL> at this point must have been released already via
OL> proto_ops->release().  The callback is assumed to orphan the
OL> socket].

Right, which is why we're not seeing the warning.  I've got a
follow-on patch to the current set started but wanted to avoid packing
more onto these two patches.  If you'd like me to resubmit the set and
include that patch (when finished) I'll do that.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2)
       [not found]             ` <87d45xlfhz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-09-11 19:46               ` Oren Laadan
  0 siblings, 0 replies; 20+ messages in thread
From: Oren Laadan @ 2009-09-11 19:46 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> OL> Did you also address this ?
> 
> Sorry, I meant to put something in the intro mail about this.
> 
> OL> [The reason sk_free() expects no sk_socket is because the socket
> OL> at this point must have been released already via
> OL> proto_ops->release().  The callback is assumed to orphan the
> OL> socket].
> 
> Right, which is why we're not seeing the warning.  I've got a
> follow-on patch to the current set started but wanted to avoid packing
> more onto these two patches.  If you'd like me to resubmit the set and
> include that patch (when finished) I'll do that.
> 

I prefer not to pull in code that we know has a leak, but rather
wait for the fixed version. I'll take the 2nd patch (post-file
deferqueue) now.

Oren.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2009-09-11 19:46 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-02 18:22 [RFC] Sockets as proper objects and buffers with owners Dan Smith
     [not found] ` <1251915760-20118-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-02 18:22   ` [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2) Dan Smith
     [not found]     ` <1251915760-20118-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-03  0:32       ` Oren Laadan
     [not found]         ` <4A9F0E96.70106-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-03 15:21           ` Dan Smith
2009-09-03  5:40       ` Serge E. Hallyn
     [not found]         ` <20090903054058.GA7189-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-03 14:03           ` Dan Smith
     [not found]             ` <87eiqoyvkz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-09-03 14:43               ` Serge E. Hallyn
     [not found]                 ` <20090903144341.GA13182-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-03 15:08                   ` Dan Smith
2009-09-03 14:46               ` Oren Laadan
2009-09-02 18:22   ` [PATCH 2/3] Add post-file deferqueue Dan Smith
     [not found]     ` <1251915760-20118-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-03  0:04       ` Oren Laadan
     [not found]         ` <4A9F0804.7080704-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-03 15:19           ` Dan Smith
2009-09-02 18:22   ` [PATCH 3/3] [RFC] Track socket buffer owners Dan Smith
     [not found]     ` <1251915760-20118-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-03  1:52       ` Oren Laadan
     [not found]         ` <4A9F2158.4010405-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-03 15:24           ` Dan Smith
     [not found]             ` <87skf4xd9a.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-09-03 15:46               ` Oren Laadan
  -- strict thread matches above, loose matches on Subject: below --
2009-09-09 15:05 Dan Smith
     [not found] ` <1252508756-4278-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-09 15:05   ` [PATCH 1/3] Make sockets proper objhash objects and use checkpoint_obj() on them (v2) Dan Smith
     [not found]     ` <1252508756-4278-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-09 23:02       ` Oren Laadan
     [not found]         ` <4AA833F5.3040706-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-09-11 14:31           ` Dan Smith
     [not found]             ` <87d45xlfhz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-09-11 19:46               ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.