All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] c/r: Add AF_UNIX support
@ 2009-06-03 15:18 Dan Smith
       [not found] ` <1244042305-7770-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Dan Smith @ 2009-06-03 15:18 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
has been tested with a single and multiple processes, and with data inflight
at the time of checkpoint.  It supports both socketpair()s and path-based
sockets.

I have an almost-working AF_INET follow-on to this which I can submit after
this is reviewed and tweaked into acceptance.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c             |    7 +
 checkpoint/objhash.c           |   27 +++
 include/linux/checkpoint_hdr.h |   71 ++++++++
 include/net/sock.h             |    8 +
 net/Makefile                   |    2 +
 net/socket.c                   |   58 ++++++
 net/socket_cr.c                |  378 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 551 insertions(+), 0 deletions(-)
 create mode 100644 net/socket_cr.c

diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..bb2cca0 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 
 /**************************************************************************
@@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_PIPE,
 		.restore = pipe_file_restore,
 	},
+	/* socket */
+	{
+		.file_name = "SOCKET",
+		.file_type = CKPT_FILE_SOCKET,
+		.restore = sock_file_restore,
+	},
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..7819e5e 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 struct ckpt_obj;
 struct ckpt_obj_ops;
@@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
 	return atomic_read(&((struct ipc_namespace *) ptr)->count);
 }
 
+static int obj_sock_grab(void *ptr)
+{
+	sock_hold((struct sock *) ptr);	/* ptr is treated as a struct sock */
+	return 0;			/* always reports success */
+}
+
+static void obj_sock_drop(void *ptr)
+{
+	sock_put((struct sock *) ptr);	/* pairs with obj_sock_grab()'s hold */
+}
+
+static int obj_sock_users(void *ptr)
+{
+	return atomic_read(&((struct sock *) ptr)->sk_refcnt); /* sk_refcnt now */
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_bad,
 		.restore = restore_bad,
 	},
+	/* sock object */
+	{
+		.obj_name = "SOCKET",
+		.obj_type = CKPT_OBJ_SOCK,
+		.ref_drop = obj_sock_drop,
+		.ref_grab = obj_sock_grab,
+		.ref_users = obj_sock_users,
+		.checkpoint = sock_file_checkpoint,
+		.restore = sock_file_restore,
+	},
 };
 
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index cd427d8..252331a 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -12,6 +12,13 @@
 
 #include <linux/types.h>
 #include <linux/utsname.h>
+#include <linux/socket.h>
+
+/* In userspace, bring in the struct sockaddr_* definitions */
+#ifndef __KERNEL__
+#include <sys/socket.h>
+#include <sys/types.h>
+#endif
 
 /*
  * To maintain compatibility between 32-bit and 64-bit architecture flavors,
@@ -76,6 +83,11 @@ enum {
 	CKPT_HDR_IPC_MSG_MSG,
 	CKPT_HDR_IPC_SEM,
 
+ 	CKPT_HDR_FD_SOCKET = 601,
+ 	CKPT_HDR_SOCKET,
+ 	CKPT_HDR_SOCKET_BUFFERS,
+ 	CKPT_HDR_SOCKET_BUFFER,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -103,6 +115,7 @@ enum obj_type {
 	CKPT_OBJ_NS,
 	CKPT_OBJ_UTS_NS,
 	CKPT_OBJ_IPC_NS,
+	CKPT_OBJ_SOCK,
 	CKPT_OBJ_MAX
 };
 
@@ -225,6 +238,7 @@ enum file_type {
 	CKPT_FILE_IGNORE = 0,
 	CKPT_FILE_GENERIC,
 	CKPT_FILE_PIPE,
+	CKPT_FILE_SOCKET,
 	CKPT_FILE_MAX
 };
 
@@ -248,6 +262,11 @@ struct ckpt_hdr_file_pipe {
 	__s32 pipe_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_file_socket {
+	struct ckpt_hdr_file common;	/* generic file header comes first */
+	__u16 family;	/* NOTE(review): sock_file_checkpoint() never sets this */
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_pipe_state {
 	struct ckpt_hdr h;
 	__s32 pipe_len;
@@ -394,4 +413,56 @@ struct ckpt_hdr_ipc_sem {
 #define CKPT_TST_OVERFLOW_64(a, b) \
 	((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
 
+struct ckpt_hdr_socket {
+	struct ckpt_hdr h;
+
+	/* sock_common */
+	__u16 family;		/* sk_family */
+	__u8 state;		/* filled from socket->state at checkpoint */
+	__u8 reuse;
+	__u32 bound_dev_if;
+
+	/* sock */
+	__u8 protocol;
+	__u16 type;		/* sk_type */
+	__u8 sock_state;	/* filled from sock->sk_state at checkpoint */
+	__u8 shutdown;
+	__u8 userlocks;
+	__u8 no_check;
+	__u32 err;
+	__u32 err_soft;
+	__u32 priority;
+	__u64 rcvlowat;
+	__u64 rcvtimeo;
+	__u64 sndtimeo;
+	__u16 backlog;		/* sk_max_ack_backlog; passed to listen() on restart */
+	__s32 rcvbuf;
+	__s32 sndbuf;
+	__u64 flags;
+	__u64 lingertime;
+
+	/* socket */
+	__u64 socket_flags;
+	__u8 socket_state;
+
+	/* common to all supported families */
+	struct sockaddr laddr;	/* NOTE(review): may be too small for AF_UNIX paths */
+	struct sockaddr raddr;	/* NOTE(review): same size concern - confirm */
+	__u32 laddr_len;	/* preset to sizeof(laddr) before getname() */
+	__u32 raddr_len;	/* preset to sizeof(raddr) before getname() */
+
+	union {
+		struct {
+			__u32 this;	/* objref of this sock; unsigned, never < 0 */
+			__u32 peer;	/* objref of the peer sock, 0 when no peer */
+		} un;
+	};
+
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket_buffer {
+	struct ckpt_hdr h;
+	__u32 skb_count;	/* CKPT_HDR_SOCKET_BUFFER records that follow */
+} __attribute__ ((aligned(8)));
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..ced8cd9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1482,4 +1482,12 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+extern struct socket *__sock_file_restore(struct ckpt_ctx *,
+					  struct ckpt_hdr_socket *);
+extern void *sock_file_restore(struct ckpt_ctx *);
+
 #endif	/* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index 9e00a55..1c68a4e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
+
+obj-$(CONFIG_CHECKPOINT)	+= socket_cr.o
diff --git a/net/socket.c b/net/socket.c
index 791d71a..d1a187d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,9 @@
 #include <net/sock.h>
 #include <linux/netfilter.h>
 
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
@@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
 	.sendpage =	sock_sendpage,
 	.splice_write = generic_splice_sendpage,
 	.splice_read =	sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint =   sock_file_checkpoint,
+#endif
 };
 
 /*
@@ -415,6 +421,58 @@ int sock_map_fd(struct socket *sock, int flags)
 	return fd;
 }
 
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+	struct file *file;	/* new struct file to back @socket */
+	int err;
+
+	file = get_empty_filp();
+	if (!file)
+		return ERR_PTR(-ENOMEM); /* negative, so IS_ERR() sees it */
+
+	err = sock_attach_fd(socket, file, 0);
+	if (err < 0) {
+		put_filp(file);		/* undo get_empty_filp() */
+		file = ERR_PTR(err);
+	}
+
+	return file;	/* the file, or an ERR_PTR() on failure */
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket *h = NULL;
+	struct socket *socket = NULL;
+	struct file *file = NULL;
+	int err;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (IS_ERR(h))
+		return h;	/* the ERR_PTR is handed back unchanged */
+
+	socket = __sock_file_restore(ctx, h);	/* rebuild socket from header */
+	if (IS_ERR(socket)) {
+		err = PTR_ERR(socket);
+		goto err_put;
+	}
+
+	file = sock_alloc_attach_fd(socket);	/* wrap it in a struct file */
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_release;
+	}
+
+	ckpt_hdr_put(ctx, h);
+
+	return file;	/* success: returned as void * */
+ err_release:
+	sock_release(socket);	/* only reached once the socket exists */
+ err_put:
+	ckpt_hdr_put(ctx, h);
+
+	return ERR_PTR(err);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
 	if (file->f_op == &socket_file_ops)
diff --git a/net/socket_cr.c b/net/socket_cr.c
new file mode 100644
index 0000000..76759fe
--- /dev/null
+++ b/net/socket_cr.c
@@ -0,0 +1,378 @@
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
+{
+	int count = 0;	/* skbs copied so far; -ENOMEM on failure */
+	struct sk_buff *skb;
+
+	spin_lock(&from->lock);	/* keep the source queue stable while walking */
+
+	skb_queue_walk(from, skb) {
+		struct sk_buff *tmp;
+
+		tmp = skb_copy(skb, GFP_ATOMIC); /* cannot sleep under spinlock */
+		if (!tmp) {
+			count = -ENOMEM; /* copies already on @to stay there */
+			goto out;
+		}
+		skb_queue_tail(to, tmp);
+		count++;
+	}
+ out:
+	spin_unlock(&from->lock);
+
+	return count;
+}
+
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+				struct sk_buff_head *queue)
+{
+	struct sk_buff *skb;
+	int ret = 0;
+
+	skb_queue_walk(queue, skb) {	/* walked without taking queue->lock */
+		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
+					  CKPT_HDR_SOCKET_BUFFER);
+		if (ret)
+			return ret;	/* stop at the first failed write */
+	}
+
+	return 0;
+}
+
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff_head tmpq;	/* private copy of @queue */
+	int ret;	/* signed: h->skb_count is __u32 and would hide errors */
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (!h)
+		return -ENOMEM;	/* tmpq is not initialized yet: no cleanup */
+
+	skb_queue_head_init(&tmpq);
+
+	/* Copy first so the records are written from the private queue */
+	ret = sock_copy_buffers(queue, &tmpq);
+	if (ret < 0)
+		goto out;
+	h->skb_count = ret;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (!ret)
+		ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	__skb_queue_purge(&tmpq);	/* also drops a partial copy on error */
+
+	return ret;
+}
+
+static int sock_un_checkpoint(struct ckpt_ctx *ctx,
+			      struct sock *sock,
+			      struct ckpt_hdr_socket *h)
+{
+	struct unix_sock *sk = unix_sk(sock);
+	struct unix_sock *pr = unix_sk(sk->peer);
+	int new;
+	int ret;	/* signed scratch: h->un.* are __u32, never < 0 */
+
+	/* Register this sock; a negative return is an error code */
+	ret = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+	if (ret < 0)
+		goto out;
+	h->un.this = ret;
+
+	/* un.peer stays 0 when the socket has no peer */
+	ret = 0;
+	if (sk->peer)
+		ret = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+	if (ret < 0)
+		goto out;
+	h->un.peer = ret;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ out:
+	return ret;
+}
+
+static int sock_cptrst(struct ckpt_ctx *ctx,
+		       struct sock *sock,
+		       struct ckpt_hdr_socket *h,
+		       int op)	/* handed to CKPT_COPY; CKPT_CPT at checkpoint */
+{
+	if (sock->sk_socket) {	/* struct socket fields only if one is attached */
+		CKPT_COPY(op, h->socket_flags, sock->sk_socket->flags);
+		CKPT_COPY(op, h->socket_state, sock->sk_socket->state);
+	}
+
+	CKPT_COPY(op, h->reuse, sock->sk_reuse);
+	CKPT_COPY(op, h->shutdown, sock->sk_shutdown);
+	CKPT_COPY(op, h->userlocks, sock->sk_userlocks);
+	CKPT_COPY(op, h->no_check, sock->sk_no_check);
+	CKPT_COPY(op, h->protocol, sock->sk_protocol);
+	CKPT_COPY(op, h->err, sock->sk_err);
+	CKPT_COPY(op, h->err_soft, sock->sk_err_soft);
+	CKPT_COPY(op, h->priority, sock->sk_priority);
+	CKPT_COPY(op, h->rcvlowat, sock->sk_rcvlowat);
+	CKPT_COPY(op, h->backlog, sock->sk_max_ack_backlog);
+	CKPT_COPY(op, h->rcvtimeo, sock->sk_rcvtimeo);
+	CKPT_COPY(op, h->sndtimeo, sock->sk_sndtimeo);
+	CKPT_COPY(op, h->rcvbuf, sock->sk_rcvbuf);
+	CKPT_COPY(op, h->sndbuf, sock->sk_sndbuf);
+	CKPT_COPY(op, h->bound_dev_if, sock->sk_bound_dev_if);
+	CKPT_COPY(op, h->flags, sock->sk_flags);
+	CKPT_COPY(op, h->lingertime, sock->sk_lingertime);
+
+	return 0;	/* no failure path; ctx is not used here */
+}
+
+int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct socket *socket = file->private_data;
+	struct sock *sock = socket->sk;
+	struct ckpt_hdr_socket *h;
+	int ret = 0;	/* 0 on success, negative errno on failure */
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (!h)
+		return -ENOMEM;
+
+	h->family = sock->sk_family;
+	h->state = socket->state;
+	h->sock_state = sock->sk_state;
+	h->reuse = sock->sk_reuse;
+	h->type = sock->sk_type;
+	h->protocol = sock->sk_protocol;
+
+	h->laddr_len = sizeof(h->laddr);
+	h->raddr_len = sizeof(h->raddr);
+
+	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((h->sock_state != TCP_LISTEN) &&	/* peer name is skipped for */
+	    (h->type != SOCK_DGRAM) &&		/* listening and dgram sockets */
+	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sock_cptrst(ctx, sock, h, CKPT_CPT);	/* copy generic sock fields */
+
+	if (h->family == AF_UNIX) {
+		ret = sock_un_checkpoint(ctx, sock, h);	/* writes the header */
+		if (ret)
+			goto out;
+	} else {
+		ckpt_debug("unsupported socket type %i\n", h->family);
+		ret = -EINVAL;	/* must be negative like every other error */
+		goto out;
+	}
+
+	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+	if (ret)
+		goto out;
+
+	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+	if (ret)
+		goto out;
+
+	/* FIXME: write out-of-order queue for TCP */
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int sock_read_buffer(struct ckpt_ctx *ctx,
+			    struct sock *sock,
+			    struct sk_buff **skb)	/* out: the new skb */
+{
+	struct ckpt_hdr *h;
+	int ret = 0;	/* 0 on success, negative errno on failure */
+	int len;
+
+	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = h->len - sizeof(*h);	/* payload follows the generic header */
+
+	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
+	if (*skb == NULL) {
+		ret = ret < 0 ? ret : -ENOMEM; /* keep the allocator's errno */
+		goto out;
+	}
+
+	memcpy(skb_put(*skb, len), (char *)(h + 1), len);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int sock_read_buffers(struct ckpt_ctx *ctx,
+			     struct sock *sock,
+			     struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	__u32 i;	/* same type as h->skb_count */
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (IS_ERR(h))
+		return PTR_ERR(h); /* h is an ERR_PTR: nothing to put */
+
+	/* One CKPT_HDR_SOCKET_BUFFER record is read per counted skb */
+	for (i = 0; i < h->skb_count; i++) {
+		struct sk_buff *skb = NULL;
+
+		ret = sock_read_buffer(ctx, sock, &skb);
+		if (ret)
+			break;
+
+		skb_queue_tail(queue, skb);
+	}
+
+	/* Reached on success and after a failed read alike */
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int sock_un_restart(struct ckpt_ctx *ctx,
+			   struct ckpt_hdr_socket *h,
+			   struct socket *socket)
+{
+	struct sock *peer;
+	int ret = 0;
+
+	if (h->sock_state == TCP_ESTABLISHED) {
+		peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK);
+		if (peer && !IS_ERR(peer)) {
+			/* We're last, so join with peer */
+			struct sock *this = socket->sk;
+
+			sock_hold(this);	/* presumably backs peer->peer */
+			sock_hold(peer);	/* presumably backs this->peer */
+
+			unix_sk(this)->peer = peer;
+			unix_sk(peer)->peer = this;
+
+			this->sk_peercred.pid = task_tgid_vnr(current);
+			current_euid_egid(&this->sk_peercred.uid,
+					  &this->sk_peercred.gid);
+
+			/* NOTE(review): both ends get current's pid - confirm */
+			peer->sk_peercred.pid = task_tgid_vnr(current);
+			current_euid_egid(&peer->sk_peercred.uid,
+					  &peer->sk_peercred.gid);
+		} else {
+			/* We're first, so add our socket and wait for peer */
+			ckpt_obj_insert(ctx, socket->sk, h->un.this,
+					CKPT_OBJ_SOCK);	/* NOTE(review): result unchecked */
+		}
+
+	} else if (h->sock_state == TCP_LISTEN) {
+		ret = socket->ops->bind(socket,
+					(struct sockaddr *)&h->laddr,
+					h->laddr_len);
+		if (ret < 0)
+			goto out;
+
+		ret = socket->ops->listen(socket, h->backlog);
+		if (ret < 0)
+			goto out;
+	} else	/* NOTE(review): ret stays 0; prints h->state, not h->sock_state */
+		ckpt_debug("unsupported UNIX socket state %i\n", h->state);
+
+	socket->state = h->state;	/* applied on every non-error path */
+	socket->sk->sk_state = h->sock_state;
+ out:
+	return ret;
+}
+
+struct socket *__sock_file_restore(struct ckpt_ctx *ctx,
+				   struct ckpt_hdr_socket *h)
+{
+	struct socket *socket;
+	int ret;
+
+	ret = sock_create(h->family, h->type, 0, &socket); /* protocol 0 */
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (h->family == AF_UNIX) {
+		ret = sock_un_restart(ctx, h, socket);
+		ckpt_debug("sock_un_restart: %i\n", ret);
+	} else {
+		ckpt_debug("unsupported family %i\n", h->family);
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto out;
+
+	/* Queues are read in the order checkpoint wrote them: receive, write */
+	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
+	if (ret)
+		goto out;
+
+	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
+	if (ret)
+		goto out;
+ out:
+	if (ret) {
+		sock_release(socket);	/* any failure drops the new socket */
+		socket = ERR_PTR(ret);
+	}
+
+	return socket;	/* the socket, or an ERR_PTR() */
+}
+
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr_file_socket *h;
+	int ret;
+	struct file *file = ptr;	/* ptr is used as a struct file */
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+	/* NOTE(review): h->family is not assigned anywhere in this function */
+	h->common.f_type = CKPT_FILE_SOCKET;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = __sock_file_checkpoint(ctx, file); /* socket header + queues */
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
       [not found] ` <1244042305-7770-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-04 15:19   ` Serge E. Hallyn
       [not found]     ` <20090604151923.GA29519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-04 20:14   ` Louis Rilling
  1 sibling, 1 reply; 7+ messages in thread
From: Serge E. Hallyn @ 2009-06-04 15:19 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
> has been tested with a single and multiple processes, and with data inflight
> at the time of checkpoint.  It supports both socketpair()s and path-based
> sockets.
> 
> I have an almost-working AF_INET follow-on to this which I can submit after
> this is reviewed and tweaked into acceptance.
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Looks very nice, but a few comments.  I do think that the following
should be moved into network headers:

> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
...
> @@ -248,6 +262,11 @@ struct ckpt_hdr_file_pipe {
>  	__s32 pipe_objref;
>  } __attribute__((aligned(8)));
> 
> +struct ckpt_hdr_file_socket {
> +	struct ckpt_hdr_file common;
> +	__u16 family;
> +} __attribute__((aligned(8)));
> +
>  struct ckpt_hdr_file_pipe_state {
>  	struct ckpt_hdr h;
>  	__s32 pipe_len;
> @@ -394,4 +413,56 @@ struct ckpt_hdr_ipc_sem {
>  #define CKPT_TST_OVERFLOW_64(a, b) \
>  	((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
> 
> +struct ckpt_hdr_socket {
> +	struct ckpt_hdr h;
> +
> +	/* sock_common */
> +	__u16 family;
> +	__u8 state;
> +	__u8 reuse;
> +	__u32 bound_dev_if;
> +
> +	/* sock */
> +	__u8 protocol;
> +	__u16 type;
> +	__u8 sock_state;
> +	__u8 shutdown;
> +	__u8 userlocks;
> +	__u8 no_check;
> +	__u32 err;
> +	__u32 err_soft;
> +	__u32 priority;
> +	__u64 rcvlowat;
> +	__u64 rcvtimeo;
> +	__u64 sndtimeo;
> +	__u16 backlog;
> +	__s32 rcvbuf;
> +	__s32 sndbuf;
> +	__u64 flags;
> +	__u64 lingertime;
> +
> +	/* socket */
> +	__u64 socket_flags;
> +	__u8 socket_state;
> +
> +	/* common to all supported families */
> +	struct sockaddr laddr;
> +	struct sockaddr raddr;
> +	__u32 laddr_len;
> +	__u32 raddr_len;
> +
> +	union {
> +		struct {
> +			__u32 this;
> +			__u32 peer;
> +		} un;
> +	};
> +
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket_buffer {
> +	struct ckpt_hdr h;
> +	__u32 skb_count;
> +} __attribute__ ((aligned(8)));
> +
>  #endif /* _CHECKPOINT_CKPT_HDR_H_ */

...

> +void *sock_file_restore(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket *h = NULL;
> +	struct socket *socket = NULL;
> +	struct file *file = NULL;
> +	int err;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (IS_ERR(h))
> +		return h;
> +
> +	socket = __sock_file_restore(ctx, h);
> +	if (IS_ERR(socket)) {
> +		err = PTR_ERR(socket);
> +		goto err_put;
> +	}
> +
> +	file = sock_alloc_attach_fd(socket);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto err_release;
> +	}
> +
> +	ckpt_hdr_put(ctx, h);
> +
> +	return file;

EXTREME nit: a blank line between the return and the error label.

> + err_release:
> +	sock_release(socket);
> + err_put:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ERR_PTR(err);
> +}

...

> +static int sock_un_checkpoint(struct ckpt_ctx *ctx,
> +			      struct sock *sock,
> +			      struct ckpt_hdr_socket *h)
> +{
> +	struct unix_sock *sk = unix_sk(sock);
> +	struct unix_sock *pr = unix_sk(sk->peer);
> +	int new;
> +	int ret;
> +
> +	h->un.this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
> +	if (h->un.this < 0)
> +		goto out;
> +
> +	if (sk->peer)
> +		h->un.peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
> +	else
> +		h->un.peer = 0;
> +
> +	if (h->un.peer < 0) {
> +		ret = h->un.peer;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> + out:
> +	return ret;
> +}

in the CHECKPOINT_SUBTREE case do we want to try to ensure that sk->peer
is owned by another checkpointed task?

...

> +int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct socket *socket = file->private_data;
> +	struct sock *sock = socket->sk;
> +	struct ckpt_hdr_socket *h;
> +	int ret = 0;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->family = sock->sk_family;
> +	h->state = socket->state;
> +	h->sock_state = sock->sk_state;
> +	h->reuse = sock->sk_reuse;
> +	h->type = sock->sk_type;
> +	h->protocol = sock->sk_protocol;
> +
> +	h->laddr_len = sizeof(h->laddr);
> +	h->raddr_len = sizeof(h->raddr);
> +
> +	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((h->sock_state != TCP_LISTEN) &&
> +	    (h->type != SOCK_DGRAM) &&
> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	sock_cptrst(ctx, sock, h, CKPT_CPT);
> +
> +	if (h->family == AF_UNIX) {
> +		ret = sock_un_checkpoint(ctx, sock, h);
> +		if (ret)
> +			goto out;
> +	} else {
> +		ckpt_debug("unsupported socket type %i\n", h->family);
> +		ret = EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
> +	if (ret)
> +		goto out;
> +
> +	/* FIXME: write out-of-order queue for TCP */
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sock *sock,
> +			    struct sk_buff **skb)
> +{
> +	struct ckpt_hdr *h;
> +	int ret = 0;
> +	int len;
> +
> +	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = h->len - sizeof(*h);
> +
> +	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
> +	if (*skb == NULL) {
> +		ret = ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static int sock_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sock *sock,
> +			     struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (IS_ERR(h)) {
> +		ret = PTR_ERR(h);
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		struct sk_buff *skb = NULL;
> +
> +		ret = sock_read_buffer(ctx, sock, &skb);
> +		if (ret)
> +			break;
> +
> +		skb_queue_tail(queue, skb);
> +	}
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_un_restart(struct ckpt_ctx *ctx,
> +			   struct ckpt_hdr_socket *h,
> +			   struct socket *socket)
> +{
> +	struct sock *peer;
> +	int ret = 0;
> +
> +	if (h->sock_state == TCP_ESTABLISHED) {
> +		peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK);
> +		if (peer && !IS_ERR(peer)) {
> +			/* We're last, so join with peer */
> +			struct sock *this = socket->sk;
> +
> +			sock_hold(this);
> +			sock_hold(peer);
> +
> +			unix_sk(this)->peer = peer;
> +			unix_sk(peer)->peer = this;
> +
> +			this->sk_peercred.pid = task_tgid_vnr(current);
> +			current_euid_egid(&this->sk_peercred.uid,
> +					  &this->sk_peercred.gid);

No, really, you can't just trust the uid and gid in the ckpt file :)

> +
> +			peer->sk_peercred.pid = task_tgid_vnr(current);

Will the peer's sk_peercred.pid always be current's pid?

> +			current_euid_egid(&peer->sk_peercred.uid,
> +					  &peer->sk_peercred.gid);
> +		} else {
> +			/* We're first, so add our socket and wait for peer */
> +			ckpt_obj_insert(ctx, socket->sk, h->un.this,
> +					CKPT_OBJ_SOCK);
> +		}
> +
> +	} else if (h->sock_state == TCP_LISTEN) {
> +		ret = socket->ops->bind(socket,
> +					(struct sockaddr *)&h->laddr,
> +					h->laddr_len);
> +		if (ret < 0)
> +			goto out;
> +
> +		ret = socket->ops->listen(socket, h->backlog);
> +		if (ret < 0)
> +			goto out;
> +	} else
> +		ckpt_debug("unsupported UNIX socket state %i\n", h->state);
> +
> +	socket->state = h->state;
> +	socket->sk->sk_state = h->sock_state;
> + out:
> +	return ret;
> +}
> +
> +struct socket *__sock_file_restore(struct ckpt_ctx *ctx,
> +				   struct ckpt_hdr_socket *h)
> +{
> +	struct socket *socket;
> +	int ret;
> +
> +	ret = sock_create(h->family, h->type, 0, &socket);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	if (h->family == AF_UNIX) {
> +		ret = sock_un_restart(ctx, h, socket);
> +		ckpt_debug("sock_un_restart: %i\n", ret);
> +	} else {
> +		ckpt_debug("unsupported family %i\n", h->family);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
> +	if (ret)
> +		goto out;
> + out:
> +	if (ret) {
> +		sock_release(socket);
> +		socket = ERR_PTR(ret);
> +	}
> +
> +	return socket;
> +}
> +
> +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
> +{
> +	struct ckpt_hdr_file_socket *h;
> +	int ret;
> +	struct file *file = ptr;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->common.f_type = CKPT_FILE_SOCKET;
> +
> +	ret = checkpoint_file_common(ctx, file, &h->common);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = __sock_file_checkpoint(ctx, file);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}

thanks,
-serge

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
       [not found]     ` <20090604151923.GA29519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-04 15:36       ` Serge E. Hallyn
  2009-06-04 20:20       ` Dan Smith
  2009-06-08  6:15       ` Oren Laadan
  2 siblings, 0 replies; 7+ messages in thread
From: Serge E. Hallyn @ 2009-06-04 15:36 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Serge E. Hallyn (serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> > +			current_euid_egid(&this->sk_peercred.uid,
> > +					  &this->sk_peercred.gid);
> 
> No, really, you can't just trust the uid and gid in the ckpt file :)

All right I have no idea what I was thinking.  Ignore this :)

-serge

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
       [not found] ` <1244042305-7770-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-04 15:19   ` Serge E. Hallyn
@ 2009-06-04 20:14   ` Louis Rilling
  2009-06-04 21:16     ` Dan Smith
  1 sibling, 1 reply; 7+ messages in thread
From: Louis Rilling @ 2009-06-04 20:14 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg


[-- Attachment #1.1: Type: text/plain, Size: 11116 bytes --]

Hi,

On Wed, Jun 03, 2009 at 08:18:25AM -0700, Dan Smith wrote:
> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
> has been tested with a single and multiple processes, and with data inflight
> at the time of checkpoint.  It supports both socketpair()s and path-based
> sockets.
> 
> I have an almost-working AF_INET follow-on to this which I can submit after
> this is reviewed and tweaked into acceptance.
> 

[...]

> diff --git a/net/socket_cr.c b/net/socket_cr.c
> new file mode 100644
> index 0000000..76759fe
> --- /dev/null
> +++ b/net/socket_cr.c
> @@ -0,0 +1,378 @@
> +/*
> + *  Copyright 2009 IBM Corporation
> + *
> + *  Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + */
> +
> +#include <linux/socket.h>
> +#include <linux/mount.h>
> +#include <linux/file.h>
> +
> +#include <net/af_unix.h>
> +#include <net/tcp_states.h>
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
> +static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
> +{
> +	int count = 0;
> +	struct sk_buff *skb;
> +
> +	spin_lock(&from->lock);
> +
> +	skb_queue_walk(from, skb) {
> +		struct sk_buff *tmp;
> +
> +		tmp = skb_copy(skb, GFP_KERNEL);

GFP_KERNEL is not allowed here, since from->lock is locked. Not sure that
GFP_ATOMIC is acceptable though. Perhaps it would be better to temporarily move
the queue to a local head, copy it (no spinlock needed), and then push it again.
This would need to block concurrent senders/receivers during this operation,
unless it's guaranteed that they are all frozen.

Thanks,

Louis

> +		if (!tmp) {
> +			count = -ENOMEM;
> +			goto out;
> +		}
> +		skb_queue_tail(to, tmp);
> +		count++;
> +	}
> + out:
> +	spin_unlock(&from->lock);
> +
> +	return count;
> +}
> +
> +static int __sock_write_buffers(struct ckpt_ctx *ctx,
> +				struct sk_buff_head *queue)
> +{
> +	struct sk_buff *skb;
> +	int ret = 0;
> +
> +	skb_queue_walk(queue, skb) {
> +		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> +					  CKPT_HDR_SOCKET_BUFFER);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff_head tmpq;
> +	int ret = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (!h)
> +		goto out;
> +
> +	skb_queue_head_init(&tmpq);
> +
> +	h->skb_count = sock_copy_buffers(queue, &tmpq);
> +	if (h->skb_count < 0) {
> +		ret = h->skb_count;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (!ret)
> +		ret = __sock_write_buffers(ctx, &tmpq);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	__skb_queue_purge(&tmpq);
> +
> +	return ret;
> +}
> +
> +static int sock_un_checkpoint(struct ckpt_ctx *ctx,
> +			      struct sock *sock,
> +			      struct ckpt_hdr_socket *h)
> +{
> +	struct unix_sock *sk = unix_sk(sock);
> +	struct unix_sock *pr = unix_sk(sk->peer);
> +	int new;
> +	int ret;
> +
> +	h->un.this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
> +	if (h->un.this < 0)
> +		goto out;
> +
> +	if (sk->peer)
> +		h->un.peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
> +	else
> +		h->un.peer = 0;
> +
> +	if (h->un.peer < 0) {
> +		ret = h->un.peer;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> + out:
> +	return ret;
> +}
> +
> +static int sock_cptrst(struct ckpt_ctx *ctx,
> +		       struct sock *sock,
> +		       struct ckpt_hdr_socket *h,
> +		       int op)
> +{
> +	if (sock->sk_socket) {
> +		CKPT_COPY(op, h->socket_flags, sock->sk_socket->flags);
> +		CKPT_COPY(op, h->socket_state, sock->sk_socket->state);
> +	}
> +
> +	CKPT_COPY(op, h->reuse, sock->sk_reuse);
> +	CKPT_COPY(op, h->shutdown, sock->sk_shutdown);
> +	CKPT_COPY(op, h->userlocks, sock->sk_userlocks);
> +	CKPT_COPY(op, h->no_check, sock->sk_no_check);
> +	CKPT_COPY(op, h->protocol, sock->sk_protocol);
> +	CKPT_COPY(op, h->err, sock->sk_err);
> +	CKPT_COPY(op, h->err_soft, sock->sk_err_soft);
> +	CKPT_COPY(op, h->priority, sock->sk_priority);
> +	CKPT_COPY(op, h->rcvlowat, sock->sk_rcvlowat);
> +	CKPT_COPY(op, h->backlog, sock->sk_max_ack_backlog);
> +	CKPT_COPY(op, h->rcvtimeo, sock->sk_rcvtimeo);
> +	CKPT_COPY(op, h->sndtimeo, sock->sk_sndtimeo);
> +	CKPT_COPY(op, h->rcvbuf, sock->sk_rcvbuf);
> +	CKPT_COPY(op, h->sndbuf, sock->sk_sndbuf);
> +	CKPT_COPY(op, h->bound_dev_if, sock->sk_bound_dev_if);
> +	CKPT_COPY(op, h->flags, sock->sk_flags);
> +	CKPT_COPY(op, h->lingertime, sock->sk_lingertime);
> +
> +	return 0;
> +}
> +
> +int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct socket *socket = file->private_data;
> +	struct sock *sock = socket->sk;
> +	struct ckpt_hdr_socket *h;
> +	int ret = 0;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->family = sock->sk_family;
> +	h->state = socket->state;
> +	h->sock_state = sock->sk_state;
> +	h->reuse = sock->sk_reuse;
> +	h->type = sock->sk_type;
> +	h->protocol = sock->sk_protocol;
> +
> +	h->laddr_len = sizeof(h->laddr);
> +	h->raddr_len = sizeof(h->raddr);
> +
> +	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((h->sock_state != TCP_LISTEN) &&
> +	    (h->type != SOCK_DGRAM) &&
> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	sock_cptrst(ctx, sock, h, CKPT_CPT);
> +
> +	if (h->family == AF_UNIX) {
> +		ret = sock_un_checkpoint(ctx, sock, h);
> +		if (ret)
> +			goto out;
> +	} else {
> +		ckpt_debug("unsupported socket type %i\n", h->family);
> +		ret = EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
> +	if (ret)
> +		goto out;
> +
> +	/* FIXME: write out-of-order queue for TCP */
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sock *sock,
> +			    struct sk_buff **skb)
> +{
> +	struct ckpt_hdr *h;
> +	int ret = 0;
> +	int len;
> +
> +	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = h->len - sizeof(*h);
> +
> +	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
> +	if (*skb == NULL) {
> +		ret = ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static int sock_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sock *sock,
> +			     struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (IS_ERR(h)) {
> +		ret = PTR_ERR(h);
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		struct sk_buff *skb = NULL;
> +
> +		ret = sock_read_buffer(ctx, sock, &skb);
> +		if (ret)
> +			break;
> +
> +		skb_queue_tail(queue, skb);
> +	}
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_un_restart(struct ckpt_ctx *ctx,
> +			   struct ckpt_hdr_socket *h,
> +			   struct socket *socket)
> +{
> +	struct sock *peer;
> +	int ret = 0;
> +
> +	if (h->sock_state == TCP_ESTABLISHED) {
> +		peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK);
> +		if (peer && !IS_ERR(peer)) {
> +			/* We're last, so join with peer */
> +			struct sock *this = socket->sk;
> +
> +			sock_hold(this);
> +			sock_hold(peer);
> +
> +			unix_sk(this)->peer = peer;
> +			unix_sk(peer)->peer = this;
> +
> +			this->sk_peercred.pid = task_tgid_vnr(current);
> +			current_euid_egid(&this->sk_peercred.uid,
> +					  &this->sk_peercred.gid);
> +
> +			peer->sk_peercred.pid = task_tgid_vnr(current);
> +			current_euid_egid(&peer->sk_peercred.uid,
> +					  &peer->sk_peercred.gid);
> +		} else {
> +			/* We're first, so add our socket and wait for peer */
> +			ckpt_obj_insert(ctx, socket->sk, h->un.this,
> +					CKPT_OBJ_SOCK);
> +		}
> +
> +	} else if (h->sock_state == TCP_LISTEN) {
> +		ret = socket->ops->bind(socket,
> +					(struct sockaddr *)&h->laddr,
> +					h->laddr_len);
> +		if (ret < 0)
> +			goto out;
> +
> +		ret = socket->ops->listen(socket, h->backlog);
> +		if (ret < 0)
> +			goto out;
> +	} else
> +		ckpt_debug("unsupported UNIX socket state %i\n", h->state);
> +
> +	socket->state = h->state;
> +	socket->sk->sk_state = h->sock_state;
> + out:
> +	return ret;
> +}
> +
> +struct socket *__sock_file_restore(struct ckpt_ctx *ctx,
> +				   struct ckpt_hdr_socket *h)
> +{
> +	struct socket *socket;
> +	int ret;
> +
> +	ret = sock_create(h->family, h->type, 0, &socket);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	if (h->family == AF_UNIX) {
> +		ret = sock_un_restart(ctx, h, socket);
> +		ckpt_debug("sock_un_restart: %i\n", ret);
> +	} else {
> +		ckpt_debug("unsupported family %i\n", h->family);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
> +	if (ret)
> +		goto out;
> + out:
> +	if (ret) {
> +		sock_release(socket);
> +		socket = ERR_PTR(ret);
> +	}
> +
> +	return socket;
> +}
> +
> +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
> +{
> +	struct ckpt_hdr_file_socket *h;
> +	int ret;
> +	struct file *file = ptr;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->common.f_type = CKPT_FILE_SOCKET;
> +
> +	ret = checkpoint_file_common(ctx, file, &h->common);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = __sock_file_checkpoint(ctx, file);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +
> -- 
> 1.6.0.4
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 206 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
       [not found]     ` <20090604151923.GA29519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-04 15:36       ` Serge E. Hallyn
@ 2009-06-04 20:20       ` Dan Smith
  2009-06-08  6:15       ` Oren Laadan
  2 siblings, 0 replies; 7+ messages in thread
From: Dan Smith @ 2009-06-04 20:20 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> I do think that the following should be moved into network
SH> headers:

>> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
SH> ...
>> @@ -248,6 +262,11 @@ struct ckpt_hdr_file_pipe {
>> __s32 pipe_objref;
>> } __attribute__((aligned(8)));
>> 
>> +struct ckpt_hdr_file_socket {
>> +	struct ckpt_hdr_file common;
>> +	__u16 family;
>> +} __attribute__((aligned(8)));
>> +
>> struct ckpt_hdr_file_pipe_state {
>> struct ckpt_hdr h;
>> __s32 pipe_len;
>> @@ -394,4 +413,56 @@ struct ckpt_hdr_ipc_sem {
>> #define CKPT_TST_OVERFLOW_64(a, b) \
>> ((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
>> 
>> +struct ckpt_hdr_socket {
>> +	struct ckpt_hdr h;
>> +
>> +	/* sock_common */
>> +	__u16 family;
>> +	__u8 state;
>> +	__u8 reuse;
>> +	__u32 bound_dev_if;
>> +
>> +	/* sock */
>> +	__u8 protocol;
>> +	__u16 type;
>> +	__u8 sock_state;
>> +	__u8 shutdown;
>> +	__u8 userlocks;
>> +	__u8 no_check;
>> +	__u32 err;
>> +	__u32 err_soft;
>> +	__u32 priority;
>> +	__u64 rcvlowat;
>> +	__u64 rcvtimeo;
>> +	__u64 sndtimeo;
>> +	__u16 backlog;
>> +	__s32 rcvbuf;
>> +	__s32 sndbuf;
>> +	__u64 flags;
>> +	__u64 lingertime;
>> +
>> +	/* socket */
>> +	__u64 socket_flags;
>> +	__u8 socket_state;
>> +
>> +	/* common to all supported families */
>> +	struct sockaddr laddr;
>> +	struct sockaddr raddr;
>> +	__u32 laddr_len;
>> +	__u32 raddr_len;
>> +
>> +	union {
>> +		struct {
>> +			__u32 this;
>> +			__u32 peer;
>> +		} un;
>> +	};
>> +
>> +} __attribute__ ((aligned(8)));

I think that makes sense.  The (large amount of) changes to add INET
support would seal the deal, I think.  So this goes in something like
include/linux/socket.h?

SH> EXTREME nit: a blank line between the return and the error label.

Ah, oops.

SH> in the CHECKPOINT_SUBTREE case do we want to try to ensure that
SH> sk->peer is owned by another checkpointed task?

That probably wouldn't be too hard, as I can just check pids_arr.

>> +			peer->sk_peercred.pid = task_tgid_vnr(current);

SH> Will the peer's sk_peercred.pid always be current's pid?

That gets set to the pid of whichever side does the connection, in the
normal connect()..accept() case, so I think this is okay.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
  2009-06-04 20:14   ` Louis Rilling
@ 2009-06-04 21:16     ` Dan Smith
  0 siblings, 0 replies; 7+ messages in thread
From: Dan Smith @ 2009-06-04 21:16 UTC (permalink / raw)
  To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

LR> GFP_KERNEL is not allowed here, since from->lock is locked. Not
LR> sure that GFP_ATOMIC is acceptable though. Perhaps it would be
LR> better to temporarily move the queue to a local head, copy it (no
LR> spinlock needed), and then push it again.  This would need to
LR> block concurrent senders/receivers during this operation, unless
LR> it's guaranteed that they are all frozen.

Ah, yeah, good catch.  I'll have to think about that a little.

Thanks!

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support
       [not found]     ` <20090604151923.GA29519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-04 15:36       ` Serge E. Hallyn
  2009-06-04 20:20       ` Dan Smith
@ 2009-06-08  6:15       ` Oren Laadan
  2 siblings, 0 replies; 7+ messages in thread
From: Oren Laadan @ 2009-06-08  6:15 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith



Serge E. Hallyn wrote:
> Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
>> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
>> has been tested with a single and multiple processes, and with data inflight
>> at the time of checkpoint.  It supports both socketpair()s and path-based
>> sockets.
>>
>> I have an almost-working AF_INET follow-on to this which I can submit after
>> this is reviewed and tweaked into acceptance.
>>
>> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

[...]

> 
>> +static int sock_un_checkpoint(struct ckpt_ctx *ctx,
>> +			      struct sock *sock,
>> +			      struct ckpt_hdr_socket *h)
>> +{
>> +	struct unix_sock *sk = unix_sk(sock);
>> +	struct unix_sock *pr = unix_sk(sk->peer);
>> +	int new;
>> +	int ret;
>> +
>> +	h->un.this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
>> +	if (h->un.this < 0)
>> +		goto out;
>> +
>> +	if (sk->peer)
>> +		h->un.peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
>> +	else
>> +		h->un.peer = 0;
>> +
>> +	if (h->un.peer < 0) {
>> +		ret = h->un.peer;
>> +		goto out;
>> +	}
>> +
>> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
>> + out:
>> +	return ret;
>> +}
> 
> in the CHECKPOINT_SUBTREE case do we want to try to ensure that sk->peer
> is owned by another checkpointed task?

What exactly would you like to enforce - that it is "in-use"
by a checkpointed task, or that it isn't "in-use" outside?

It probably makes sense to verify that the socket is "in-use"
by at least one task in the checkpointed set (heh... I expect
kerlab guys to argue against forcing this...), and perhaps
issue a warning ?

(Which is not a bad idea - add a ckpt_write_warning() function
that will write a warning in the image, but won't abort the
entire checkpoint).

It isn't easy to verify the "in-use" property - what if task
A transfers a file using unix-domain sockets to task B (both
in the set), and A closed the file descriptor.... so we can
know it's in transit, but we don't know who will receive the
file eventually.

(Ahh.. of course .. issue a warning :)

It makes less sense to verify the socket is _not_ in use
_outside_ the checkpointed set - and it can be expensive to
do so; After all, there is a whole-container option if you
need that guarantee.

If we are to add such checks, or warnings, it's clearly not
a high priority now (and given akpm's comment ...).

Oren.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-06-08  6:15 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-03 15:18 [PATCH] c/r: Add AF_UNIX support Dan Smith
     [not found] ` <1244042305-7770-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-04 15:19   ` Serge E. Hallyn
     [not found]     ` <20090604151923.GA29519-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-04 15:36       ` Serge E. Hallyn
2009-06-04 20:20       ` Dan Smith
2009-06-08  6:15       ` Oren Laadan
2009-06-04 20:14   ` Louis Rilling
2009-06-04 21:16     ` Dan Smith

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.