All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] c/r: Add AF_UNIX support (v3)
@ 2009-06-16 15:55 Dan Smith
       [not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-06-16 15:55 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: Alexey Dobriyan

This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
has been tested with a single and multiple processes, and with data inflight
at the time of checkpoint.  It supports socketpair()s, path-based, and
abstract sockets.

Changes in v3:
  - Move sock_file_checkpoint() above sock_file_restore()
  - Change __sock_file_*() functions to do_sock_file_*()
  - Adjust some of the struct cr_hdr_socket alignment
  - Improve the sock_copy_buffers() algorithm to avoid locking the source
    queue for the entire operation
  - Fix alignment in the socket header struct(s)
  - Move the per-protocol structure (ckpt_hdr_socket_un) out of the
    common socket header and read/write it separately
  - Fix missing call to sock_cptrst() in restore path
  - Break out the socket joining into another function
  - Fix failure to restore the socket address thus fixing getname()
  - Check the state values on restart
  - Fix case of state being TCP_CLOSE, which allows dgram sockets to be
    properly connected (if appropriate) to their peer and maintain the
    sockaddr for getname() operation
  - Fix restoring a listening socket that has been unlink()'d
  - Fix checkpointing sockets with an in-flight FD-passing SKB.  Fail
    with EBUSY.
  - Fix checkpointing listening sockets with an unaccepted connection.
    Fail with EBUSY.

Changes in v2:
  - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
    to be rather common in other uses of skb_copy())
  - Move the ckpt_hdr_socket structure definition to linux/socket.h
  - Fix whitespace issue
  - Move sock_file_checkpoint() to net/socket.c for symmetry

Cc: Oren Laaden <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Cc: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c             |    7 +
 checkpoint/objhash.c           |   27 +++
 include/linux/checkpoint_hdr.h |   13 +
 include/linux/socket.h         |   59 +++++
 include/net/sock.h             |    9 +
 net/Makefile                   |    2 +
 net/checkpoint.c               |  493 ++++++++++++++++++++++++++++++++++++++++
 net/socket.c                   |   84 +++++++
 8 files changed, 694 insertions(+), 0 deletions(-)
 create mode 100644 net/checkpoint.c

diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..bb2cca0 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 
 /**************************************************************************
@@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_PIPE,
 		.restore = pipe_file_restore,
 	},
+	/* socket */
+	{
+		.file_name = "SOCKET",
+		.file_type = CKPT_FILE_SOCKET,
+		.restore = sock_file_restore,
+	},
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..7819e5e 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 struct ckpt_obj;
 struct ckpt_obj_ops;
@@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
 	return atomic_read(&((struct ipc_namespace *) ptr)->count);
 }
 
+static int obj_sock_grab(void *ptr)
+{
+	sock_hold((struct sock *) ptr);
+	return 0;
+}
+
+static void obj_sock_drop(void *ptr)
+{
+	sock_put((struct sock *) ptr);
+}
+
+static int obj_sock_users(void *ptr)
+{
+	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_bad,
 		.restore = restore_bad,
 	},
+	/* sock object */
+	{
+		.obj_name = "SOCKET",
+		.obj_type = CKPT_OBJ_SOCK,
+		.ref_drop = obj_sock_drop,
+		.ref_grab = obj_sock_grab,
+		.ref_users = obj_sock_users,
+		.checkpoint = sock_file_checkpoint,
+		.restore = sock_file_restore,
+	},
 };
 
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index cd427d8..60870df 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,12 @@ enum {
 	CKPT_HDR_IPC_MSG_MSG,
 	CKPT_HDR_IPC_SEM,
 
+ 	CKPT_HDR_FD_SOCKET = 601,
+ 	CKPT_HDR_SOCKET,
+ 	CKPT_HDR_SOCKET_BUFFERS,
+ 	CKPT_HDR_SOCKET_BUFFER,
+	CKPT_HDR_SOCKET_UN,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -103,6 +109,7 @@ enum obj_type {
 	CKPT_OBJ_NS,
 	CKPT_OBJ_UTS_NS,
 	CKPT_OBJ_IPC_NS,
+	CKPT_OBJ_SOCK,
 	CKPT_OBJ_MAX
 };
 
@@ -225,6 +232,7 @@ enum file_type {
 	CKPT_FILE_IGNORE = 0,
 	CKPT_FILE_GENERIC,
 	CKPT_FILE_PIPE,
+	CKPT_FILE_SOCKET,
 	CKPT_FILE_MAX
 };
 
@@ -248,6 +256,11 @@ struct ckpt_hdr_file_pipe {
 	__s32 pipe_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_file_socket {
+	struct ckpt_hdr_file common;
+	__u16 family;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_pipe_state {
 	struct ckpt_hdr h;
 	__s32 pipe_len;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 421afb4..3b5be70 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
 #include <linux/uio.h>			/* iovec support		*/
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
+#include <linux/checkpoint_hdr.h>	/* ckpt_hdr			*/
 
 #ifdef __KERNEL__
 # ifdef CONFIG_PROC_FS
@@ -323,5 +324,63 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
 #endif
+
+struct ckpt_hdr_socket_un {
+	struct ckpt_hdr h;
+	__u32 this;
+	__u32 peer;
+	__u8 linked;
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket {
+	struct ckpt_hdr h;
+
+	struct ckpt_socket { /* struct socket */
+		__u64 flags;
+		__u8 state;
+	} socket __attribute__ ((aligned(8)));
+
+	struct ckpt_sock_common { /* struct sock_common */
+		__u32 bound_dev_if;
+		__u16 family;
+		__u8 state;
+		__u8 reuse;
+	} sock_common __attribute__ ((aligned(8)));
+
+	struct ckpt_sock { /* struct sock */
+		__u64 rcvlowat;
+		__u64 rcvtimeo;
+		__u64 sndtimeo;
+		__u64 flags;
+		__u64 lingertime;
+
+		__u32 err;
+		__u32 err_soft;
+		__u32 priority;
+		__s32 rcvbuf;
+		__s32 sndbuf;
+		__u16 type;
+		__u16 backlog;
+
+		__u8 protocol;
+		__u8 state;
+		__u8 shutdown;
+		__u8 userlocks;
+		__u8 no_check;
+	} sock __attribute__ ((aligned(8)));
+
+	/* common to all supported families */
+	__u32 laddr_len;
+	__u32 raddr_len;
+	struct sockaddr laddr;
+	struct sockaddr raddr;
+
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket_buffer {
+	struct ckpt_hdr h;
+	__u32 skb_count;
+} __attribute__ ((aligned(8)));
+
 #endif /* not kernel and not glibc */
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..a8b6af1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+extern void *sock_file_restore(struct ckpt_ctx *);
+extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
+					   struct ckpt_hdr_socket *);
+extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+
 #endif	/* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index 9e00a55..c226ed1 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
+
+obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
diff --git a/net/checkpoint.c b/net/checkpoint.c
new file mode 100644
index 0000000..fd47485
--- /dev/null
+++ b/net/checkpoint.c
@@ -0,0 +1,493 @@
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/namei.h>
+
+/* Size of an empty struct sockaddr_un */
+#define UNIX_LEN_EMPTY 2
+
+static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
+{
+	int count = 0;
+	struct sk_buff *skb;
+
+	skb_queue_walk(from, skb) {
+		struct sk_buff *tmp;
+
+		tmp = dev_alloc_skb(skb->len);
+		if (!tmp)
+			return -ENOMEM;
+
+		spin_lock(&from->lock);
+		skb_morph(tmp, skb);
+		spin_unlock(&from->lock);
+
+		skb_queue_tail(to, tmp);
+		count++;
+	}
+
+	return count;
+}
+
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+				struct sk_buff_head *queue)
+{
+	struct sk_buff *skb;
+	int ret = 0;
+
+	skb_queue_walk(queue, skb) {
+		if (UNIXCB(skb).fp) {
+			ckpt_debug("unsupported fd-passing skb found\n");
+			return -EBUSY;
+		}
+
+		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
+					  CKPT_HDR_SOCKET_BUFFER);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff_head tmpq;
+	int ret = -ENOMEM;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (!h)
+		goto out;
+
+	skb_queue_head_init(&tmpq);
+
+	h->skb_count = sock_copy_buffers(queue, &tmpq);
+	if (h->skb_count < 0) {
+		ret = h->skb_count;
+		goto out;
+	}
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (!ret)
+		ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	__skb_queue_purge(&tmpq);
+
+	return ret;
+}
+
+static int sock_un_checkpoint(struct ckpt_ctx *ctx,
+			      struct sock *sock,
+			      struct ckpt_hdr_socket *h)
+{
+	struct unix_sock *sk = unix_sk(sock);
+	struct unix_sock *pr = unix_sk(sk->peer);
+	struct ckpt_hdr_socket_un *un;
+	int new;
+	int ret = -ENOMEM;
+
+	if ((sock->sk_state == TCP_LISTEN) &&
+	    !skb_queue_empty(&sock->sk_receive_queue)) {
+		ckpt_debug("listening socket has unaccepted peers");
+		return -EBUSY;
+	}
+
+	un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
+	if (!un)
+		goto out;
+
+	un->linked = sk->dentry && (sk->dentry->d_inode->i_nlink > 0);
+
+	un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+	if (un->this < 0)
+		goto out;
+
+	if (sk->peer)
+		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+	else
+		un->peer = 0;
+
+	if (un->peer < 0) {
+		ret = un->peer;
+		goto out;
+	}
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
+ out:
+	return ret;
+}
+
+static int sock_cptrst(struct ckpt_ctx *ctx,
+		       struct sock *sock,
+		       struct ckpt_hdr_socket *h,
+		       int op)
+{
+	if (sock->sk_socket) {
+		CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
+		CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
+	}
+
+	CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
+	CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
+	CKPT_COPY(op, h->sock_common.family, sock->sk_family);
+
+	CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);
+	CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks);
+	CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
+	CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
+	CKPT_COPY(op, h->sock.err, sock->sk_err);
+	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);
+	CKPT_COPY(op, h->sock.priority, sock->sk_priority);
+	CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);
+	CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);
+	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
+	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);
+	CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
+	CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);
+	CKPT_COPY(op, h->sock.flags, sock->sk_flags);
+	CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);
+	CKPT_COPY(op, h->sock.type, sock->sk_type);
+	CKPT_COPY(op, h->sock.state, sock->sk_state);
+
+	if ((h->socket.state == SS_CONNECTED) &&
+	    (h->sock.state != TCP_ESTABLISHED)) {
+		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
+			   h->socket.state, h->sock.state);
+		return -EINVAL;
+	} else if ((h->socket.state == SS_UNCONNECTED) &&
+		   (h->sock.state == TCP_ESTABLISHED)) {
+		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
+			   h->socket.state, h->sock.state);
+		return -EINVAL;
+	} else if ((h->sock.state < TCP_ESTABLISHED) ||
+		   (h->sock.state >= TCP_MAX_STATES)) {
+		ckpt_debug("sock in invalid state: %i\n", h->sock.state);
+		return -EINVAL;
+	} else if ((h->socket.state < SS_FREE) ||
+		   (h->socket.state > SS_DISCONNECTING)) {
+		ckpt_debug("socket in invalid state: %i\n", h->socket.state);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct socket *socket = file->private_data;
+	struct sock *sock = socket->sk;
+	struct ckpt_hdr_socket *h;
+	int ret = 0;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (!h)
+		return -ENOMEM;
+
+	h->laddr_len = sizeof(h->laddr);
+	h->raddr_len = sizeof(h->raddr);
+
+	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((sock->sk_state != TCP_LISTEN) &&
+	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
+		if (sock->sk_type != SOCK_DGRAM) {
+			ret = -EINVAL;
+			goto out;
+		}
+		h->raddr_len = 0;
+	}
+
+	sock_cptrst(ctx, sock, h, CKPT_CPT);
+
+	if (sock->sk_family == AF_UNIX) {
+		ret = sock_un_checkpoint(ctx, sock, h);
+		if (ret)
+			goto out;
+	} else {
+		ckpt_debug("unsupported socket type %i\n",
+			   sock->sk_family);
+		ret = EINVAL;
+		goto out;
+	}
+
+	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+	if (ret)
+		goto out;
+
+	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+	if (ret)
+		goto out;
+
+	/* FIXME: write out-of-order queue for TCP */
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int sock_read_buffer(struct ckpt_ctx *ctx,
+			    struct sock *sock,
+			    struct sk_buff **skb)
+{
+	struct ckpt_hdr *h;
+	int ret = 0;
+	int len;
+
+	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = h->len - sizeof(*h);
+
+	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
+	if (*skb == NULL) {
+		ret = ENOMEM;
+		goto out;
+	}
+
+	memcpy(skb_put(*skb, len), (char *)(h + 1), len);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int sock_read_buffers(struct ckpt_ctx *ctx,
+			     struct sock *sock,
+			     struct sk_buff_head *queue)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+	if (IS_ERR(h)) {
+		ret = PTR_ERR(h);
+		goto out;
+	}
+
+	for (i = 0; i < h->skb_count; i++) {
+		struct sk_buff *skb = NULL;
+
+		ret = sock_read_buffer(ctx, sock, &skb);
+		if (ret)
+			break;
+
+		skb_queue_tail(queue, skb);
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static struct unix_address *sock_un_makeaddr(struct sockaddr_un *sun_addr,
+					     unsigned len)
+{
+	struct unix_address *addr;
+
+	addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
+	if (!addr)
+		return ERR_PTR(ENOMEM);
+
+	memcpy(addr->name, sun_addr, len);
+	addr->len = len;
+	atomic_set(&addr->refcnt, 1);
+
+	return addr;
+}
+
+static int sock_un_join(struct sock *a,
+			struct sock *b,
+			struct ckpt_hdr_socket *h)
+{
+	struct unix_address *addr;
+
+	sock_hold(a);
+	sock_hold(b);
+
+	unix_sk(a)->peer = b;
+	unix_sk(b)->peer = a;
+
+	a->sk_peercred.pid = task_tgid_vnr(current);
+	current_euid_egid(&a->sk_peercred.uid,
+			  &a->sk_peercred.gid);
+
+	b->sk_peercred.pid = task_tgid_vnr(current);
+	current_euid_egid(&b->sk_peercred.uid,
+			  &b->sk_peercred.gid);
+
+	if (h->laddr_len == UNIX_LEN_EMPTY)
+		addr = sock_un_makeaddr((struct sockaddr_un *)&h->raddr,
+					h->raddr_len);
+	else
+		addr = sock_un_makeaddr((struct sockaddr_un *)&h->laddr,
+					h->laddr_len);
+	if (IS_ERR(addr))
+	    return PTR_ERR(addr);
+
+	atomic_inc(&addr->refcnt); /* Held by both ends */
+	unix_sk(a)->addr = unix_sk(b)->addr = addr;
+
+	return 0;
+}
+
+static int sock_un_unlink(const char *name)
+{
+	struct path spath;
+	struct path ppath;
+	int ret;
+
+	ret = kern_path(name, 0, &spath);
+	if (ret)
+		return ret;
+
+	ret = kern_path(name, LOOKUP_PARENT, &ppath);
+	if (ret)
+		goto out_s;
+
+	if (!spath.dentry) {
+		ckpt_debug("No dentry found for %s\n", name);
+		ret = -ENOENT;
+		goto out_p;
+	}
+
+	if (!ppath.dentry || !ppath.dentry->d_inode) {
+		ckpt_debug("No inode for parent of %s\n", name);
+		ret = -ENOENT;
+		goto out_p;
+	}
+
+	ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
+ out_p:
+	path_put(&ppath);
+ out_s:
+	path_put(&spath);
+
+	return ret;
+}
+
+static int sock_un_restart(struct ckpt_ctx *ctx,
+			   struct ckpt_hdr_socket *h,
+			   struct socket *socket)
+{
+	struct sock *peer;
+	struct ckpt_hdr_socket_un *un;
+	int ret = 0;
+
+	un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
+	if (IS_ERR(un))
+		return PTR_ERR(un);
+
+	if (un->peer < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
+
+	if ((h->sock.state == TCP_ESTABLISHED) ||
+	    (h->sock.state == TCP_CLOSE)) {
+		if (!IS_ERR(peer)) {
+			/* We're last, so join with peer */
+			struct sock *this = socket->sk;
+
+			ret = sock_un_join(this, peer, h);
+		} else {
+			/* We're first, so add our socket and wait for peer */
+			ckpt_obj_insert(ctx, socket->sk, un->this,
+					CKPT_OBJ_SOCK);
+		}
+
+	} else if (h->sock.state == TCP_LISTEN) {
+		ret = socket->ops->bind(socket,
+					(struct sockaddr *)&h->laddr,
+					h->laddr_len);
+		if (ret < 0)
+			goto out;
+
+		ret = socket->ops->listen(socket, h->sock.backlog);
+		if (ret < 0)
+			goto out;
+
+		/* We can unlink this blindly because we just created it
+		 * above and would have failed already without proper
+		 * permissions
+		 */
+		if (!un->linked) {
+			struct sockaddr_un *sun =
+				(struct sockaddr_un *)&h->laddr;
+			ret = sock_un_unlink(sun->sun_path);
+		}
+	} else
+		ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
+ out:
+	ckpt_hdr_put(ctx, un);
+	return ret;
+}
+
+struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
+				    struct ckpt_hdr_socket *h)
+{
+	struct socket *socket;
+	int ret;
+
+	ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (h->sock_common.family == AF_UNIX) {
+		ret = sock_un_restart(ctx, h, socket);
+		ckpt_debug("sock_un_restart: %i\n", ret);
+	} else {
+		ckpt_debug("unsupported family %i\n", h->sock_common.family);
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto out;
+
+	sock_cptrst(ctx, socket->sk, h, CKPT_RST);
+
+	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
+	if (ret)
+		goto out;
+
+	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
+	if (ret)
+		goto out;
+ out:
+	if (ret) {
+		sock_release(socket);
+		socket = ERR_PTR(ret);
+	}
+
+	return socket;
+}
+
diff --git a/net/socket.c b/net/socket.c
index 791d71a..be8c562 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,9 @@
 #include <net/sock.h>
 #include <linux/netfilter.h>
 
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
@@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
 	.sendpage =	sock_sendpage,
 	.splice_write = generic_splice_sendpage,
 	.splice_read =	sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint =   sock_file_checkpoint,
+#endif
 };
 
 /*
@@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags)
 	return fd;
 }
 
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr_file_socket *h;
+	int ret;
+	struct file *file = ptr;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_SOCKET;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = do_sock_file_checkpoint(ctx, file);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+	struct file *file;
+	int err;
+
+	file = get_empty_filp();
+	if (!file)
+		return ERR_PTR(ENOMEM);
+
+	err = sock_attach_fd(socket, file, 0);
+	if (err < 0) {
+		put_filp(file);
+		file = ERR_PTR(err);
+	}
+
+	return file;
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket *h = NULL;
+	struct socket *socket = NULL;
+	struct file *file = NULL;
+	int err;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+	if (IS_ERR(h))
+		return h;
+
+	socket = do_sock_file_restore(ctx, h);
+	if (IS_ERR(socket)) {
+		err = PTR_ERR(socket);
+		goto err_put;
+	}
+
+	file = sock_alloc_attach_fd(socket);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_release;
+	}
+
+	ckpt_hdr_put(ctx, h);
+
+	return file;
+
+ err_release:
+	sock_release(socket);
+ err_put:
+	ckpt_hdr_put(ctx, h);
+
+	return ERR_PTR(err);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
 	if (file->f_op == &socket_file_ops)
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-25  2:30   ` Oren Laadan
  2009-06-29 17:29     ` Dan Smith
  0 siblings, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-06-25  2:30 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
> has been tested with a single and multiple processes, and with data inflight
> at the time of checkpoint.  It supports socketpair()s, path-based, and
> abstract sockets.
> 
> Changes in v3:
>   - Move sock_file_checkpoint() above sock_file_restore()
>   - Change __sock_file_*() functions to do_sock_file_*()
>   - Adjust some of the struct cr_hdr_socket alignment
>   - Improve the sock_copy_buffers() algorithm to avoid locking the source
>     queue for the entire operation
>   - Fix alignment in the socket header struct(s)
>   - Move the per-protocol structure (ckpt_hdr_socket_un) out of the
>     common socket header and read/write it separately
>   - Fix missing call to sock_cptrst() in restore path
>   - Break out the socket joining into another function
>   - Fix failure to restore the socket address thus fixing getname()
>   - Check the state values on restart
>   - Fix case of state being TCP_CLOSE, which allows dgram sockets to be
>     properly connected (if appropriate) to their peer and maintain the
>     sockaddr for getname() operation
>   - Fix restoring a listening socket that has been unlink()'d
>   - Fix checkpointing sockets with an in-flight FD-passing SKB.  Fail
>     with EBUSY.
>   - Fix checkpointing listening sockets with an unaccepted connection.
>     Fail with EBUSY.
> 
> Changes in v2:
>   - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
>     to be rather common in other uses of skb_copy())
>   - Move the ckpt_hdr_socket structure definition to linux/socket.h
>   - Fix whitespace issue
>   - Move sock_file_checkpoint() to net/socket.c for symmetry
> 
> Cc: Oren Laaden <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> Cc: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/files.c             |    7 +
>  checkpoint/objhash.c           |   27 +++
>  include/linux/checkpoint_hdr.h |   13 +
>  include/linux/socket.h         |   59 +++++
>  include/net/sock.h             |    9 +
>  net/Makefile                   |    2 +
>  net/checkpoint.c               |  493 ++++++++++++++++++++++++++++++++++++++++
>  net/socket.c                   |   84 +++++++
>  8 files changed, 694 insertions(+), 0 deletions(-)
>  create mode 100644 net/checkpoint.c
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index b264e40..bb2cca0 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <net/sock.h>
>  
>  
>  /**************************************************************************
> @@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
>  		.file_type = CKPT_FILE_PIPE,
>  		.restore = pipe_file_restore,
>  	},
> +	/* socket */
> +	{
> +		.file_name = "SOCKET",
> +		.file_type = CKPT_FILE_SOCKET,
> +		.restore = sock_file_restore,
> +	},
>  };
>  
>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
> index 045a920..7819e5e 100644
> --- a/checkpoint/objhash.c
> +++ b/checkpoint/objhash.c
> @@ -19,6 +19,7 @@
>  #include <linux/ipc_namespace.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <net/sock.h>
>  
>  struct ckpt_obj;
>  struct ckpt_obj_ops;
> @@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
>  	return atomic_read(&((struct ipc_namespace *) ptr)->count);
>  }
>  
> +static int obj_sock_grab(void *ptr)
> +{
> +	sock_hold((struct sock *) ptr);
> +	return 0;
> +}
> +
> +static void obj_sock_drop(void *ptr)
> +{
> +	sock_put((struct sock *) ptr);
> +}
> +
> +static int obj_sock_users(void *ptr)
> +{
> +	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
> +}
> +
>  static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  	/* ignored object */
>  	{
> @@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  		.checkpoint = checkpoint_bad,
>  		.restore = restore_bad,
>  	},
> +	/* sock object */
> +	{
> +		.obj_name = "SOCKET",
> +		.obj_type = CKPT_OBJ_SOCK,
> +		.ref_drop = obj_sock_drop,
> +		.ref_grab = obj_sock_grab,
> +		.ref_users = obj_sock_users,
> +		.checkpoint = sock_file_checkpoint,
> +		.restore = sock_file_restore,
> +	},
>  };
>  
>  
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index cd427d8..60870df 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -76,6 +76,12 @@ enum {
>  	CKPT_HDR_IPC_MSG_MSG,
>  	CKPT_HDR_IPC_SEM,
>  
> + 	CKPT_HDR_FD_SOCKET = 601,
> + 	CKPT_HDR_SOCKET,
> + 	CKPT_HDR_SOCKET_BUFFERS,
> + 	CKPT_HDR_SOCKET_BUFFER,
> +	CKPT_HDR_SOCKET_UN,
> +
>  	CKPT_HDR_TAIL = 9001,
>  
>  	CKPT_HDR_ERROR = 9999,
> @@ -103,6 +109,7 @@ enum obj_type {
>  	CKPT_OBJ_NS,
>  	CKPT_OBJ_UTS_NS,
>  	CKPT_OBJ_IPC_NS,
> +	CKPT_OBJ_SOCK,
>  	CKPT_OBJ_MAX
>  };
>  
> @@ -225,6 +232,7 @@ enum file_type {
>  	CKPT_FILE_IGNORE = 0,
>  	CKPT_FILE_GENERIC,
>  	CKPT_FILE_PIPE,
> +	CKPT_FILE_SOCKET,
>  	CKPT_FILE_MAX
>  };
>  
> @@ -248,6 +256,11 @@ struct ckpt_hdr_file_pipe {
>  	__s32 pipe_objref;
>  } __attribute__((aligned(8)));
>  
> +struct ckpt_hdr_file_socket {
> +	struct ckpt_hdr_file common;
> +	__u16 family;
> +} __attribute__((aligned(8)));
> +
>  struct ckpt_hdr_file_pipe_state {
>  	struct ckpt_hdr h;
>  	__s32 pipe_len;
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 421afb4..3b5be70 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
>  #include <linux/uio.h>			/* iovec support		*/
>  #include <linux/types.h>		/* pid_t			*/
>  #include <linux/compiler.h>		/* __user			*/
> +#include <linux/checkpoint_hdr.h>	/* ckpt_hdr			*/
>  
>  #ifdef __KERNEL__
>  # ifdef CONFIG_PROC_FS
> @@ -323,5 +324,63 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
>  extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
>  
>  #endif
> +
> +struct ckpt_hdr_socket_un {
> +	struct ckpt_hdr h;
> +	__u32 this;
> +	__u32 peer;
> +	__u8 linked;
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket {
> +	struct ckpt_hdr h;
> +
> +	struct ckpt_socket { /* struct socket */
> +		__u64 flags;
> +		__u8 state;
> +	} socket __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock_common { /* struct sock_common */
> +		__u32 bound_dev_if;
> +		__u16 family;
> +		__u8 state;
> +		__u8 reuse;
> +	} sock_common __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock { /* struct sock */
> +		__u64 rcvlowat;
> +		__u64 rcvtimeo;
> +		__u64 sndtimeo;
> +		__u64 flags;
> +		__u64 lingertime;
> +
> +		__u32 err;
> +		__u32 err_soft;
> +		__u32 priority;
> +		__s32 rcvbuf;
> +		__s32 sndbuf;
> +		__u16 type;
> +		__u16 backlog;
> +
> +		__u8 protocol;
> +		__u8 state;
> +		__u8 shutdown;
> +		__u8 userlocks;
> +		__u8 no_check;
> +	} sock __attribute__ ((aligned(8)));
> +
> +	/* common to all supported families */
> +	__u32 laddr_len;
> +	__u32 raddr_len;
> +	struct sockaddr laddr;
> +	struct sockaddr raddr;
> +
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket_buffer {
> +	struct ckpt_hdr h;
> +	__u32 skb_count;
> +} __attribute__ ((aligned(8)));
> +
>  #endif /* not kernel and not glibc */
>  #endif /* _LINUX_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4bb1ff9..a8b6af1 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max;
>  extern __u32 sysctl_wmem_default;
>  extern __u32 sysctl_rmem_default;
>  
> +/* Checkpoint/Restart Functions */
> +struct ckpt_ctx;
> +struct ckpt_hdr_socket;
> +extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
> +extern void *sock_file_restore(struct ckpt_ctx *);
> +extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
> +					   struct ckpt_hdr_socket *);
> +extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +
>  #endif	/* _SOCK_H */
> diff --git a/net/Makefile b/net/Makefile
> index 9e00a55..c226ed1 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
>  obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
>  endif
>  obj-$(CONFIG_WIMAX)		+= wimax/
> +
> +obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> new file mode 100644
> index 0000000..fd47485
> --- /dev/null
> +++ b/net/checkpoint.c
> @@ -0,0 +1,493 @@
> +/*
> + *  Copyright 2009 IBM Corporation
> + *
> + *  Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + */
> +
> +#include <linux/socket.h>
> +#include <linux/mount.h>
> +#include <linux/file.h>
> +
> +#include <net/af_unix.h>
> +#include <net/tcp_states.h>
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/namei.h>
> +
> +/* Size of an empty struct sockaddr_un */
> +#define UNIX_LEN_EMPTY 2
> +
> +static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
> +{
> +	int count = 0;
> +	struct sk_buff *skb;
> +
> +	skb_queue_walk(from, skb) {
> +		struct sk_buff *tmp;
> +
> +		tmp = dev_alloc_skb(skb->len);
> +		if (!tmp)
> +			return -ENOMEM;
> +
> +		spin_lock(&from->lock);
> +		skb_morph(tmp, skb);
> +		spin_unlock(&from->lock);
> +
> +		skb_queue_tail(to, tmp);
> +		count++;
> +	}
> +
> +	return count;
> +}
> +
> +static int __sock_write_buffers(struct ckpt_ctx *ctx,
> +				struct sk_buff_head *queue)
> +{
> +	struct sk_buff *skb;
> +	int ret = 0;
> +
> +	skb_queue_walk(queue, skb) {
> +		if (UNIXCB(skb).fp) {
> +			ckpt_debug("unsupported fd-passing skb found\n");
> +			return -EBUSY;
> +		}
> +
> +		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> +					  CKPT_HDR_SOCKET_BUFFER);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff_head tmpq;
> +	int ret = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (!h)
> +		goto out;
> +
> +	skb_queue_head_init(&tmpq);
> +
> +	h->skb_count = sock_copy_buffers(queue, &tmpq);
> +	if (h->skb_count < 0) {
> +		ret = h->skb_count;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (!ret)
> +		ret = __sock_write_buffers(ctx, &tmpq);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	__skb_queue_purge(&tmpq);
> +
> +	return ret;
> +}
> +
> +static int sock_un_checkpoint(struct ckpt_ctx *ctx,
> +			      struct sock *sock,
> +			      struct ckpt_hdr_socket *h)
> +{
> +	struct unix_sock *sk = unix_sk(sock);
> +	struct unix_sock *pr = unix_sk(sk->peer);
> +	struct ckpt_hdr_socket_un *un;
> +	int new;
> +	int ret = -ENOMEM;
> +
> +	if ((sock->sk_state == TCP_LISTEN) &&
> +	    !skb_queue_empty(&sock->sk_receive_queue)) {
> +		ckpt_debug("listening socket has unaccepted peers");
> +		return -EBUSY;
> +	}
> +
> +	un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
> +	if (!un)
> +		goto out;
> +
> +	un->linked = sk->dentry && (sk->dentry->d_inode->i_nlink > 0);
> +
> +	un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
> +	if (un->this < 0)
> +		goto out;
> +
> +	if (sk->peer)
> +		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
> +	else
> +		un->peer = 0;
> +
> +	if (un->peer < 0) {
> +		ret = un->peer;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
> + out:
> +	return ret;
> +}
> +
> +static int sock_cptrst(struct ckpt_ctx *ctx,
> +		       struct sock *sock,
> +		       struct ckpt_hdr_socket *h,
> +		       int op)
> +{
> +	if (sock->sk_socket) {
> +		CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
> +		CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
> +	}
> +
> +	CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
> +	CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
> +	CKPT_COPY(op, h->sock_common.family, sock->sk_family);
> +

Value of some fields below needs to be verified/checked:

> +	CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);

Allow on SHUTDOWN_MASK (because in some places the code tests a flag
and in others a value - so it could be that {RCV,SND}_SHUTDOWN are
true, but != SHUTDOWN_MASK, which is unexpected by the code). Also
for future extensibility.

> +	CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks)

Here too, allow possible flags only.

> +	CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
> +	CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
> +	CKPT_COPY(op, h->sock.err, sock->sk_err);
> +	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);

I'm unsure if we need ensure that an error value is in range...
can it do harm to pass an out-of-range value back to the caller ?

> +	CKPT_COPY(op, h->sock.priority, sock->sk_priority);
> +	CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);

This is 'int' in struct sock, but assignment may make it negative ?

> +	CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);

Same.

> +	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
> +	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);

These two too. Perhaps refactor and use sock_set_timeout() ?

> +	CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
> +	CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);

These should be checked for sign-ness, limited in magnitude (e.g.
<= sysctl_wmem_max), checked against a minimum (see setting it
in net/core/sock.c), and finally perhaps correlated with a flag
against sk_userlocks ?

> +	CKPT_COPY(op, h->sock.flags, sock->sk_flags);
> +	CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);

Perhaps here too ?

> +	CKPT_COPY(op, h->sock.type, sock->sk_type);
> +	CKPT_COPY(op, h->sock.state, sock->sk_state);
> +

Do we need the tests below also during checkpoint ?

Perhaps place all the sanity tests in a separate routine that
is called from here as in:

	if (op == CKPT_RST)
		return sock_validate_data(...);
	else
		return 0;

> +	if ((h->socket.state == SS_CONNECTED) &&
> +	    (h->sock.state != TCP_ESTABLISHED)) {
> +		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
> +			   h->socket.state, h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->socket.state == SS_UNCONNECTED) &&
> +		   (h->sock.state == TCP_ESTABLISHED)) {
> +		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
> +			   h->socket.state, h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->sock.state < TCP_ESTABLISHED) ||
> +		   (h->sock.state >= TCP_MAX_STATES)) {
> +		ckpt_debug("sock in invalid state: %i\n", h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->socket.state < SS_FREE) ||
> +		   (h->socket.state > SS_DISCONNECTING)) {
> +		ckpt_debug("socket in invalid state: %i\n", h->socket.state);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct socket *socket = file->private_data;
> +	struct sock *sock = socket->sk;
> +	struct ckpt_hdr_socket *h;
> +	int ret = 0;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->laddr_len = sizeof(h->laddr);
> +	h->raddr_len = sizeof(h->raddr);
> +
> +	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((sock->sk_state != TCP_LISTEN) &&
> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
> +		if (sock->sk_type != SOCK_DGRAM) {
> +			ret = -EINVAL;
> +			goto out;
> +		}

I suspect this fails for !SOCK_DGRAM that isn't connected, e.g. newly
created.

> +		h->raddr_len = 0;

Hmmm.. do you need to ensure that this value is consistent with the
value of ->peer ?  (at restart)

> +	}
> +
> +	sock_cptrst(ctx, sock, h, CKPT_CPT);
> +
> +	if (sock->sk_family == AF_UNIX) {
> +		ret = sock_un_checkpoint(ctx, sock, h);
> +		if (ret)
> +			goto out;
> +	} else {
> +		ckpt_debug("unsupported socket type %i\n",
> +			   sock->sk_family);

I'd suggest a ckpt_write_err() here (and in other key places) to
explain to the user what caused the checkpoint to fail.

> +		ret = EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
> +	if (ret)
> +		goto out;

Any reason to write the buffers of a listening socket ?  they only
contain not-yet-accepted sockest, which aren't supported anyway.

And while there is not support for not-yet-accepted sockets, maybe
add an explicit test (if LISTEN and has skb's) and fail with an
error, as well as call ckpt_write_err().

> +
> +	/* FIXME: write out-of-order queue for TCP */

I'd prefer failing if such data exists...

> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sock *sock,
> +			    struct sk_buff **skb)
> +{
> +	struct ckpt_hdr *h;
> +	int ret = 0;
> +	int len;
> +
> +	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = h->len - sizeof(*h);
> +
> +	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
> +	if (*skb == NULL) {
> +		ret = ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);

Can we avoid this memory copy ?
Perhaps add _ckpt_read_hdr() and _ckpt_read_payload() helpers ?

> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static int sock_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sock *sock,
> +			     struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (IS_ERR(h)) {
> +		ret = PTR_ERR(h);
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		struct sk_buff *skb = NULL;
> +
> +		ret = sock_read_buffer(ctx, sock, &skb);
> +		if (ret)
> +			break;
> +
> +		skb_queue_tail(queue, skb);
> +	}

Make sure that total length is within limits, or this is an opening
for DoS.

> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static struct unix_address *sock_un_makeaddr(struct sockaddr_un *sun_addr,
> +					     unsigned len)
> +{
> +	struct unix_address *addr;
> +
> +	addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
> +	if (!addr)
> +		return ERR_PTR(ENOMEM);

I checked below, but you don't examine @len before calling this
function. So a DoS is possible by exhausting kernel memory with
these kmalloc()s.

> +
> +	memcpy(addr->name, sun_addr, len);
> +	addr->len = len;
> +	atomic_set(&addr->refcnt, 1);
> +
> +	return addr;
> +}
> +
> +static int sock_un_join(struct sock *a,
> +			struct sock *b,
> +			struct ckpt_hdr_socket *h)
> +{
> +	struct unix_address *addr;
> +
> +	sock_hold(a);
> +	sock_hold(b);
> +
> +	unix_sk(a)->peer = b;
> +	unix_sk(b)->peer = a;
> +
> +	a->sk_peercred.pid = task_tgid_vnr(current);
> +	current_euid_egid(&a->sk_peercred.uid,
> +			  &a->sk_peercred.gid);
> +
> +	b->sk_peercred.pid = task_tgid_vnr(current);
> +	current_euid_egid(&b->sk_peercred.uid,
> +			  &b->sk_peercred.gid);
> +

Hmmm... a socket that is connected may also be bind(), so I'm
not sure the test below is always correct ?

Also, this is the place to verify that the {l,r}addr_len make
sense, to prevent the DoS mentioned above.

> +	if (h->laddr_len == UNIX_LEN_EMPTY)
> +		addr = sock_un_makeaddr((struct sockaddr_un *)&h->raddr,
> +					h->raddr_len);
> +	else
> +		addr = sock_un_makeaddr((struct sockaddr_un *)&h->laddr,
> +					h->laddr_len);
> +	if (IS_ERR(addr))
> +	    return PTR_ERR(addr);
> +
> +	atomic_inc(&addr->refcnt); /* Held by both ends */
> +	unix_sk(a)->addr = unix_sk(b)->addr = addr;

A thought: do we want to restore sharing of addresses at restart ?
For instance, all sockets accepted from a single listening socket
share their local addresses.

> +
> +	return 0;
> +}
> +
> +static int sock_un_unlink(const char *name)
> +{
> +	struct path spath;
> +	struct path ppath;
> +	int ret;
> +
> +	ret = kern_path(name, 0, &spath);
> +	if (ret)
> +		return ret;
> +
> +	ret = kern_path(name, LOOKUP_PARENT, &ppath);
> +	if (ret)
> +		goto out_s;
> +
> +	if (!spath.dentry) {
> +		ckpt_debug("No dentry found for %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	if (!ppath.dentry || !ppath.dentry->d_inode) {
> +		ckpt_debug("No inode for parent of %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
> + out_p:
> +	path_put(&ppath);
> + out_s:
> +	path_put(&spath);
> +
> +	return ret;
> +}
> +
> +static int sock_un_restart(struct ckpt_ctx *ctx,
> +			   struct ckpt_hdr_socket *h,
> +			   struct socket *socket)
> +{
> +	struct sock *peer;
> +	struct ckpt_hdr_socket_un *un;
> +	int ret = 0;
> +
> +	un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
> +	if (IS_ERR(un))
> +		return PTR_ERR(un);
> +
> +	if (un->peer < 0) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
> +
> +	if ((h->sock.state == TCP_ESTABLISHED) ||
> +	    (h->sock.state == TCP_CLOSE)) {

Here below, @peer may hold ERR_PTR(-EINVAL) meaning not found,
or ERR_PTR(-ENOMSG) meaning type mismatch and bad input. You
need to test for the latter.

> +		if (!IS_ERR(peer)) {
> +			/* We're last, so join with peer */
> +			struct sock *this = socket->sk;
> +
> +			ret = sock_un_join(this, peer, h);
> +		} else {

So this should be:
		} else if (PTR_ERR(peer) == -EINVAL) {

> +			/* We're first, so add our socket and wait for peer */
> +			ckpt_obj_insert(ctx, socket->sk, un->this,
> +					CKPT_OBJ_SOCK);
		} else {
			ret = PTR_ERR(peer);
> +		}

Also, need to check ret value of ckpt_obj_insert().

> +
> +	} else if (h->sock.state == TCP_LISTEN) {
> +		ret = socket->ops->bind(socket,
> +					(struct sockaddr *)&h->laddr,
> +					h->laddr_len);

Three comments:

1) Non-listening sockets may also need bind(). Especially SOCK_DGRAM.

2) What happens if s1 is checkpointed before s2 in this scenario:
	s1 = socket(); bind(s1, addr); unlink(addr);
	s2 = socket(); bind(s1, addr);

I'd suggest that a socket that is did bind() and that corresponding
inode was later unlink()ed, will not be re-bind at all. After all,
it is unreachable in terms of connect() syscall.

3) If the sockets path is relative, you can't blindly rely on the socket
address reported by ->getname(), because the original bind() occured
_relative_ to the task's current-working-directory at the time of the
socket() syscall.

The cwd of the restarting task may differ. Moreover, the resulting
path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname()
reports the original, relative pathname anyways.

So if the address doesn't begin with a '/', you need to also save
the cwd and on restart, go there before caling bind().

> +		if (ret < 0)
> +			goto out;
> +
> +		ret = socket->ops->listen(socket, h->sock.backlog);
> +		if (ret < 0)
> +			goto out;
> +
> +		/* We can unlink this blindly because we just created it
> +		 * above and would have failed already without proper
> +		 * permissions
> +		 */
> +		if (!un->linked) {
> +			struct sockaddr_un *sun =
> +				(struct sockaddr_un *)&h->laddr;
> +			ret = sock_un_unlink(sun->sun_path);

Here, too, need to be aware of the cwd of the restarting task.

> +		}
> +	} else
> +		ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
> + out:
> +	ckpt_hdr_put(ctx, un);
> +	return ret;
> +}
> +
> +struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
> +				    struct ckpt_hdr_socket *h)
> +{
> +	struct socket *socket;
> +	int ret;
> +
> +	ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	if (h->sock_common.family == AF_UNIX) {
> +		ret = sock_un_restart(ctx, h, socket);
> +		ckpt_debug("sock_un_restart: %i\n", ret);
> +	} else {
> +		ckpt_debug("unsupported family %i\n", h->sock_common.family);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	sock_cptrst(ctx, socket->sk, h, CKPT_RST);

Check return value ?

> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
> +	if (ret)
> +		goto out;

Only read buffers if really need to - i.e. not listening socket,
not closed socket.

> + out:
> +	if (ret) {
> +		sock_release(socket);
> +		socket = ERR_PTR(ret);
> +	}
> +
> +	return socket;
> +}
> +
> diff --git a/net/socket.c b/net/socket.c
> index 791d71a..be8c562 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -96,6 +96,9 @@
>  #include <net/sock.h>
>  #include <linux/netfilter.h>
>  
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
>  static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
>  static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
>  			 unsigned long nr_segs, loff_t pos);
> @@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
>  	.sendpage =	sock_sendpage,
>  	.splice_write = generic_splice_sendpage,
>  	.splice_read =	sock_splice_read,
> +#ifdef CONFIG_CHECKPOINT
> +	.checkpoint =   sock_file_checkpoint,
> +#endif
>  };
>  
>  /*
> @@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags)
>  	return fd;
>  }
>  
> +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
> +{
> +	struct ckpt_hdr_file_socket *h;
> +	int ret;
> +	struct file *file = ptr;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->common.f_type = CKPT_FILE_SOCKET;
> +
> +	ret = checkpoint_file_common(ctx, file, &h->common);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = do_sock_file_checkpoint(ctx, file);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static struct file *sock_alloc_attach_fd(struct socket *socket)
> +{
> +	struct file *file;
> +	int err;
> +
> +	file = get_empty_filp();
> +	if (!file)
> +		return ERR_PTR(ENOMEM);
> +
> +	err = sock_attach_fd(socket, file, 0);
> +	if (err < 0) {
> +		put_filp(file);
> +		file = ERR_PTR(err);
> +	}
> +
> +	return file;
> +}
> +
> +void *sock_file_restore(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket *h = NULL;
> +	struct socket *socket = NULL;
> +	struct file *file = NULL;
> +	int err;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (IS_ERR(h))
> +		return h;
> +
> +	socket = do_sock_file_restore(ctx, h);
> +	if (IS_ERR(socket)) {
> +		err = PTR_ERR(socket);
> +		goto err_put;
> +	}
> +
> +	file = sock_alloc_attach_fd(socket);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto err_release;
> +	}
> +
> +	ckpt_hdr_put(ctx, h);
> +
> +	return file;
> +
> + err_release:
> +	sock_release(socket);
> + err_put:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ERR_PTR(err);
> +}
> +
>  static struct socket *sock_from_file(struct file *file, int *err)
>  {
>  	if (file->f_op == &socket_file_ops)

Ok ... waiting for next round :)

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
  2009-06-25  2:30   ` Oren Laadan
@ 2009-06-29 17:29     ` Dan Smith
       [not found]       ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-06-29 17:29 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> Value of some fields below needs to be verified/checked:

Yep, okay, I think I've got some sane sanity checks in place now.

>> +	CKPT_COPY(op, h->sock.err, sock->sk_err);
>> +	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);

OL> I'm unsure if we need ensure that an error value is in range...
OL> can it do harm to pass an out-of-range value back to the caller ?

I don't think any of the kernel code would care, but it's probably
best to make sure that it's in a reasonable range.

OL> This is 'int' in struct sock, but assignment may make it negative
OL> ?

Fixed this and others.

>> +	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
>> +	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);

OL> These two too. Perhaps refactor and use sock_set_timeout() ?

From the look of sock_set_timeout() I'm not sure it's expected to be
called from kernel space.  Regardless, I think just setting it (and
not corrupting it) should be fine.

OL> These should be checked for sign-ness, limited in magnitude (e.g.
OL> <= sysctl_wmem_max), checked against a minimum (see setting it in
OL> net/core/sock.c), and finally perhaps correlated with a flag
OL> against sk_userlocks ?

Yep, fixed.

>> +	if ((sock->sk_state != TCP_LISTEN) &&
>> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
>> +		if (sock->sk_type != SOCK_DGRAM) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}

OL> I suspect this fails for !SOCK_DGRAM that isn't connected,
OL> e.g. newly created.

Okay, yeah.  I changed the logic to only bail if it's a DGRAM socket
and the state is TCP_ESTABLISHED, which I think is probably what we
want.  Since we bail on the transitional TCP states, I think this
should be okay.

>> +		h->raddr_len = 0;

OL> Hmmm.. do you need to ensure that this value is consistent with
OL> the value of ->peer ?  (at restart)

I dunno, why would I?  If I hook up the two peers to the same address
structure on restart everything should be correct, no?

As somewhat of a side question, is there a rule that says that
getname() must return the exact same thing each time it is called?

OL> I'd suggest a ckpt_write_err() here (and in other key places) to
OL> explain to the user what caused the checkpoint to fail.

Yep, I hadn't followed the introduction of that, but it's a very
welcome addition.  I've added calls in the appropriate places.

OL> Any reason to write the buffers of a listening socket ?  they only
OL> contain not-yet-accepted sockest, which aren't supported anyway.

Nope, fixed.

OL> And while there is not support for not-yet-accepted sockets, maybe
OL> add an explicit test (if LISTEN and has skb's) and fail with an
OL> error, as well as call ckpt_write_err().

Isn't that just this (from the patch):

+	if ((sock->sk_state == TCP_LISTEN) &&
+	    !skb_queue_empty(&sock->sk_receive_queue)) {
+		ckpt_debug("listening socket has unaccepted peers");
+		return -EBUSY;
+	}

?  I've put a ckpt_write_err() there now...

>> +	/* FIXME: write out-of-order queue for TCP */

OL> I'd prefer failing if such data exists...

Yep, indeed.  I've removed the comment here and will augment the INET
patch to check for that and fail appropriately.

>> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);

OL> Can we avoid this memory copy ?  Perhaps add _ckpt_read_hdr() and
OL> _ckpt_read_payload() helpers ?

Okay, yep.

>> +static int sock_read_buffers(struct ckpt_ctx *ctx,
>> +			     struct sock *sock,
>> +			     struct sk_buff_head *queue)
>> +{
>> +	struct ckpt_hdr_socket_buffer *h;
>> +	int ret = 0;
>> +	int i;
>> +
>> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
>> +	if (IS_ERR(h)) {
>> +		ret = PTR_ERR(h);
>> +		goto out;
>> +	}
>> +
>> +	for (i = 0; i < h->skb_count; i++) {
>> +		struct sk_buff *skb = NULL;
>> +
>> +		ret = sock_read_buffer(ctx, sock, &skb);
>> +		if (ret)
>> +			break;
>> +
>> +		skb_queue_tail(queue, skb);
>> +	}

OL> Make sure that total length is within limits, or this is an
OL> opening for DoS.

You mean the number of buffers?  The sock_read_buffer() function will
limit each skb to SKB_MAX_ALLOC.  What is a sane maximum number of
buffers?  Maybe we need a sysctl for that?

OL> I checked below, but you don't examine @len before calling this
OL> function. So a DoS is possible by exhausting kernel memory with
OL> these kmalloc()s.

Yep, fixed.

OL> Hmmm... a socket that is connected may also be bind(), so I'm not
OL> sure the test below is always correct ?

You mean because it could have been bind()'ed to a different address
than it's peer before it called connect()?  Wouldn't the connect()
have changed the local address?

OL> A thought: do we want to restore sharing of addresses at restart ?
OL> For instance, all sockets accepted from a single listening socket
OL> share their local addresses.

For INET it seems important, but I don't think I've seen anything in
the UNIX code that cares.  However, I can move some of the
parent-tracking stuff to this patch in order to hook up the shared
addresses if you think it's important.

OL> 1) Non-listening sockets may also need bind(). Especially
OL> SOCK_DGRAM.

Perhaps if the socket is not established and had a local address, we
call bind?

OL> 2) What happens if s1 is checkpointed before s2 in this scenario:
OL> 	s1 = socket(); bind(s1, addr); unlink(addr);
OL> 	s2 = socket(); bind(s1, addr);

OL> I'd suggest that a socket that is did bind() and that
OL> corresponding inode was later unlink()ed, will not be re-bind at
OL> all. After all, it is unreachable in terms of connect() syscall.

I'm not sure what you're saying here...

OL> 3) If the sockets path is relative, you can't blindly rely on the
OL> socket address reported by ->getname(), because the original
OL> bind() occured _relative_ to the task's current-working-directory
OL> at the time of the socket() syscall.

OL> The cwd of the restarting task may differ. Moreover, the resulting
OL> path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname()
OL> reports the original, relative pathname anyways.

OL> So if the address doesn't begin with a '/', you need to also save
OL> the cwd and on restart, go there before caling bind().

But the app could have changed its cwd after binding the socket so the
cwd of the checkpointed task isn't what we want, right?  Maybe we
could do a dentry lookup to find the actual path it's bound to and
store that separately from the local and remote addresses?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]       ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-07-02  9:15         ` Oren Laadan
  2009-07-06 18:31           ` Dan Smith
  0 siblings, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-02  9:15 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> OL> Value of some fields below needs to be verified/checked:
> 
> Yep, okay, I think I've got some sane sanity checks in place now.
> 
>>> +	CKPT_COPY(op, h->sock.err, sock->sk_err);
>>> +	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);
> 
> OL> I'm unsure if we need ensure that an error value is in range...
> OL> can it do harm to pass an out-of-range value back to the caller ?
> 
> I don't think any of the kernel code would care, but it's probably
> best to make sure that it's in a reasonable range.
> 
> OL> This is 'int' in struct sock, but assignment may make it negative
> OL> ?
> 
> Fixed this and others.
> 
>>> +	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
>>> +	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);
> 
> OL> These two too. Perhaps refactor and use sock_set_timeout() ?
> 
> From the look of sock_set_timeout() I'm not sure it's expected to be
> called from kernel space.  Regardless, I think just setting it (and
> not corrupting it) should be fine.
> 
> OL> These should be checked for sign-ness, limited in magnitude (e.g.
> OL> <= sysctl_wmem_max), checked against a minimum (see setting it in
> OL> net/core/sock.c), and finally perhaps correlated with a flag
> OL> against sk_userlocks ?
> 
> Yep, fixed.
> 
>>> +	if ((sock->sk_state != TCP_LISTEN) &&
>>> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
>>> +		if (sock->sk_type != SOCK_DGRAM) {
>>> +			ret = -EINVAL;
>>> +			goto out;
>>> +		}
> 
> OL> I suspect this fails for !SOCK_DGRAM that isn't connected,
> OL> e.g. newly created.
> 
> Okay, yeah.  I changed the logic to only bail if it's a DGRAM socket
> and the state is TCP_ESTABLISHED, which I think is probably what we
> want.  Since we bail on the transitional TCP states, I think this
> should be okay.
> 
>>> +		h->raddr_len = 0;
> 
> OL> Hmmm.. do you need to ensure that this value is consistent with
> OL> the value of ->peer ?  (at restart)
> 
> I dunno, why would I?  If I hook up the two peers to the same address
> structure on restart everything should be correct, no?

Because in the kernel:
	"->peer != NULL"  if-and-only-if   "h->raddr is meaningful"
so I wonder whether you should detect such input and complain,
in case a malicious user does that ?

> 
> As somewhat of a side question, is there a rule that says that
> getname() must return the exact same thing each time it is called?

I, for one, would expect it to remain the same unless my program
re-connected the sockets.

Or better: is there a reason why we would want to behave otherwise ?

> 
> OL> I'd suggest a ckpt_write_err() here (and in other key places) to
> OL> explain to the user what caused the checkpoint to fail.
> 
> Yep, I hadn't followed the introduction of that, but it's a very
> welcome addition.  I've added calls in the appropriate places.
> 
> OL> Any reason to write the buffers of a listening socket ?  they only
> OL> contain not-yet-accepted sockest, which aren't supported anyway.
> 
> Nope, fixed.
> 
> OL> And while there is not support for not-yet-accepted sockets, maybe
> OL> add an explicit test (if LISTEN and has skb's) and fail with an
> OL> error, as well as call ckpt_write_err().
> 
> Isn't that just this (from the patch):

Yes. I guess I misread the code.

> 
> +	if ((sock->sk_state == TCP_LISTEN) &&
> +	    !skb_queue_empty(&sock->sk_receive_queue)) {
> +		ckpt_debug("listening socket has unaccepted peers");
> +		return -EBUSY;
> +	}
> 
> ?  I've put a ckpt_write_err() there now...
> 
>>> +	/* FIXME: write out-of-order queue for TCP */
> 
> OL> I'd prefer failing if such data exists...
> 
> Yep, indeed.  I've removed the comment here and will augment the INET
> patch to check for that and fail appropriately.
> 
>>> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);
> 
> OL> Can we avoid this memory copy ?  Perhaps add _ckpt_read_hdr() and
> OL> _ckpt_read_payload() helpers ?
> 
> Okay, yep.
> 
>>> +static int sock_read_buffers(struct ckpt_ctx *ctx,
>>> +			     struct sock *sock,
>>> +			     struct sk_buff_head *queue)
>>> +{
>>> +	struct ckpt_hdr_socket_buffer *h;
>>> +	int ret = 0;
>>> +	int i;
>>> +
>>> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
>>> +	if (IS_ERR(h)) {
>>> +		ret = PTR_ERR(h);
>>> +		goto out;
>>> +	}
>>> +
>>> +	for (i = 0; i < h->skb_count; i++) {
>>> +		struct sk_buff *skb = NULL;
>>> +
>>> +		ret = sock_read_buffer(ctx, sock, &skb);
>>> +		if (ret)
>>> +			break;
>>> +
>>> +		skb_queue_tail(queue, skb);
>>> +	}
> 
> OL> Make sure that total length is within limits, or this is an
> OL> opening for DoS.
> 
> You mean the number of buffers?  The sock_read_buffer() function will
> limit each skb to SKB_MAX_ALLOC.  What is a sane maximum number of
> buffers?  Maybe we need a sysctl for that?

Well, it should not be more than allowed by ->rcvbuf, or some other
system limits (which I'm sure exists). At least apply the restrictions
put by a regular read() syscall.

One exception is when a socket's rcvbuf fills with large amount
of data, and then the user calls setsockopt() to reduce the buffer
to a lower size... in which case relying on the restored value for
->rcvbuf limit is clearly wrong.

> 
> OL> I checked below, but you don't examine @len before calling this
> OL> function. So a DoS is possible by exhausting kernel memory with
> OL> these kmalloc()s.
> 
> Yep, fixed.
> 
> OL> Hmmm... a socket that is connected may also be bind(), so I'm not
> OL> sure the test below is always correct ?
> 
> You mean because it could have been bind()'ed to a different address
> than it's peer before it called connect()?  Wouldn't the connect()
> have changed the local address?

Nope.

The connect() syscall will auto-bind only if the socket isn't
already bound. And in the case of ad_unix, even that happens only
when SOCK_PASSCRED bit is set in sock->flags.

> 
> OL> A thought: do we want to restore sharing of addresses at restart ?
> OL> For instance, all sockets accepted from a single listening socket
> OL> share their local addresses.
> 
> For INET it seems important, but I don't think I've seen anything in
> the UNIX code that cares.  However, I can move some of the
> parent-tracking stuff to this patch in order to hook up the shared
> addresses if you think it's important.

Don't think it's important, definitely not for af_unix (the amount
of memory spent isn't worth the optimization, I reckon). But INET
(or other sockets) may indeed require this.

> 
> OL> 1) Non-listening sockets may also need bind(). Especially
> OL> SOCK_DGRAM.
> 
> Perhaps if the socket is not established and had a local address, we
> call bind?

Even an established socket may have a local address that needs to
be saved, to make getsockname() consistent across c/r.

> 
> OL> 2) What happens if s1 is checkpointed before s2 in this scenario:
> OL> 	s1 = socket(); bind(s1, addr); unlink(addr);
> OL> 	s2 = socket(); bind(s1, addr);
			^^^^^^^^
			   (s2, addr)    <-- should have been	
> 
> OL> I'd suggest that a socket that is did bind() and that
> OL> corresponding inode was later unlink()ed, will not be re-bind at
> OL> all. After all, it is unreachable in terms of connect() syscall.
> 
> I'm not sure what you're saying here...
>

Neither was I  :p

Now with the fixed example, the problem is if s2 is restored first,
the restore of s1 will fail, or succeed and make s1 unreachable.

However, we know that s1 is no longer reachable through connect()
(or sendto() for dgram) - so during restart we want to manually
set the local address, without the side-effect of creating an inode
in the filesystem, for those sockets that are 1) bound with pathname
address, and 2) the corresponding socket inode isn't unlinked.

> OL> 3) If the sockets path is relative, you can't blindly rely on the
> OL> socket address reported by ->getname(), because the original
> OL> bind() occured _relative_ to the task's current-working-directory
> OL> at the time of the socket() syscall.
> 
> OL> The cwd of the restarting task may differ. Moreover, the resulting
> OL> path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname()
> OL> reports the original, relative pathname anyways.
> 
> OL> So if the address doesn't begin with a '/', you need to also save
> OL> the cwd and on restart, go there before caling bind().
> 
> But the app could have changed its cwd after binding the socket so the
> cwd of the checkpointed task isn't what we want, right?  Maybe we
> could do a dentry lookup to find the actual path it's bound to and
> store that separately from the local and remote addresses?
> 

I meant "cwd at the time of the bind()". To find it, you can "subtract"
the sock-address from the total pathname of the inode (assuming, of
course, that the address is relative).

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
  2009-07-02  9:15         ` Oren Laadan
@ 2009-07-06 18:31           ` Dan Smith
       [not found]             ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-07-06 18:31 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> Because in the kernel: "->peer != NULL" if-and-only-if "h->raddr
OL> is meaningful" so I wonder whether you should detect such input
OL> and complain, in case a malicious user does that ?

Ah, sorry, I misunderstood.  Yep, agreed.  I was thinking I was
covered on that by the use of the regular functions but my makeaddr
function would let it slip by, so I've fixed that.

OL> Well, it should not be more than allowed by ->rcvbuf, or some
OL> other system limits (which I'm sure exists). At least apply the
OL> restrictions put by a regular read() syscall.

Relying on the stored rcvbuf doesn't help prevent a DoS, right?

OL> One exception is when a socket's rcvbuf fills with large amount of
OL> data, and then the user calls setsockopt() to reduce the buffer to
OL> a lower size... in which case relying on the restored value for
OL> -> rcvbuf limit is clearly wrong.

Hmm, I don't know why that's the case.  Even still, I don't see why we
should rely on the stored version of rcvbuf at all, aside from maybe
just checking that the socket buffers are smaller than the claimed
rcvbuf size of correctness' sake.  However, checking against an
existing sysctl or other limit (if there is one, still need to look)
is obvious a good idea.

OL> The connect() syscall will auto-bind only if the socket isn't
OL> already bound. And in the case of ad_unix, even that happens only
OL> when SOCK_PASSCRED bit is set in sock->flags.

Well, I'm not really sure why that makes sense (not arguing that it's
true).  Perhaps a comment about this potential issue will suffice for
now?

OL> Even an established socket may have a local address that needs to
OL> be saved, to make getsockname() consistent across c/r.

...right, but that's already restored properly, no?  My test case
shows that established STREAM sockets have the same output of
getsockname() before and after restart.

OL> s1 = socket(); bind(s1, addr); unlink(addr);
OL> s2 = socket(); bind(s1, addr);
OL> 			^^^^^^^^
OL>                    (s2, addr)    <-- should have been	

OL> Now with the fixed example, the problem is if s2 is restored first,
OL> the restore of s1 will fail, or succeed and make s1 unreachable.

I think you meant "restore of s1 will fail, or succeed and make *s2*
unreachable".

OL> However, we know that s1 is no longer reachable through connect()
OL> (or sendto() for dgram) - so during restart we want to manually
OL> set the local address, without the side-effect of creating an
OL> inode in the filesystem, for those sockets that are 1) bound with
OL> pathname address, and 2) the corresponding socket inode isn't
OL> unlinked.

Okay, done.

OL> I meant "cwd at the time of the bind()". To find it, you can
OL> "subtract" the sock-address from the total pathname of the inode
OL> (assuming, of course, that the address is relative).

Right, right.  I was thinking that the socket kept an inode which
would make it hard to determine the full path, but it's a dentry.
I'll work that in.

Thanks!

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]             ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-07-06 19:06               ` Oren Laadan
       [not found]                 ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-06 19:06 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> OL> Because in the kernel: "->peer != NULL" if-and-only-if "h->raddr
> OL> is meaningful" so I wonder whether you should detect such input
> OL> and complain, in case a malicious user does that ?
> 
> Ah, sorry, I misunderstood.  Yep, agreed.  I was thinking I was
> covered on that by the use of the regular functions but my makeaddr
> function would let it slip by, so I've fixed that.
> 
> OL> Well, it should not be more than allowed by ->rcvbuf, or some
> OL> other system limits (which I'm sure exists). At least apply the
> OL> restrictions put by a regular read() syscall.
> 
> Relying on the stored rcvbuf doesn't help prevent a DoS, right?

Yes, of course, I meant a sane ->rcvbuf :)

> 
> OL> One exception is when a socket's rcvbuf fills with large amount of
> OL> data, and then the user calls setsockopt() to reduce the buffer to
> OL> a lower size... in which case relying on the restored value for
> OL> -> rcvbuf limit is clearly wrong.
> 
> Hmm, I don't know why that's the case.  Even still, I don't see why we
> should rely on the stored version of rcvbuf at all, aside from maybe
> just checking that the socket buffers are smaller than the claimed
> rcvbuf size of correctness' sake.  However, checking against an
> existing sysctl or other limit (if there is one, still need to look)
> is obvious a good idea.

Yes.

In fact, if you take a look at sock_setsockopt(), it already has the
logic to impose (or not to) a limit on sk_rcvbuf.

Maybe do something like:
	...
	sock_setsockopt(sock, ..., saved_rcvbuf_length);
	sock_read_buffers(..., saved_recvbuf_length)
	sock_setsockopt(sock, ..., saved_rcvbuf_val);
	...

So first you set the limit to the actual size you are going to restore,
if that succeeds, you restore the buffer (guaranteed to have enough
room), then restore the saved sk_rcvbuf value.  Permission tests are
on the house.

(You'll need to slightly refactor sock_setsockopt() for that).

> 
> OL> The connect() syscall will auto-bind only if the socket isn't
> OL> already bound. And in the case of ad_unix, even that happens only
> OL> when SOCK_PASSCRED bit is set in sock->flags.
> 
> Well, I'm not really sure why that makes sense (not arguing that it's
> true).  Perhaps a comment about this potential issue will suffice for
> now?

Comment about what ?  I was just describing the behavior ...

> 
> OL> Even an established socket may have a local address that needs to
> OL> be saved, to make getsockname() consistent across c/r.
> 
> ...right, but that's already restored properly, no?  My test case
> shows that established STREAM sockets have the same output of
> getsockname() before and after restart.

Indeed this test passes. But it is insufficient. Looking at v4 of
the patch, for established sockets, for the auto-bind case of
connect(), you are mostly correct. But what about --

1) 	s = socket(.., SOCK_DGRAM,...);
	bind(s, any_addr);
--> that is, s isn't connected to/from another socket.

2) 	s = socket(.., SOCK_DGRAM,...);
	bind(s, any_addr);
	connect(s, other_addr, ...);
--> now s is connected, but after restart you can't connect another
socket to it because the address wasn't bind() properly.

3)	s = socket(..., SOCK_STREAM,...);
	bind(s, any_addr);
	connect(s, other_addr, ...);
--> here, too, s is not properly bind(), so if after restart someone
tries to:
	r = socket(..., ..., ...);
	bind(r, any_addr);
the bind() will succeed after restart, but will have failed on the
original system where the checkpoint was taken.

IOW: when you restore the address of a socket, you need to emulate
the operation of bind(), not merely memcpy the address into place.
That's the reason why I originally had suggested to imitate the
usual sequence of connection sockets to restore them.

(And if the address was a pathname, but already unlinked, then
also unlink after the bind, FWIW).

Oren.

> 
> OL> s1 = socket(); bind(s1, addr); unlink(addr);
> OL> s2 = socket(); bind(s1, addr);
> OL> 			^^^^^^^^
> OL>                    (s2, addr)    <-- should have been	
> 
> OL> Now with the fixed example, the problem is if s2 is restored first,
> OL> the restore of s1 will fail, or succeed and make s1 unreachable.
> 
> I think you meant "restore of s1 will fail, or succeed and make *s2*
> unreachable".
> 
> OL> However, we know that s1 is no longer reachable through connect()
> OL> (or sendto() for dgram) - so during restart we want to manually
> OL> set the local address, without the side-effect of creating an
> OL> inode in the filesystem, for those sockets that are 1) bound with
> OL> pathname address, and 2) the corresponding socket inode isn't
> OL> unlinked.
> 
> Okay, done.
> 
> OL> I meant "cwd at the time of the bind()". To find it, you can
> OL> "subtract" the sock-address from the total pathname of the inode
> OL> (assuming, of course, that the address is relative).
> 
> Right, right.  I was thinking that the socket kept an inode which
> would make it hard to determine the full path, but it's a dentry.
> I'll work that in.
> 
> Thanks!
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                 ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-07-06 22:12                   ` Dan Smith
       [not found]                     ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-07-06 22:12 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> (You'll need to slightly refactor sock_setsockopt() for that).

It seems much easier and less invasive to just check against
sysctl_{r,w}mem_max.  I've got that added to the patch now.

OL> 2) 	s = socket(.., SOCK_DGRAM,...);
OL> 	bind(s, any_addr);
OL> 	connect(s, other_addr, ...);
--> now s is connected, but after restart you can't connect another
OL> socket to it because the address wasn't bind() properly.

Okay, I guess that's true.  So, since there isn't a "I'm bound, but
not listening or connected" flag anywhere, does it suffice to bind()
any socket that is not connected but that does have a local address?
Sockets that get a local address via connect() should never transition
through that state, so I think that should work.  At least for INET,
any socket that is restored into a connected state is properly hashed
such that another socket can't bind() to its local address (tested).

OL> (And if the address was a pathname, but already unlinked, then
OL> also unlink after the bind, FWIW).

For path-based UNIX sockets, we don't care about this exclusion,
right?  As long as we make the socket owner think everything is as it
was, that is.  Given that a normal system doesn't fail the bind of b
in this case:

  a = socket(AF_UNIX);
  b = socket(AF_UNIX);

  bind(a, addr);
  unlink(addr);
  bind(b, addr);

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                     ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-07-06 22:46                       ` Oren Laadan
       [not found]                         ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-06 22:46 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> OL> (You'll need to slightly refactor sock_setsockopt() for that).
> 
> It seems much easier and less invasive to just check against
> sysctl_{r,w}mem_max.  I've got that added to the patch now.

But there are two cases:  if you are CAP_NET_ADMIN you are allowed
to go beyond that limit. So you need to add that test too.

And in general, this helps to keep the checks - be it security,
resource limits, or whatever - in one place, instead of having
to duplicate code and, more importantly, risk getting out of
sync with the original checks (e.g., if sock_setsockopt changes).

> 
> OL> 2) 	s = socket(.., SOCK_DGRAM,...);
> OL> 	bind(s, any_addr);
> OL> 	connect(s, other_addr, ...);
> --> now s is connected, but after restart you can't connect another
> OL> socket to it because the address wasn't bind() properly.
> 
> Okay, I guess that's true.  So, since there isn't a "I'm bound, but
> not listening or connected" flag anywhere, does it suffice to bind()
> any socket that is not connected but that does have a local address?
> Sockets that get a local address via connect() should never transition
> through that state, so I think that should work.  At least for INET,
> any socket that is restored into a connected state is properly hashed
> such that another socket can't bind() to its local address (tested).

Yes, with the exception below (for UNIX)...

> 
> OL> (And if the address was a pathname, but already unlinked, then
> OL> also unlink after the bind, FWIW).
> 
> For path-based UNIX sockets, we don't care about this exclusion,
> right?  As long as we make the socket owner think everything is as it

You are correct in that you don't need it for the scenario you
presented below.

But we do care, because it is necessary to do the unlink() after the
bind(), like you do for listening sockets, for this scenario:

	s = socket()
	bind(s, pathname)
	unlink(pathname)
				<---- checkpoint/restart
	r = socket()
	bind(r, pathname)

The second bind() will succeed on the original system, but will
fail on the restarted system, unless you do that.

> was, that is.  Given that a normal system doesn't fail the bind of b
> in this case:
> 
>   a = socket(AF_UNIX);
>   b = socket(AF_UNIX);
> 
>   bind(a, addr);
>   unlink(addr);
>   bind(b, addr);
> 

BTW, I just looked again at the code, and I'm concerned about:

+		if (!un->linked) {
+			struct sockaddr_un *sun =
+				(struct sockaddr_un *)&h->laddr;
+			ret = sock_unix_unlink(sun->sun_path);
+		}

You need to verify that the address is not abstract, because
I'm not sure what sock_unix_unlink() would do given the empty
string. Usually this is filtered higher in the VFS (getname).

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                         ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-07-07 14:52                           ` Dan Smith
       [not found]                             ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-07-07 14:52 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed
OL> to go beyond that limit. So you need to add that test too.

Okay, fair enough.

OL> And in general, this helps to keep the checks - be it security,
OL> resource limits, or whatever - in one place, instead of having to
OL> duplicate code and, more importantly, risk getting out of sync
OL> with the original checks (e.g., if sock_setsockopt changes).

But sock_setsockopt() will also set the userlocks flag saying that
you've specified the buffer size.  At the point at which I currently
restore the buffers, we've already restored the value specified in the
checkpoint stream, so we'd need to re-reset it as a special case after
the call to sock_setsockopt().  Further, to get the "override" case,
you have to call it with SO_RCVBUFFORCE which fails if you're not
CAP_SYS_ADMIN.  So, do we try force, then if it fails try SO_RCVBUF,
then if it fails actually fail?  Since sock_setsockopt() doubles the
buffer size we get it, do we cut the value we want in half before
passing it in?

Doing all that seems like an abuse of sock_setsockopt() to me when the
alternative is to check CAP_SYS_ADMIN and set the buffer size.

OL> But we do care, because it is necessary to do the unlink() after
OL> the bind(), like you do for listening sockets, for this scenario:

OL> 	s = socket()
OL> 	bind(s, pathname)
OL> 	unlink(pathname)
OL> 				<---- checkpoint/restart
OL> 	r = socket()
OL> 	bind(r, pathname)

OL> The second bind() will succeed on the original system, but will
OL> fail on the restarted system, unless you do that.

Not if we don't actually call bind(s) in the unlinked case.  What I
meant in my previous response is: if we're unlinked, then just fake
the bind actions but don't actually do the bind()..unlink().  We
already went over the case where we might unlink() a valid socket
depending on the order, right?

OL> BTW, I just looked again at the code, and I'm concerned about:

OL> +		if (!un->linked) {
OL> +			struct sockaddr_un *sun =
OL> +				(struct sockaddr_un *)&h->laddr;
OL> +			ret = sock_unix_unlink(sun->sun_path);
OL> +		}

OL> You need to verify that the address is not abstract, because I'm
OL> not sure what sock_unix_unlink() would do given the empty
OL> string. Usually this is filtered higher in the VFS (getname).

Yep, but luckily that's gone with my recent changes to fake the bind()
for unlinked sockets instead of actually doing the unlink() :)

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                             ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-07-07 15:15                               ` Oren Laadan
       [not found]                                 ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
  2009-07-07 15:33                               ` Oren Laadan
  1 sibling, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-07 15:15 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

On Tue, 7 Jul 2009, Dan Smith wrote:

> OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed
> OL> to go beyond that limit. So you need to add that test too.
> 
> Okay, fair enough.
> 
> OL> And in general, this helps to keep the checks - be it security,
> OL> resource limits, or whatever - in one place, instead of having to
> OL> duplicate code and, more importantly, risk getting out of sync
> OL> with the original checks (e.g., if sock_setsockopt changes).
> 
> But sock_setsockopt() will also set the userlocks flag saying that
> you've specified the buffer size.  At the point at which I currently
> restore the buffers, we've already restored the value specified in the
> checkpoint stream, so we'd need to re-reset it as a special case after
> the call to sock_setsockopt().  Further, to get the "override" case,
> you have to call it with SO_RCVBUFFORCE which fails if you're not
> CAP_SYS_ADMIN.  So, do we try force, then if it fails try SO_RCVBUF,
> then if it fails actually fail?  Since sock_setsockopt() doubles the
> buffer size we get it, do we cut the value we want in half before
> passing it in?

What I had in mind is take out the part that checks against the limit
and/or CAP_NET_ADMIN so you could call it from sock_setsockopt() and
from the restore code. But I don't have a concrete suggestions.

> Doing all that seems like an abuse of sock_setsockopt() to me when the
> alternative is to check CAP_SYS_ADMIN and set the buffer size.

I'm ok with that. My only concern was CAP_NET_ADMIN - so the input
should come from network people - is it ok with them to use CAP_SYS_ADMIN
there "instead" ?

> 
> OL> But we do care, because it is necessary to do the unlink() after
> OL> the bind(), like you do for listening sockets, for this scenario:
> 
> OL> 	s = socket()
> OL> 	bind(s, pathname)
> OL> 	unlink(pathname)
> OL> 				<---- checkpoint/restart
> OL> 	r = socket()
> OL> 	bind(r, pathname)
> 
> OL> The second bind() will succeed on the original system, but will
> OL> fail on the restarted system, unless you do that.
> 
> Not if we don't actually call bind(s) in the unlinked case.  What I
> meant in my previous response is: if we're unlinked, then just fake
> the bind actions but don't actually do the bind()..unlink().  We
> already went over the case where we might unlink() a valid socket
> depending on the order, right?
> 

Heh .. I wrote that as a precaution after arguing the a bind() need
genrally be called for all sockets, except when ... [].  I guess we
got lost in the thread.

> OL> BTW, I just looked again at the code, and I'm concerned about:
> 
> OL> +		if (!un->linked) {
> OL> +			struct sockaddr_un *sun =
> OL> +				(struct sockaddr_un *)&h->laddr;
> OL> +			ret = sock_unix_unlink(sun->sun_path);
> OL> +		}
> 
> OL> You need to verify that the address is not abstract, because I'm
> OL> not sure what sock_unix_unlink() would do given the empty
> OL> string. Usually this is filtered higher in the VFS (getname).
> 
> Yep, but luckily that's gone with my recent changes to fake the bind()
> for unlinked sockets instead of actually doing the unlink() :)

:)

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                                 ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
@ 2009-07-07 15:24                                   ` Dan Smith
  0 siblings, 0 replies; 15+ messages in thread
From: Dan Smith @ 2009-07-07 15:24 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> I'm ok with that. My only concern was CAP_NET_ADMIN - so the input
OL> should come from network people - is it ok with them to use
OL> CAP_SYS_ADMIN there "instead" ?

I meant CAP_NET_ADMIN of course :)

I'll merge that check in with the sysctl one and then see what they
have to say about the whole thing.  My guess would be that this
particular part will not be the most controversial bit :)

I'll post v5 here in a few and copy netdev.

Thanks!

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                             ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-07-07 15:15                               ` Oren Laadan
@ 2009-07-07 15:33                               ` Oren Laadan
  2009-07-07 15:36                                 ` Dan Smith
  1 sibling, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-07 15:33 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed
> OL> to go beyond that limit. So you need to add that test too.
> 
> Okay, fair enough.
> 
> OL> And in general, this helps to keep the checks - be it security,
> OL> resource limits, or whatever - in one place, instead of having to
> OL> duplicate code and, more importantly, risk getting out of sync
> OL> with the original checks (e.g., if sock_setsockopt changes).
> 
> But sock_setsockopt() will also set the userlocks flag saying that
> you've specified the buffer size.  At the point at which I currently
> restore the buffers, we've already restored the value specified in the
> checkpoint stream, so we'd need to re-reset it as a special case after
> the call to sock_setsockopt().  Further, to get the "override" case,
> you have to call it with SO_RCVBUFFORCE which fails if you're not
> CAP_SYS_ADMIN.  So, do we try force, then if it fails try SO_RCVBUF,
> then if it fails actually fail?  Since sock_setsockopt() doubles the
> buffer size we get it, do we cut the value we want in half before
> passing it in?
> 
> Doing all that seems like an abuse of sock_setsockopt() to me when the
> alternative is to check CAP_SYS_ADMIN and set the buffer size.
> 
> OL> But we do care, because it is necessary to do the unlink() after
> OL> the bind(), like you do for listening sockets, for this scenario:
> 
> OL> 	s = socket()
> OL> 	bind(s, pathname)
> OL> 	unlink(pathname)
> OL> 				<---- checkpoint/restart
> OL> 	r = socket()
> OL> 	bind(r, pathname)
> 
> OL> The second bind() will succeed on the original system, but will
> OL> fail on the restarted system, unless you do that.
> 
> Not if we don't actually call bind(s) in the unlinked case.  What I
> meant in my previous response is: if we're unlinked, then just fake
> the bind actions but don't actually do the bind()..unlink().  We
> already went over the case where we might unlink() a valid socket
> depending on the order, right?
> 
> OL> BTW, I just looked again at the code, and I'm concerned about:
> 
> OL> +		if (!un->linked) {
> OL> +			struct sockaddr_un *sun =
> OL> +				(struct sockaddr_un *)&h->laddr;
> OL> +			ret = sock_unix_unlink(sun->sun_path);
> OL> +		}
> 
> OL> You need to verify that the address is not abstract, because I'm
> OL> not sure what sock_unix_unlink() would do given the empty
> OL> string. Usually this is filtered higher in the VFS (getname).
> 
> Yep, but luckily that's gone with my recent changes to fake the bind()
> for unlinked sockets instead of actually doing the unlink() :)

Ahh.. and forgot to ask/mention: you do need to call sock_unix_unlink()
before attempting bind(), for the reasons we had discussed earlier
(consider same example as above, checkpoint/restart done before the
unlink(), then restart will otherwise fail).

So you still need to sanitize the file name for that case, no ?

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
  2009-07-07 15:33                               ` Oren Laadan
@ 2009-07-07 15:36                                 ` Dan Smith
       [not found]                                   ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-07-07 15:36 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> Ahh.. and forgot to ask/mention: you do need to call
OL> sock_unix_unlink() before attempting bind(), for the reasons we
OL> had discussed earlier (consider same example as above,
OL> checkpoint/restart done before the unlink(), then restart will
OL> otherwise fail).

I thought we agreed that was userspace's job?  That's why I didn't
unlink() before bind() in this version (4) of the patch either.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                                   ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-07-07 15:48                                     ` Oren Laadan
       [not found]                                       ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Oren Laadan @ 2009-07-07 15:48 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan



Dan Smith wrote:
> OL> Ahh.. and forgot to ask/mention: you do need to call
> OL> sock_unix_unlink() before attempting bind(), for the reasons we
> OL> had discussed earlier (consider same example as above,
> OL> checkpoint/restart done before the unlink(), then restart will
> OL> otherwise fail).
> 
> I thought we agreed that was userspace's job?  That's why I didn't
> unlink() before bind() in this version (4) of the patch either.
> 

I don't recall such a conclusion.

I argued that it it's kernel's job. I suppose we agree that such
pathnames should always be unlinked - so it isn't policy based.
Therefore, I don't see a good reason to have userspace scan the
checkpoint image data just to find such sockets and make them
disappear ...

Oren.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] c/r: Add AF_UNIX support (v3)
       [not found]                                       ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-07-07 16:04                                         ` Dan Smith
  0 siblings, 0 replies; 15+ messages in thread
From: Dan Smith @ 2009-07-07 16:04 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan

OL> I don't recall such a conclusion.

Maybe this will lead you to a pointer in your IRC logs:

  Jun 11 13:51:53 *       orenl in favor of dansmith

OL> I argued that it it's kernel's job. I suppose we agree that such
OL> pathnames should always be unlinked - so it isn't policy based.
OL> Therefore, I don't see a good reason to have userspace scan the
OL> checkpoint image data just to find such sockets and make them
OL> disappear ...

I'm not really interested in arguing about it again, so I'll just put
it in and move on.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2009-07-07 16:04 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-16 15:55 [PATCH] c/r: Add AF_UNIX support (v3) Dan Smith
     [not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-25  2:30   ` Oren Laadan
2009-06-29 17:29     ` Dan Smith
     [not found]       ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-02  9:15         ` Oren Laadan
2009-07-06 18:31           ` Dan Smith
     [not found]             ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-06 19:06               ` Oren Laadan
     [not found]                 ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-06 22:12                   ` Dan Smith
     [not found]                     ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-06 22:46                       ` Oren Laadan
     [not found]                         ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-07 14:52                           ` Dan Smith
     [not found]                             ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-07 15:15                               ` Oren Laadan
     [not found]                                 ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2009-07-07 15:24                                   ` Dan Smith
2009-07-07 15:33                               ` Oren Laadan
2009-07-07 15:36                                 ` Dan Smith
     [not found]                                   ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-07 15:48                                     ` Oren Laadan
     [not found]                                       ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-07 16:04                                         ` Dan Smith

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.