* [PATCH] c/r: Add AF_UNIX support (v3)
@ 2009-06-16 15:55 Dan Smith
[not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 15+ messages in thread
From: Dan Smith @ 2009-06-16 15:55 UTC (permalink / raw)
To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: Alexey Dobriyan
This patch adds basic checkpoint/restart support for AF_UNIX sockets. It
has been tested with a single and multiple processes, and with data inflight
at the time of checkpoint. It supports socketpair()s, path-based, and
abstract sockets.
Changes in v3:
- Move sock_file_checkpoint() above sock_file_restore()
- Change __sock_file_*() functions to do_sock_file_*()
- Adjust some of the struct cr_hdr_socket alignment
- Improve the sock_copy_buffers() algorithm to avoid locking the source
queue for the entire operation
- Fix alignment in the socket header struct(s)
- Move the per-protocol structure (ckpt_hdr_socket_un) out of the
common socket header and read/write it separately
- Fix missing call to sock_cptrst() in restore path
- Break out the socket joining into another function
- Fix failure to restore the socket address thus fixing getname()
- Check the state values on restart
- Fix case of state being TCP_CLOSE, which allows dgram sockets to be
properly connected (if appropriate) to their peer and maintain the
sockaddr for getname() operation
- Fix restoring a listening socket that has been unlink()'d
- Fix checkpointing sockets with an in-flight FD-passing SKB. Fail
with EBUSY.
- Fix checkpointing listening sockets with an unaccepted connection.
Fail with EBUSY.
Changes in v2:
- Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
to be rather common in other uses of skb_copy())
- Move the ckpt_hdr_socket structure definition to linux/socket.h
- Fix whitespace issue
- Move sock_file_checkpoint() to net/socket.c for symmetry
Cc: Oren Laaden <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Cc: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
checkpoint/files.c | 7 +
checkpoint/objhash.c | 27 +++
include/linux/checkpoint_hdr.h | 13 +
include/linux/socket.h | 59 +++++
include/net/sock.h | 9 +
net/Makefile | 2 +
net/checkpoint.c | 493 ++++++++++++++++++++++++++++++++++++++++
net/socket.c | 84 +++++++
8 files changed, 694 insertions(+), 0 deletions(-)
create mode 100644 net/checkpoint.c
diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..bb2cca0 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
/**************************************************************************
@@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_PIPE,
.restore = pipe_file_restore,
},
+ /* socket */
+ {
+ .file_name = "SOCKET",
+ .file_type = CKPT_FILE_SOCKET,
+ .restore = sock_file_restore,
+ },
};
static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..7819e5e 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
#include <linux/ipc_namespace.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
struct ckpt_obj;
struct ckpt_obj_ops;
@@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
return atomic_read(&((struct ipc_namespace *) ptr)->count);
}
+static int obj_sock_grab(void *ptr)
+{
+ sock_hold((struct sock *) ptr);
+ return 0;
+}
+
+static void obj_sock_drop(void *ptr)
+{
+ sock_put((struct sock *) ptr);
+}
+
+static int obj_sock_users(void *ptr)
+{
+ return atomic_read(&((struct sock *) ptr)->sk_refcnt);
+}
+
static struct ckpt_obj_ops ckpt_obj_ops[] = {
/* ignored object */
{
@@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
.checkpoint = checkpoint_bad,
.restore = restore_bad,
},
+ /* sock object */
+ {
+ .obj_name = "SOCKET",
+ .obj_type = CKPT_OBJ_SOCK,
+ .ref_drop = obj_sock_drop,
+ .ref_grab = obj_sock_grab,
+ .ref_users = obj_sock_users,
+ .checkpoint = sock_file_checkpoint,
+ .restore = sock_file_restore,
+ },
};
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index cd427d8..60870df 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,12 @@ enum {
CKPT_HDR_IPC_MSG_MSG,
CKPT_HDR_IPC_SEM,
+ CKPT_HDR_FD_SOCKET = 601,
+ CKPT_HDR_SOCKET,
+ CKPT_HDR_SOCKET_BUFFERS,
+ CKPT_HDR_SOCKET_BUFFER,
+ CKPT_HDR_SOCKET_UN,
+
CKPT_HDR_TAIL = 9001,
CKPT_HDR_ERROR = 9999,
@@ -103,6 +109,7 @@ enum obj_type {
CKPT_OBJ_NS,
CKPT_OBJ_UTS_NS,
CKPT_OBJ_IPC_NS,
+ CKPT_OBJ_SOCK,
CKPT_OBJ_MAX
};
@@ -225,6 +232,7 @@ enum file_type {
CKPT_FILE_IGNORE = 0,
CKPT_FILE_GENERIC,
CKPT_FILE_PIPE,
+ CKPT_FILE_SOCKET,
CKPT_FILE_MAX
};
@@ -248,6 +256,11 @@ struct ckpt_hdr_file_pipe {
__s32 pipe_objref;
} __attribute__((aligned(8)));
+struct ckpt_hdr_file_socket {
+ struct ckpt_hdr_file common;
+ __u16 family;
+} __attribute__((aligned(8)));
+
struct ckpt_hdr_file_pipe_state {
struct ckpt_hdr h;
__s32 pipe_len;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 421afb4..3b5be70 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
#include <linux/uio.h> /* iovec support */
#include <linux/types.h> /* pid_t */
#include <linux/compiler.h> /* __user */
+#include <linux/checkpoint_hdr.h> /* ckpt_hdr */
#ifdef __KERNEL__
# ifdef CONFIG_PROC_FS
@@ -323,5 +324,63 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
#endif
+
+struct ckpt_hdr_socket_un {
+ struct ckpt_hdr h;
+ __u32 this;
+ __u32 peer;
+ __u8 linked;
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket {
+ struct ckpt_hdr h;
+
+ struct ckpt_socket { /* struct socket */
+ __u64 flags;
+ __u8 state;
+ } socket __attribute__ ((aligned(8)));
+
+ struct ckpt_sock_common { /* struct sock_common */
+ __u32 bound_dev_if;
+ __u16 family;
+ __u8 state;
+ __u8 reuse;
+ } sock_common __attribute__ ((aligned(8)));
+
+ struct ckpt_sock { /* struct sock */
+ __u64 rcvlowat;
+ __u64 rcvtimeo;
+ __u64 sndtimeo;
+ __u64 flags;
+ __u64 lingertime;
+
+ __u32 err;
+ __u32 err_soft;
+ __u32 priority;
+ __s32 rcvbuf;
+ __s32 sndbuf;
+ __u16 type;
+ __u16 backlog;
+
+ __u8 protocol;
+ __u8 state;
+ __u8 shutdown;
+ __u8 userlocks;
+ __u8 no_check;
+ } sock __attribute__ ((aligned(8)));
+
+ /* common to all supported families */
+ __u32 laddr_len;
+ __u32 raddr_len;
+ struct sockaddr laddr;
+ struct sockaddr raddr;
+
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket_buffer {
+ struct ckpt_hdr h;
+ __u32 skb_count;
+} __attribute__ ((aligned(8)));
+
#endif /* not kernel and not glibc */
#endif /* _LINUX_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..a8b6af1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max;
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+extern void *sock_file_restore(struct ckpt_ctx *);
+extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
+ struct ckpt_hdr_socket *);
+extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+
#endif /* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index 9e00a55..c226ed1 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
endif
obj-$(CONFIG_WIMAX) += wimax/
+
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
diff --git a/net/checkpoint.c b/net/checkpoint.c
new file mode 100644
index 0000000..fd47485
--- /dev/null
+++ b/net/checkpoint.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright 2009 IBM Corporation
+ *
+ * Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/namei.h>
+
+/* Size of an empty struct sockaddr_un */
+#define UNIX_LEN_EMPTY 2
+
+static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
+{
+ int count = 0;
+ struct sk_buff *skb;
+
+ skb_queue_walk(from, skb) {
+ struct sk_buff *tmp;
+
+ tmp = dev_alloc_skb(skb->len);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&from->lock);
+ skb_morph(tmp, skb);
+ spin_unlock(&from->lock);
+
+ skb_queue_tail(to, tmp);
+ count++;
+ }
+
+ return count;
+}
+
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+ struct sk_buff_head *queue)
+{
+ struct sk_buff *skb;
+ int ret = 0;
+
+ skb_queue_walk(queue, skb) {
+ if (UNIXCB(skb).fp) {
+ ckpt_debug("unsupported fd-passing skb found\n");
+ return -EBUSY;
+ }
+
+ ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
+ CKPT_HDR_SOCKET_BUFFER);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+ struct ckpt_hdr_socket_buffer *h;
+ struct sk_buff_head tmpq;
+ int ret = -ENOMEM;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+ if (!h)
+ goto out;
+
+ skb_queue_head_init(&tmpq);
+
+ h->skb_count = sock_copy_buffers(queue, &tmpq);
+ if (h->skb_count < 0) {
+ ret = h->skb_count;
+ goto out;
+ }
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (!ret)
+ ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+ ckpt_hdr_put(ctx, h);
+ __skb_queue_purge(&tmpq);
+
+ return ret;
+}
+
+static int sock_un_checkpoint(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket *h)
+{
+ struct unix_sock *sk = unix_sk(sock);
+ struct unix_sock *pr = unix_sk(sk->peer);
+ struct ckpt_hdr_socket_un *un;
+ int new;
+ int ret = -ENOMEM;
+
+ if ((sock->sk_state == TCP_LISTEN) &&
+ !skb_queue_empty(&sock->sk_receive_queue)) {
+ ckpt_debug("listening socket has unaccepted peers");
+ return -EBUSY;
+ }
+
+ un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
+ if (!un)
+ goto out;
+
+ un->linked = sk->dentry && (sk->dentry->d_inode->i_nlink > 0);
+
+ un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+ if (un->this < 0)
+ goto out;
+
+ if (sk->peer)
+ un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+ else
+ un->peer = 0;
+
+ if (un->peer < 0) {
+ ret = un->peer;
+ goto out;
+ }
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (ret < 0)
+ goto out;
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
+ out:
+ return ret;
+}
+
+static int sock_cptrst(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket *h,
+ int op)
+{
+ if (sock->sk_socket) {
+ CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
+ CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
+ }
+
+ CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
+ CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
+ CKPT_COPY(op, h->sock_common.family, sock->sk_family);
+
+ CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);
+ CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks);
+ CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
+ CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
+ CKPT_COPY(op, h->sock.err, sock->sk_err);
+ CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);
+ CKPT_COPY(op, h->sock.priority, sock->sk_priority);
+ CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);
+ CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);
+ CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
+ CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);
+ CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
+ CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);
+ CKPT_COPY(op, h->sock.flags, sock->sk_flags);
+ CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);
+ CKPT_COPY(op, h->sock.type, sock->sk_type);
+ CKPT_COPY(op, h->sock.state, sock->sk_state);
+
+ if ((h->socket.state == SS_CONNECTED) &&
+ (h->sock.state != TCP_ESTABLISHED)) {
+ ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
+ h->socket.state, h->sock.state);
+ return -EINVAL;
+ } else if ((h->socket.state == SS_UNCONNECTED) &&
+ (h->sock.state == TCP_ESTABLISHED)) {
+ ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
+ h->socket.state, h->sock.state);
+ return -EINVAL;
+ } else if ((h->sock.state < TCP_ESTABLISHED) ||
+ (h->sock.state >= TCP_MAX_STATES)) {
+ ckpt_debug("sock in invalid state: %i\n", h->sock.state);
+ return -EINVAL;
+ } else if ((h->socket.state < SS_FREE) ||
+ (h->socket.state > SS_DISCONNECTING)) {
+ ckpt_debug("socket in invalid state: %i\n", h->socket.state);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct socket *socket = file->private_data;
+ struct sock *sock = socket->sk;
+ struct ckpt_hdr_socket *h;
+ int ret = 0;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+ if (!h)
+ return -ENOMEM;
+
+ h->laddr_len = sizeof(h->laddr);
+ h->raddr_len = sizeof(h->raddr);
+
+ if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((sock->sk_state != TCP_LISTEN) &&
+ (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
+ if (sock->sk_type != SOCK_DGRAM) {
+ ret = -EINVAL;
+ goto out;
+ }
+ h->raddr_len = 0;
+ }
+
+ sock_cptrst(ctx, sock, h, CKPT_CPT);
+
+ if (sock->sk_family == AF_UNIX) {
+ ret = sock_un_checkpoint(ctx, sock, h);
+ if (ret)
+ goto out;
+ } else {
+ ckpt_debug("unsupported socket type %i\n",
+ sock->sk_family);
+ ret = EINVAL;
+ goto out;
+ }
+
+ ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+ if (ret)
+ goto out;
+
+ ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+ if (ret)
+ goto out;
+
+ /* FIXME: write out-of-order queue for TCP */
+ out:
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+static int sock_read_buffer(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct sk_buff **skb)
+{
+ struct ckpt_hdr *h;
+ int ret = 0;
+ int len;
+
+ h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ len = h->len - sizeof(*h);
+
+ *skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
+ if (*skb == NULL) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ memcpy(skb_put(*skb, len), (char *)(h + 1), len);
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int sock_read_buffers(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct sk_buff_head *queue)
+{
+ struct ckpt_hdr_socket_buffer *h;
+ int ret = 0;
+ int i;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+ if (IS_ERR(h)) {
+ ret = PTR_ERR(h);
+ goto out;
+ }
+
+ for (i = 0; i < h->skb_count; i++) {
+ struct sk_buff *skb = NULL;
+
+ ret = sock_read_buffer(ctx, sock, &skb);
+ if (ret)
+ break;
+
+ skb_queue_tail(queue, skb);
+ }
+ out:
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+static struct unix_address *sock_un_makeaddr(struct sockaddr_un *sun_addr,
+ unsigned len)
+{
+ struct unix_address *addr;
+
+ addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
+ if (!addr)
+ return ERR_PTR(ENOMEM);
+
+ memcpy(addr->name, sun_addr, len);
+ addr->len = len;
+ atomic_set(&addr->refcnt, 1);
+
+ return addr;
+}
+
+static int sock_un_join(struct sock *a,
+ struct sock *b,
+ struct ckpt_hdr_socket *h)
+{
+ struct unix_address *addr;
+
+ sock_hold(a);
+ sock_hold(b);
+
+ unix_sk(a)->peer = b;
+ unix_sk(b)->peer = a;
+
+ a->sk_peercred.pid = task_tgid_vnr(current);
+ current_euid_egid(&a->sk_peercred.uid,
+ &a->sk_peercred.gid);
+
+ b->sk_peercred.pid = task_tgid_vnr(current);
+ current_euid_egid(&b->sk_peercred.uid,
+ &b->sk_peercred.gid);
+
+ if (h->laddr_len == UNIX_LEN_EMPTY)
+ addr = sock_un_makeaddr((struct sockaddr_un *)&h->raddr,
+ h->raddr_len);
+ else
+ addr = sock_un_makeaddr((struct sockaddr_un *)&h->laddr,
+ h->laddr_len);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+
+ atomic_inc(&addr->refcnt); /* Held by both ends */
+ unix_sk(a)->addr = unix_sk(b)->addr = addr;
+
+ return 0;
+}
+
+static int sock_un_unlink(const char *name)
+{
+ struct path spath;
+ struct path ppath;
+ int ret;
+
+ ret = kern_path(name, 0, &spath);
+ if (ret)
+ return ret;
+
+ ret = kern_path(name, LOOKUP_PARENT, &ppath);
+ if (ret)
+ goto out_s;
+
+ if (!spath.dentry) {
+ ckpt_debug("No dentry found for %s\n", name);
+ ret = -ENOENT;
+ goto out_p;
+ }
+
+ if (!ppath.dentry || !ppath.dentry->d_inode) {
+ ckpt_debug("No inode for parent of %s\n", name);
+ ret = -ENOENT;
+ goto out_p;
+ }
+
+ ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
+ out_p:
+ path_put(&ppath);
+ out_s:
+ path_put(&spath);
+
+ return ret;
+}
+
+static int sock_un_restart(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_socket *h,
+ struct socket *socket)
+{
+ struct sock *peer;
+ struct ckpt_hdr_socket_un *un;
+ int ret = 0;
+
+ un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
+ if (IS_ERR(un))
+ return PTR_ERR(un);
+
+ if (un->peer < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
+
+ if ((h->sock.state == TCP_ESTABLISHED) ||
+ (h->sock.state == TCP_CLOSE)) {
+ if (!IS_ERR(peer)) {
+ /* We're last, so join with peer */
+ struct sock *this = socket->sk;
+
+ ret = sock_un_join(this, peer, h);
+ } else {
+ /* We're first, so add our socket and wait for peer */
+ ckpt_obj_insert(ctx, socket->sk, un->this,
+ CKPT_OBJ_SOCK);
+ }
+
+ } else if (h->sock.state == TCP_LISTEN) {
+ ret = socket->ops->bind(socket,
+ (struct sockaddr *)&h->laddr,
+ h->laddr_len);
+ if (ret < 0)
+ goto out;
+
+ ret = socket->ops->listen(socket, h->sock.backlog);
+ if (ret < 0)
+ goto out;
+
+ /* We can unlink this blindly because we just created it
+ * above and would have failed already without proper
+ * permissions
+ */
+ if (!un->linked) {
+ struct sockaddr_un *sun =
+ (struct sockaddr_un *)&h->laddr;
+ ret = sock_un_unlink(sun->sun_path);
+ }
+ } else
+ ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
+ out:
+ ckpt_hdr_put(ctx, un);
+ return ret;
+}
+
+struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_socket *h)
+{
+ struct socket *socket;
+ int ret;
+
+ ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (h->sock_common.family == AF_UNIX) {
+ ret = sock_un_restart(ctx, h, socket);
+ ckpt_debug("sock_un_restart: %i\n", ret);
+ } else {
+ ckpt_debug("unsupported family %i\n", h->sock_common.family);
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ goto out;
+
+ sock_cptrst(ctx, socket->sk, h, CKPT_RST);
+
+ ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
+ if (ret)
+ goto out;
+
+ ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
+ if (ret)
+ goto out;
+ out:
+ if (ret) {
+ sock_release(socket);
+ socket = ERR_PTR(ret);
+ }
+
+ return socket;
+}
+
diff --git a/net/socket.c b/net/socket.c
index 791d71a..be8c562 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,9 @@
#include <net/sock.h>
#include <linux/netfilter.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
@@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = sock_file_checkpoint,
+#endif
};
/*
@@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags)
return fd;
}
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+ struct ckpt_hdr_file_socket *h;
+ int ret;
+ struct file *file = ptr;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ return -ENOMEM;
+
+ h->common.f_type = CKPT_FILE_SOCKET;
+
+ ret = checkpoint_file_common(ctx, file, &h->common);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (ret < 0)
+ goto out;
+
+ ret = do_sock_file_checkpoint(ctx, file);
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+ struct file *file;
+ int err;
+
+ file = get_empty_filp();
+ if (!file)
+ return ERR_PTR(ENOMEM);
+
+ err = sock_attach_fd(socket, file, 0);
+ if (err < 0) {
+ put_filp(file);
+ file = ERR_PTR(err);
+ }
+
+ return file;
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_socket *h = NULL;
+ struct socket *socket = NULL;
+ struct file *file = NULL;
+ int err;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+ if (IS_ERR(h))
+ return h;
+
+ socket = do_sock_file_restore(ctx, h);
+ if (IS_ERR(socket)) {
+ err = PTR_ERR(socket);
+ goto err_put;
+ }
+
+ file = sock_alloc_attach_fd(socket);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_release;
+ }
+
+ ckpt_hdr_put(ctx, h);
+
+ return file;
+
+ err_release:
+ sock_release(socket);
+ err_put:
+ ckpt_hdr_put(ctx, h);
+
+ return ERR_PTR(err);
+}
+
static struct socket *sock_from_file(struct file *file, int *err)
{
if (file->f_op == &socket_file_ops)
--
1.6.0.4
^ permalink raw reply related [flat|nested] 15+ messages in thread[parent not found: <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-06-25 2:30 ` Oren Laadan 2009-06-29 17:29 ` Dan Smith 0 siblings, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-06-25 2:30 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > This patch adds basic checkpoint/restart support for AF_UNIX sockets. It > has been tested with a single and multiple processes, and with data inflight > at the time of checkpoint. It supports socketpair()s, path-based, and > abstract sockets. > > Changes in v3: > - Move sock_file_checkpoint() above sock_file_restore() > - Change __sock_file_*() functions to do_sock_file_*() > - Adjust some of the struct cr_hdr_socket alignment > - Improve the sock_copy_buffers() algorithm to avoid locking the source > queue for the entire operation > - Fix alignment in the socket header struct(s) > - Move the per-protocol structure (ckpt_hdr_socket_un) out of the > common socket header and read/write it separately > - Fix missing call to sock_cptrst() in restore path > - Break out the socket joining into another function > - Fix failure to restore the socket address thus fixing getname() > - Check the state values on restart > - Fix case of state being TCP_CLOSE, which allows dgram sockets to be > properly connected (if appropriate) to their peer and maintain the > sockaddr for getname() operation > - Fix restoring a listening socket that has been unlink()'d > - Fix checkpointing sockets with an in-flight FD-passing SKB. Fail > with EBUSY. > - Fix checkpointing listening sockets with an unaccepted connection. > Fail with EBUSY. > > Changes in v2: > - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems > to be rather common in other uses of skb_copy()) > - Move the ckpt_hdr_socket structure definition to linux/socket.h > - Fix whitespace issue > - Move sock_file_checkpoint() to net/socket.c for symmetry > > Cc: Oren Laaden <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> > Cc: Alexey Dobriyan <adobriyan-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> > Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > --- > checkpoint/files.c | 7 + > checkpoint/objhash.c | 27 +++ > include/linux/checkpoint_hdr.h | 13 + > include/linux/socket.h | 59 +++++ > include/net/sock.h | 9 + > net/Makefile | 2 + > net/checkpoint.c | 493 ++++++++++++++++++++++++++++++++++++++++ > net/socket.c | 84 +++++++ > 8 files changed, 694 insertions(+), 0 deletions(-) > create mode 100644 net/checkpoint.c > > diff --git a/checkpoint/files.c b/checkpoint/files.c > index b264e40..bb2cca0 100644 > --- a/checkpoint/files.c > +++ b/checkpoint/files.c > @@ -21,6 +21,7 @@ > #include <linux/syscalls.h> > #include <linux/checkpoint.h> > #include <linux/checkpoint_hdr.h> > +#include <net/sock.h> > > > /************************************************************************** > @@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = { > .file_type = CKPT_FILE_PIPE, > .restore = pipe_file_restore, > }, > + /* socket */ > + { > + .file_name = "SOCKET", > + .file_type = CKPT_FILE_SOCKET, > + .restore = sock_file_restore, > + }, > }; > > static struct file *do_restore_file(struct ckpt_ctx *ctx) > diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c > index 045a920..7819e5e 100644 > --- a/checkpoint/objhash.c > +++ b/checkpoint/objhash.c > @@ -19,6 +19,7 @@ > #include <linux/ipc_namespace.h> > #include <linux/checkpoint.h> > #include <linux/checkpoint_hdr.h> > +#include <net/sock.h> > > struct ckpt_obj; > struct ckpt_obj_ops; > @@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr) > return atomic_read(&((struct ipc_namespace *) ptr)->count); > } > > +static int obj_sock_grab(void *ptr) > +{ > + sock_hold((struct sock *) ptr); > + return 0; > +} > + > +static void obj_sock_drop(void *ptr) > +{ > + sock_put((struct sock *) ptr); > +} > + > +static int obj_sock_users(void *ptr) > +{ > + return atomic_read(&((struct sock *) ptr)->sk_refcnt); > +} > + > static struct ckpt_obj_ops ckpt_obj_ops[] = { > /* ignored object */ > { > @@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = { > .checkpoint = checkpoint_bad, > .restore = restore_bad, > }, > + /* sock object */ > + { > + .obj_name = "SOCKET", > + .obj_type = CKPT_OBJ_SOCK, > + .ref_drop = obj_sock_drop, > + .ref_grab = obj_sock_grab, > + .ref_users = obj_sock_users, > + .checkpoint = sock_file_checkpoint, > + .restore = sock_file_restore, > + }, > }; > > > diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h > index cd427d8..60870df 100644 > --- a/include/linux/checkpoint_hdr.h > +++ b/include/linux/checkpoint_hdr.h > @@ -76,6 +76,12 @@ enum { > CKPT_HDR_IPC_MSG_MSG, > CKPT_HDR_IPC_SEM, > > + CKPT_HDR_FD_SOCKET = 601, > + CKPT_HDR_SOCKET, > + CKPT_HDR_SOCKET_BUFFERS, > + CKPT_HDR_SOCKET_BUFFER, > + CKPT_HDR_SOCKET_UN, > + > CKPT_HDR_TAIL = 9001, > > CKPT_HDR_ERROR = 9999, > @@ -103,6 +109,7 @@ enum obj_type { > CKPT_OBJ_NS, > CKPT_OBJ_UTS_NS, > CKPT_OBJ_IPC_NS, > + CKPT_OBJ_SOCK, > CKPT_OBJ_MAX > }; > > @@ -225,6 +232,7 @@ enum file_type { > CKPT_FILE_IGNORE = 0, > CKPT_FILE_GENERIC, > CKPT_FILE_PIPE, > + CKPT_FILE_SOCKET, > CKPT_FILE_MAX > }; > > @@ -248,6 +256,11 @@ struct ckpt_hdr_file_pipe { > __s32 pipe_objref; > } __attribute__((aligned(8))); > > +struct ckpt_hdr_file_socket { > + struct ckpt_hdr_file common; > + __u16 family; > +} __attribute__((aligned(8))); > + > struct ckpt_hdr_file_pipe_state { > struct ckpt_hdr h; > __s32 pipe_len; > diff --git a/include/linux/socket.h b/include/linux/socket.h > index 421afb4..3b5be70 100644 > --- a/include/linux/socket.h > +++ b/include/linux/socket.h > @@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage { > #include <linux/uio.h> /* iovec support */ > #include <linux/types.h> /* pid_t */ > #include <linux/compiler.h> /* __user */ > +#include <linux/checkpoint_hdr.h> /* ckpt_hdr */ > > #ifdef __KERNEL__ > # ifdef CONFIG_PROC_FS > @@ -323,5 +324,63 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka > extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); > > #endif > + > +struct ckpt_hdr_socket_un { > + struct ckpt_hdr h; > + __u32 this; > + __u32 peer; > + __u8 linked; > +} __attribute__ ((aligned(8))); > + > +struct ckpt_hdr_socket { > + struct ckpt_hdr h; > + > + struct ckpt_socket { /* struct socket */ > + __u64 flags; > + __u8 state; > + } socket __attribute__ ((aligned(8))); > + > + struct ckpt_sock_common { /* struct sock_common */ > + __u32 bound_dev_if; > + __u16 family; > + __u8 state; > + __u8 reuse; > + } sock_common __attribute__ ((aligned(8))); > + > + struct ckpt_sock { /* struct sock */ > + __u64 rcvlowat; > + __u64 rcvtimeo; > + __u64 sndtimeo; > + __u64 flags; > + __u64 lingertime; > + > + __u32 err; > + __u32 err_soft; > + __u32 priority; > + __s32 rcvbuf; > + __s32 sndbuf; > + __u16 type; > + __u16 backlog; > + > + __u8 protocol; > + __u8 state; > + __u8 shutdown; > + __u8 userlocks; > + __u8 no_check; > + } sock __attribute__ ((aligned(8))); > + > + /* common to all supported families */ > + __u32 laddr_len; > + __u32 raddr_len; > + struct sockaddr laddr; > + struct sockaddr raddr; > + > +} __attribute__ ((aligned(8))); > + > +struct ckpt_hdr_socket_buffer { > + struct ckpt_hdr h; > + __u32 skb_count; > +} __attribute__ ((aligned(8))); > + > #endif /* not kernel and not glibc */ > #endif /* _LINUX_SOCKET_H */ > diff --git a/include/net/sock.h b/include/net/sock.h > index 4bb1ff9..a8b6af1 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max; > extern __u32 sysctl_wmem_default; > extern __u32 sysctl_rmem_default; > > +/* Checkpoint/Restart Functions */ > +struct ckpt_ctx; > +struct ckpt_hdr_socket; > +extern int sock_file_checkpoint(struct ckpt_ctx *, void *); > +extern void *sock_file_restore(struct ckpt_ctx *); > +extern struct socket *do_sock_file_restore(struct ckpt_ctx *, > + struct ckpt_hdr_socket *); > +extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file); > + > #endif /* _SOCK_H */ > diff --git a/net/Makefile b/net/Makefile > index 9e00a55..c226ed1 100644 > --- a/net/Makefile > +++ b/net/Makefile > @@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y) > obj-$(CONFIG_SYSCTL) += sysctl_net.o > endif > obj-$(CONFIG_WIMAX) += wimax/ > + > +obj-$(CONFIG_CHECKPOINT) += checkpoint.o > diff --git a/net/checkpoint.c b/net/checkpoint.c > new file mode 100644 > index 0000000..fd47485 > --- /dev/null > +++ b/net/checkpoint.c > @@ -0,0 +1,493 @@ > +/* > + * Copyright 2009 IBM Corporation > + * > + * Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License as > + * published by the Free Software Foundation, version 2 of the > + * License. > + */ > + > +#include <linux/socket.h> > +#include <linux/mount.h> > +#include <linux/file.h> > + > +#include <net/af_unix.h> > +#include <net/tcp_states.h> > + > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > +#include <linux/namei.h> > + > +/* Size of an empty struct sockaddr_un */ > +#define UNIX_LEN_EMPTY 2 > + > +static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to) > +{ > + int count = 0; > + struct sk_buff *skb; > + > + skb_queue_walk(from, skb) { > + struct sk_buff *tmp; > + > + tmp = dev_alloc_skb(skb->len); > + if (!tmp) > + return -ENOMEM; > + > + spin_lock(&from->lock); > + skb_morph(tmp, skb); > + spin_unlock(&from->lock); > + > + skb_queue_tail(to, tmp); > + count++; > + } > + > + return count; > +} > + > +static int __sock_write_buffers(struct ckpt_ctx *ctx, > + struct sk_buff_head *queue) > +{ > + struct sk_buff *skb; > + int ret = 0; > + > + skb_queue_walk(queue, skb) { > + if (UNIXCB(skb).fp) { > + ckpt_debug("unsupported fd-passing skb found\n"); > + return -EBUSY; > + } > + > + ret = ckpt_write_obj_type(ctx, skb->data, skb->len, > + CKPT_HDR_SOCKET_BUFFER); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue) > +{ > + struct ckpt_hdr_socket_buffer *h; > + struct sk_buff_head tmpq; > + int ret = -ENOMEM; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); > + if (!h) > + goto out; > + > + skb_queue_head_init(&tmpq); > + > + h->skb_count = sock_copy_buffers(queue, &tmpq); > + if (h->skb_count < 0) { > + ret = h->skb_count; > + goto out; > + } > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (!ret) > + ret = __sock_write_buffers(ctx, &tmpq); > + > + out: > + ckpt_hdr_put(ctx, h); > + __skb_queue_purge(&tmpq); > + > + return ret; > +} > + > +static int sock_un_checkpoint(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct ckpt_hdr_socket *h) > +{ > + struct unix_sock *sk = unix_sk(sock); > + struct unix_sock *pr = unix_sk(sk->peer); > + struct ckpt_hdr_socket_un *un; > + int new; > + int ret = -ENOMEM; > + > + if ((sock->sk_state == TCP_LISTEN) && > + !skb_queue_empty(&sock->sk_receive_queue)) { > + ckpt_debug("listening socket has unaccepted peers"); > + return -EBUSY; > + } > + > + un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN); > + if (!un) > + goto out; > + > + un->linked = sk->dentry && (sk->dentry->d_inode->i_nlink > 0); > + > + un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new); > + if (un->this < 0) > + goto out; > + > + if (sk->peer) > + un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new); > + else > + un->peer = 0; > + > + if (un->peer < 0) { > + ret = un->peer; > + goto out; > + } > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (ret < 0) > + goto out; > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un); > + out: > + return ret; > +} > + > +static int sock_cptrst(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct ckpt_hdr_socket *h, > + int op) > +{ > + if (sock->sk_socket) { > + CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags); > + CKPT_COPY(op, h->socket.state, sock->sk_socket->state); > + } > + > + CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse); > + CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if); > + CKPT_COPY(op, h->sock_common.family, sock->sk_family); > + Value of some fields below needs to be verified/checked: > + CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown); Allow on SHUTDOWN_MASK (because in some places the code tests a flag and in others a value - so it could be that {RCV,SND}_SHUTDOWN are true, but != SHUTDOWN_MASK, which is unexpected by the code). Also for future extensibility. > + CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks) Here too, allow possible flags only. > + CKPT_COPY(op, h->sock.no_check, sock->sk_no_check); > + CKPT_COPY(op, h->sock.protocol, sock->sk_protocol); > + CKPT_COPY(op, h->sock.err, sock->sk_err); > + CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft); I'm unsure if we need ensure that an error value is in range... can it do harm to pass an out-of-range value back to the caller ? > + CKPT_COPY(op, h->sock.priority, sock->sk_priority); > + CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat); This is 'int' in struct sock, but assignment may make it negative ? > + CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog); Same. > + CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo); > + CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo); These two too. Perhaps refactor and use sock_set_timeout() ? > + CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf); > + CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf); These should be checked for sign-ness, limited in magnitude (e.g. <= sysctl_wmem_max), checked against a minimum (see setting it in net/core/sock.c), and finally perhaps correlated with a flag against sk_userlocks ? > + CKPT_COPY(op, h->sock.flags, sock->sk_flags); > + CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime); Perhaps here too ? > + CKPT_COPY(op, h->sock.type, sock->sk_type); > + CKPT_COPY(op, h->sock.state, sock->sk_state); > + Do we need the tests below also during checkpoint ? Perhaps place all the sanity tests in a separate routine that is called from here as in: if (op == CKPT_RST) return sock_validate_data(...); else return 0; > + if ((h->socket.state == SS_CONNECTED) && > + (h->sock.state != TCP_ESTABLISHED)) { > + ckpt_debug("socket/sock in inconsistent state: %i/%i\n", > + h->socket.state, h->sock.state); > + return -EINVAL; > + } else if ((h->socket.state == SS_UNCONNECTED) && > + (h->sock.state == TCP_ESTABLISHED)) { > + ckpt_debug("socket/sock in inconsistent state: %i/%i\n", > + h->socket.state, h->sock.state); > + return -EINVAL; > + } else if ((h->sock.state < TCP_ESTABLISHED) || > + (h->sock.state >= TCP_MAX_STATES)) { > + ckpt_debug("sock in invalid state: %i\n", h->sock.state); > + return -EINVAL; > + } else if ((h->socket.state < SS_FREE) || > + (h->socket.state > SS_DISCONNECTING)) { > + ckpt_debug("socket in invalid state: %i\n", h->socket.state); > + return -EINVAL; > + } > + > + return 0; > +} > + > +int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file) > +{ > + struct socket *socket = file->private_data; > + struct sock *sock = socket->sk; > + struct ckpt_hdr_socket *h; > + int ret = 0; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET); > + if (!h) > + return -ENOMEM; > + > + h->laddr_len = sizeof(h->laddr); > + h->raddr_len = sizeof(h->raddr); > + > + if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) { > + ret = -EINVAL; > + goto out; > + } > + > + if ((sock->sk_state != TCP_LISTEN) && > + (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) { > + if (sock->sk_type != SOCK_DGRAM) { > + ret = -EINVAL; > + goto out; > + } I suspect this fails for !SOCK_DGRAM that isn't connected, e.g. newly created. > + h->raddr_len = 0; Hmmm.. do you need to ensure that this value is consistent with the value of ->peer ? (at restart) > + } > + > + sock_cptrst(ctx, sock, h, CKPT_CPT); > + > + if (sock->sk_family == AF_UNIX) { > + ret = sock_un_checkpoint(ctx, sock, h); > + if (ret) > + goto out; > + } else { > + ckpt_debug("unsupported socket type %i\n", > + sock->sk_family); I'd suggest a ckpt_write_err() here (and in other key places) to explain to the user what caused the checkpoint to fail. > + ret = EINVAL; > + goto out; > + } > + > + ret = sock_write_buffers(ctx, &sock->sk_receive_queue); > + if (ret) > + goto out; > + > + ret = sock_write_buffers(ctx, &sock->sk_write_queue); > + if (ret) > + goto out; Any reason to write the buffers of a listening socket ? they only contain not-yet-accepted sockest, which aren't supported anyway. And while there is not support for not-yet-accepted sockets, maybe add an explicit test (if LISTEN and has skb's) and fail with an error, as well as call ckpt_write_err(). > + > + /* FIXME: write out-of-order queue for TCP */ I'd prefer failing if such data exists... > + out: > + ckpt_hdr_put(ctx, h); > + > + return ret; > +} > + > +static int sock_read_buffer(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct sk_buff **skb) > +{ > + struct ckpt_hdr *h; > + int ret = 0; > + int len; > + > + h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER); > + if (IS_ERR(h)) > + return PTR_ERR(h); > + > + len = h->len - sizeof(*h); > + > + *skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret); > + if (*skb == NULL) { > + ret = ENOMEM; > + goto out; > + } > + > + memcpy(skb_put(*skb, len), (char *)(h + 1), len); Can we avoid this memory copy ? Perhaps add _ckpt_read_hdr() and _ckpt_read_payload() helpers ? > + out: > + ckpt_hdr_put(ctx, h); > + return ret; > +} > + > +static int sock_read_buffers(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct sk_buff_head *queue) > +{ > + struct ckpt_hdr_socket_buffer *h; > + int ret = 0; > + int i; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); > + if (IS_ERR(h)) { > + ret = PTR_ERR(h); > + goto out; > + } > + > + for (i = 0; i < h->skb_count; i++) { > + struct sk_buff *skb = NULL; > + > + ret = sock_read_buffer(ctx, sock, &skb); > + if (ret) > + break; > + > + skb_queue_tail(queue, skb); > + } Make sure that total length is within limits, or this is an opening for DoS. > + out: > + ckpt_hdr_put(ctx, h); > + > + return ret; > +} > + > +static struct unix_address *sock_un_makeaddr(struct sockaddr_un *sun_addr, > + unsigned len) > +{ > + struct unix_address *addr; > + > + addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL); > + if (!addr) > + return ERR_PTR(ENOMEM); I checked below, but you don't examine @len before calling this function. So a DoS is possible by exhausting kernel memory with these kmalloc()s. > + > + memcpy(addr->name, sun_addr, len); > + addr->len = len; > + atomic_set(&addr->refcnt, 1); > + > + return addr; > +} > + > +static int sock_un_join(struct sock *a, > + struct sock *b, > + struct ckpt_hdr_socket *h) > +{ > + struct unix_address *addr; > + > + sock_hold(a); > + sock_hold(b); > + > + unix_sk(a)->peer = b; > + unix_sk(b)->peer = a; > + > + a->sk_peercred.pid = task_tgid_vnr(current); > + current_euid_egid(&a->sk_peercred.uid, > + &a->sk_peercred.gid); > + > + b->sk_peercred.pid = task_tgid_vnr(current); > + current_euid_egid(&b->sk_peercred.uid, > + &b->sk_peercred.gid); > + Hmmm... a socket that is connected may also be bind(), so I'm not sure the test below is always correct ? Also, this is the place to verify that the {l,r}addr_len make sense, to prevent the DoS mentioned above. > + if (h->laddr_len == UNIX_LEN_EMPTY) > + addr = sock_un_makeaddr((struct sockaddr_un *)&h->raddr, > + h->raddr_len); > + else > + addr = sock_un_makeaddr((struct sockaddr_un *)&h->laddr, > + h->laddr_len); > + if (IS_ERR(addr)) > + return PTR_ERR(addr); > + > + atomic_inc(&addr->refcnt); /* Held by both ends */ > + unix_sk(a)->addr = unix_sk(b)->addr = addr; A thought: do we want to restore sharing of addresses at restart ? For instance, all sockets accepted from a single listening socket share their local addresses. > + > + return 0; > +} > + > +static int sock_un_unlink(const char *name) > +{ > + struct path spath; > + struct path ppath; > + int ret; > + > + ret = kern_path(name, 0, &spath); > + if (ret) > + return ret; > + > + ret = kern_path(name, LOOKUP_PARENT, &ppath); > + if (ret) > + goto out_s; > + > + if (!spath.dentry) { > + ckpt_debug("No dentry found for %s\n", name); > + ret = -ENOENT; > + goto out_p; > + } > + > + if (!ppath.dentry || !ppath.dentry->d_inode) { > + ckpt_debug("No inode for parent of %s\n", name); > + ret = -ENOENT; > + goto out_p; > + } > + > + ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry); > + out_p: > + path_put(&ppath); > + out_s: > + path_put(&spath); > + > + return ret; > +} > + > +static int sock_un_restart(struct ckpt_ctx *ctx, > + struct ckpt_hdr_socket *h, > + struct socket *socket) > +{ > + struct sock *peer; > + struct ckpt_hdr_socket_un *un; > + int ret = 0; > + > + un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN); > + if (IS_ERR(un)) > + return PTR_ERR(un); > + > + if (un->peer < 0) { > + ret = -EINVAL; > + goto out; > + } > + > + peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK); > + > + if ((h->sock.state == TCP_ESTABLISHED) || > + (h->sock.state == TCP_CLOSE)) { Here below, @peer may hold ERR_PTR(-EINVAL) meaning not found, or ERR_PTR(-ENOMSG) meaning type mismatch and bad input. You need to test for the latter. > + if (!IS_ERR(peer)) { > + /* We're last, so join with peer */ > + struct sock *this = socket->sk; > + > + ret = sock_un_join(this, peer, h); > + } else { So this should be: } else if (PTR_ERR(peer) == -EINVAL) { > + /* We're first, so add our socket and wait for peer */ > + ckpt_obj_insert(ctx, socket->sk, un->this, > + CKPT_OBJ_SOCK); } else { ret = PTR_ERR(peer); > + } Also, need to check ret value of ckpt_obj_insert(). > + > + } else if (h->sock.state == TCP_LISTEN) { > + ret = socket->ops->bind(socket, > + (struct sockaddr *)&h->laddr, > + h->laddr_len); Three comments: 1) Non-listening sockets may also need bind(). Especially SOCK_DGRAM. 2) What happens if s1 is checkpointed before s2 in this scenario: s1 = socket(); bind(s1, addr); unlink(addr); s2 = socket(); bind(s1, addr); I'd suggest that a socket that is did bind() and that corresponding inode was later unlink()ed, will not be re-bind at all. After all, it is unreachable in terms of connect() syscall. 3) If the sockets path is relative, you can't blindly rely on the socket address reported by ->getname(), because the original bind() occured _relative_ to the task's current-working-directory at the time of the socket() syscall. The cwd of the restarting task may differ. Moreover, the resulting path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname() reports the original, relative pathname anyways. So if the address doesn't begin with a '/', you need to also save the cwd and on restart, go there before caling bind(). > + if (ret < 0) > + goto out; > + > + ret = socket->ops->listen(socket, h->sock.backlog); > + if (ret < 0) > + goto out; > + > + /* We can unlink this blindly because we just created it > + * above and would have failed already without proper > + * permissions > + */ > + if (!un->linked) { > + struct sockaddr_un *sun = > + (struct sockaddr_un *)&h->laddr; > + ret = sock_un_unlink(sun->sun_path); Here, too, need to be aware of the cwd of the restarting task. > + } > + } else > + ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state); > + out: > + ckpt_hdr_put(ctx, un); > + return ret; > +} > + > +struct socket *do_sock_file_restore(struct ckpt_ctx *ctx, > + struct ckpt_hdr_socket *h) > +{ > + struct socket *socket; > + int ret; > + > + ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket); > + if (ret < 0) > + return ERR_PTR(ret); > + > + if (h->sock_common.family == AF_UNIX) { > + ret = sock_un_restart(ctx, h, socket); > + ckpt_debug("sock_un_restart: %i\n", ret); > + } else { > + ckpt_debug("unsupported family %i\n", h->sock_common.family); > + ret = -EINVAL; > + } > + > + if (ret) > + goto out; > + > + sock_cptrst(ctx, socket->sk, h, CKPT_RST); Check return value ? > + > + ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue); > + if (ret) > + goto out; > + > + ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue); > + if (ret) > + goto out; Only read buffers if really need to - i.e. not listening socket, not closed socket. > + out: > + if (ret) { > + sock_release(socket); > + socket = ERR_PTR(ret); > + } > + > + return socket; > +} > + > diff --git a/net/socket.c b/net/socket.c > index 791d71a..be8c562 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -96,6 +96,9 @@ > #include <net/sock.h> > #include <linux/netfilter.h> > > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > + > static int sock_no_open(struct inode *irrelevant, struct file *dontcare); > static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, > unsigned long nr_segs, loff_t pos); > @@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = { > .sendpage = sock_sendpage, > .splice_write = generic_splice_sendpage, > .splice_read = sock_splice_read, > +#ifdef CONFIG_CHECKPOINT > + .checkpoint = sock_file_checkpoint, > +#endif > }; > > /* > @@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags) > return fd; > } > > +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr) > +{ > + struct ckpt_hdr_file_socket *h; > + int ret; > + struct file *file = ptr; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE); > + if (!h) > + return -ENOMEM; > + > + h->common.f_type = CKPT_FILE_SOCKET; > + > + ret = checkpoint_file_common(ctx, file, &h->common); > + if (ret < 0) > + goto out; > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (ret < 0) > + goto out; > + > + ret = do_sock_file_checkpoint(ctx, file); > + out: > + ckpt_hdr_put(ctx, h); > + return ret; > +} > + > +static struct file *sock_alloc_attach_fd(struct socket *socket) > +{ > + struct file *file; > + int err; > + > + file = get_empty_filp(); > + if (!file) > + return ERR_PTR(ENOMEM); > + > + err = sock_attach_fd(socket, file, 0); > + if (err < 0) { > + put_filp(file); > + file = ERR_PTR(err); > + } > + > + return file; > +} > + > +void *sock_file_restore(struct ckpt_ctx *ctx) > +{ > + struct ckpt_hdr_socket *h = NULL; > + struct socket *socket = NULL; > + struct file *file = NULL; > + int err; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET); > + if (IS_ERR(h)) > + return h; > + > + socket = do_sock_file_restore(ctx, h); > + if (IS_ERR(socket)) { > + err = PTR_ERR(socket); > + goto err_put; > + } > + > + file = sock_alloc_attach_fd(socket); > + if (IS_ERR(file)) { > + err = PTR_ERR(file); > + goto err_release; > + } > + > + ckpt_hdr_put(ctx, h); > + > + return file; > + > + err_release: > + sock_release(socket); > + err_put: > + ckpt_hdr_put(ctx, h); > + > + return ERR_PTR(err); > +} > + > static struct socket *sock_from_file(struct file *file, int *err) > { > if (file->f_op == &socket_file_ops) Ok ... waiting for next round :) Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] c/r: Add AF_UNIX support (v3) 2009-06-25 2:30 ` Oren Laadan @ 2009-06-29 17:29 ` Dan Smith [not found] ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Dan Smith @ 2009-06-29 17:29 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> Value of some fields below needs to be verified/checked: Yep, okay, I think I've got some sane sanity checks in place now. >> + CKPT_COPY(op, h->sock.err, sock->sk_err); >> + CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft); OL> I'm unsure if we need ensure that an error value is in range... OL> can it do harm to pass an out-of-range value back to the caller ? I don't think any of the kernel code would care, but it's probably best to make sure that it's in a reasonable range. OL> This is 'int' in struct sock, but assignment may make it negative OL> ? Fixed this and others. >> + CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo); >> + CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo); OL> These two too. Perhaps refactor and use sock_set_timeout() ? From the look of sock_set_timeout() I'm not sure it's expected to be called from kernel space. Regardless, I think just setting it (and not corrupting it) should be fine. OL> These should be checked for sign-ness, limited in magnitude (e.g. OL> <= sysctl_wmem_max), checked against a minimum (see setting it in OL> net/core/sock.c), and finally perhaps correlated with a flag OL> against sk_userlocks ? Yep, fixed. >> + if ((sock->sk_state != TCP_LISTEN) && >> + (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) { >> + if (sock->sk_type != SOCK_DGRAM) { >> + ret = -EINVAL; >> + goto out; >> + } OL> I suspect this fails for !SOCK_DGRAM that isn't connected, OL> e.g. newly created. Okay, yeah. I changed the logic to only bail if it's a DGRAM socket and the state is TCP_ESTABLISHED, which I think is probably what we want. Since we bail on the transitional TCP states, I think this should be okay. >> + h->raddr_len = 0; OL> Hmmm.. do you need to ensure that this value is consistent with OL> the value of ->peer ? (at restart) I dunno, why would I? If I hook up the two peers to the same address structure on restart everything should be correct, no? As somewhat of a side question, is there a rule that says that getname() must return the exact same thing each time it is called? OL> I'd suggest a ckpt_write_err() here (and in other key places) to OL> explain to the user what caused the checkpoint to fail. Yep, I hadn't followed the introduction of that, but it's a very welcome addition. I've added calls in the appropriate places. OL> Any reason to write the buffers of a listening socket ? they only OL> contain not-yet-accepted sockest, which aren't supported anyway. Nope, fixed. OL> And while there is not support for not-yet-accepted sockets, maybe OL> add an explicit test (if LISTEN and has skb's) and fail with an OL> error, as well as call ckpt_write_err(). Isn't that just this (from the patch): + if ((sock->sk_state == TCP_LISTEN) && + !skb_queue_empty(&sock->sk_receive_queue)) { + ckpt_debug("listening socket has unaccepted peers"); + return -EBUSY; + } ? I've put a ckpt_write_err() there now... >> + /* FIXME: write out-of-order queue for TCP */ OL> I'd prefer failing if such data exists... Yep, indeed. I've removed the comment here and will augment the INET patch to check for that and fail appropriately. >> + memcpy(skb_put(*skb, len), (char *)(h + 1), len); OL> Can we avoid this memory copy ? Perhaps add _ckpt_read_hdr() and OL> _ckpt_read_payload() helpers ? Okay, yep. >> +static int sock_read_buffers(struct ckpt_ctx *ctx, >> + struct sock *sock, >> + struct sk_buff_head *queue) >> +{ >> + struct ckpt_hdr_socket_buffer *h; >> + int ret = 0; >> + int i; >> + >> + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); >> + if (IS_ERR(h)) { >> + ret = PTR_ERR(h); >> + goto out; >> + } >> + >> + for (i = 0; i < h->skb_count; i++) { >> + struct sk_buff *skb = NULL; >> + >> + ret = sock_read_buffer(ctx, sock, &skb); >> + if (ret) >> + break; >> + >> + skb_queue_tail(queue, skb); >> + } OL> Make sure that total length is within limits, or this is an OL> opening for DoS. You mean the number of buffers? The sock_read_buffer() function will limit each skb to SKB_MAX_ALLOC. What is a sane maximum number of buffers? Maybe we need a sysctl for that? OL> I checked below, but you don't examine @len before calling this OL> function. So a DoS is possible by exhausting kernel memory with OL> these kmalloc()s. Yep, fixed. OL> Hmmm... a socket that is connected may also be bind(), so I'm not OL> sure the test below is always correct ? You mean because it could have been bind()'ed to a different address than it's peer before it called connect()? Wouldn't the connect() have changed the local address? OL> A thought: do we want to restore sharing of addresses at restart ? OL> For instance, all sockets accepted from a single listening socket OL> share their local addresses. For INET it seems important, but I don't think I've seen anything in the UNIX code that cares. However, I can move some of the parent-tracking stuff to this patch in order to hook up the shared addresses if you think it's important. OL> 1) Non-listening sockets may also need bind(). Especially OL> SOCK_DGRAM. Perhaps if the socket is not established and had a local address, we call bind? OL> 2) What happens if s1 is checkpointed before s2 in this scenario: OL> s1 = socket(); bind(s1, addr); unlink(addr); OL> s2 = socket(); bind(s1, addr); OL> I'd suggest that a socket that is did bind() and that OL> corresponding inode was later unlink()ed, will not be re-bind at OL> all. After all, it is unreachable in terms of connect() syscall. I'm not sure what you're saying here... OL> 3) If the sockets path is relative, you can't blindly rely on the OL> socket address reported by ->getname(), because the original OL> bind() occured _relative_ to the task's current-working-directory OL> at the time of the socket() syscall. OL> The cwd of the restarting task may differ. Moreover, the resulting OL> path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname() OL> reports the original, relative pathname anyways. OL> So if the address doesn't begin with a '/', you need to also save OL> the cwd and on restart, go there before caling bind(). But the app could have changed its cwd after binding the socket so the cwd of the checkpointed task isn't what we want, right? Maybe we could do a dentry lookup to find the actual path it's bound to and store that separately from the local and remote addresses? -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-07-02 9:15 ` Oren Laadan 2009-07-06 18:31 ` Dan Smith 0 siblings, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-02 9:15 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > OL> Value of some fields below needs to be verified/checked: > > Yep, okay, I think I've got some sane sanity checks in place now. > >>> + CKPT_COPY(op, h->sock.err, sock->sk_err); >>> + CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft); > > OL> I'm unsure if we need ensure that an error value is in range... > OL> can it do harm to pass an out-of-range value back to the caller ? > > I don't think any of the kernel code would care, but it's probably > best to make sure that it's in a reasonable range. > > OL> This is 'int' in struct sock, but assignment may make it negative > OL> ? > > Fixed this and others. > >>> + CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo); >>> + CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo); > > OL> These two too. Perhaps refactor and use sock_set_timeout() ? > > From the look of sock_set_timeout() I'm not sure it's expected to be > called from kernel space. Regardless, I think just setting it (and > not corrupting it) should be fine. > > OL> These should be checked for sign-ness, limited in magnitude (e.g. > OL> <= sysctl_wmem_max), checked against a minimum (see setting it in > OL> net/core/sock.c), and finally perhaps correlated with a flag > OL> against sk_userlocks ? > > Yep, fixed. > >>> + if ((sock->sk_state != TCP_LISTEN) && >>> + (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) { >>> + if (sock->sk_type != SOCK_DGRAM) { >>> + ret = -EINVAL; >>> + goto out; >>> + } > > OL> I suspect this fails for !SOCK_DGRAM that isn't connected, > OL> e.g. newly created. > > Okay, yeah. I changed the logic to only bail if it's a DGRAM socket > and the state is TCP_ESTABLISHED, which I think is probably what we > want. Since we bail on the transitional TCP states, I think this > should be okay. > >>> + h->raddr_len = 0; > > OL> Hmmm.. do you need to ensure that this value is consistent with > OL> the value of ->peer ? (at restart) > > I dunno, why would I? If I hook up the two peers to the same address > structure on restart everything should be correct, no? Because in the kernel: "->peer != NULL" if-and-only-if "h->raddr is meaningful" so I wonder whether you should detect such input and complain, in case a malicious user does that ? > > As somewhat of a side question, is there a rule that says that > getname() must return the exact same thing each time it is called? I, for one, would expect it to remain the same unless my program re-connected the sockets. Or better: is there a reason why we would want to behave otherwise ? > > OL> I'd suggest a ckpt_write_err() here (and in other key places) to > OL> explain to the user what caused the checkpoint to fail. > > Yep, I hadn't followed the introduction of that, but it's a very > welcome addition. I've added calls in the appropriate places. > > OL> Any reason to write the buffers of a listening socket ? they only > OL> contain not-yet-accepted sockest, which aren't supported anyway. > > Nope, fixed. > > OL> And while there is not support for not-yet-accepted sockets, maybe > OL> add an explicit test (if LISTEN and has skb's) and fail with an > OL> error, as well as call ckpt_write_err(). > > Isn't that just this (from the patch): Yes. I guess I misread the code. > > + if ((sock->sk_state == TCP_LISTEN) && > + !skb_queue_empty(&sock->sk_receive_queue)) { > + ckpt_debug("listening socket has unaccepted peers"); > + return -EBUSY; > + } > > ? I've put a ckpt_write_err() there now... > >>> + /* FIXME: write out-of-order queue for TCP */ > > OL> I'd prefer failing if such data exists... > > Yep, indeed. I've removed the comment here and will augment the INET > patch to check for that and fail appropriately. > >>> + memcpy(skb_put(*skb, len), (char *)(h + 1), len); > > OL> Can we avoid this memory copy ? Perhaps add _ckpt_read_hdr() and > OL> _ckpt_read_payload() helpers ? > > Okay, yep. > >>> +static int sock_read_buffers(struct ckpt_ctx *ctx, >>> + struct sock *sock, >>> + struct sk_buff_head *queue) >>> +{ >>> + struct ckpt_hdr_socket_buffer *h; >>> + int ret = 0; >>> + int i; >>> + >>> + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); >>> + if (IS_ERR(h)) { >>> + ret = PTR_ERR(h); >>> + goto out; >>> + } >>> + >>> + for (i = 0; i < h->skb_count; i++) { >>> + struct sk_buff *skb = NULL; >>> + >>> + ret = sock_read_buffer(ctx, sock, &skb); >>> + if (ret) >>> + break; >>> + >>> + skb_queue_tail(queue, skb); >>> + } > > OL> Make sure that total length is within limits, or this is an > OL> opening for DoS. > > You mean the number of buffers? The sock_read_buffer() function will > limit each skb to SKB_MAX_ALLOC. What is a sane maximum number of > buffers? Maybe we need a sysctl for that? Well, it should not be more than allowed by ->rcvbuf, or some other system limits (which I'm sure exists). At least apply the restrictions put by a regular read() syscall. One exception is when a socket's rcvbuf fills with large amount of data, and then the user calls setsockopt() to reduce the buffer to a lower size... in which case relying on the restored value for ->rcvbuf limit is clearly wrong. > > OL> I checked below, but you don't examine @len before calling this > OL> function. So a DoS is possible by exhausting kernel memory with > OL> these kmalloc()s. > > Yep, fixed. > > OL> Hmmm... a socket that is connected may also be bind(), so I'm not > OL> sure the test below is always correct ? > > You mean because it could have been bind()'ed to a different address > than it's peer before it called connect()? Wouldn't the connect() > have changed the local address? Nope. The connect() syscall will auto-bind only if the socket isn't already bound. And in the case of ad_unix, even that happens only when SOCK_PASSCRED bit is set in sock->flags. > > OL> A thought: do we want to restore sharing of addresses at restart ? > OL> For instance, all sockets accepted from a single listening socket > OL> share their local addresses. > > For INET it seems important, but I don't think I've seen anything in > the UNIX code that cares. However, I can move some of the > parent-tracking stuff to this patch in order to hook up the shared > addresses if you think it's important. Don't think it's important, definitely not for af_unix (the amount of memory spent isn't worth the optimization, I reckon). But INET (or other sockets) may indeed require this. > > OL> 1) Non-listening sockets may also need bind(). Especially > OL> SOCK_DGRAM. > > Perhaps if the socket is not established and had a local address, we > call bind? Even an established socket may have a local address that needs to be saved, to make getsockname() consistent across c/r. > > OL> 2) What happens if s1 is checkpointed before s2 in this scenario: > OL> s1 = socket(); bind(s1, addr); unlink(addr); > OL> s2 = socket(); bind(s1, addr); ^^^^^^^^ (s2, addr) <-- should have been > > OL> I'd suggest that a socket that is did bind() and that > OL> corresponding inode was later unlink()ed, will not be re-bind at > OL> all. After all, it is unreachable in terms of connect() syscall. > > I'm not sure what you're saying here... > Neither was I :p Now with the fixed example, the problem is if s2 is restored first, the restore of s1 will fail, or succeed and make s1 unreachable. However, we know that s1 is no longer reachable through connect() (or sendto() for dgram) - so during restart we want to manually set the local address, without the side-effect of creating an inode in the filesystem, for those sockets that are 1) bound with pathname address, and 2) the corresponding socket inode isn't unlinked. > OL> 3) If the sockets path is relative, you can't blindly rely on the > OL> socket address reported by ->getname(), because the original > OL> bind() occured _relative_ to the task's current-working-directory > OL> at the time of the socket() syscall. > > OL> The cwd of the restarting task may differ. Moreover, the resulting > OL> path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname() > OL> reports the original, relative pathname anyways. > > OL> So if the address doesn't begin with a '/', you need to also save > OL> the cwd and on restart, go there before caling bind(). > > But the app could have changed its cwd after binding the socket so the > cwd of the checkpointed task isn't what we want, right? Maybe we > could do a dentry lookup to find the actual path it's bound to and > store that separately from the local and remote addresses? > I meant "cwd at the time of the bind()". To find it, you can "subtract" the sock-address from the total pathname of the inode (assuming, of course, that the address is relative). Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] c/r: Add AF_UNIX support (v3) 2009-07-02 9:15 ` Oren Laadan @ 2009-07-06 18:31 ` Dan Smith [not found] ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Dan Smith @ 2009-07-06 18:31 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> Because in the kernel: "->peer != NULL" if-and-only-if "h->raddr OL> is meaningful" so I wonder whether you should detect such input OL> and complain, in case a malicious user does that ? Ah, sorry, I misunderstood. Yep, agreed. I was thinking I was covered on that by the use of the regular functions but my makeaddr function would let it slip by, so I've fixed that. OL> Well, it should not be more than allowed by ->rcvbuf, or some OL> other system limits (which I'm sure exists). At least apply the OL> restrictions put by a regular read() syscall. Relying on the stored rcvbuf doesn't help prevent a DoS, right? OL> One exception is when a socket's rcvbuf fills with large amount of OL> data, and then the user calls setsockopt() to reduce the buffer to OL> a lower size... in which case relying on the restored value for OL> -> rcvbuf limit is clearly wrong. Hmm, I don't know why that's the case. Even still, I don't see why we should rely on the stored version of rcvbuf at all, aside from maybe just checking that the socket buffers are smaller than the claimed rcvbuf size of correctness' sake. However, checking against an existing sysctl or other limit (if there is one, still need to look) is obvious a good idea. OL> The connect() syscall will auto-bind only if the socket isn't OL> already bound. And in the case of ad_unix, even that happens only OL> when SOCK_PASSCRED bit is set in sock->flags. Well, I'm not really sure why that makes sense (not arguing that it's true). Perhaps a comment about this potential issue will suffice for now? OL> Even an established socket may have a local address that needs to OL> be saved, to make getsockname() consistent across c/r. ...right, but that's already restored properly, no? My test case shows that established STREAM sockets have the same output of getsockname() before and after restart. OL> s1 = socket(); bind(s1, addr); unlink(addr); OL> s2 = socket(); bind(s1, addr); OL> ^^^^^^^^ OL> (s2, addr) <-- should have been OL> Now with the fixed example, the problem is if s2 is restored first, OL> the restore of s1 will fail, or succeed and make s1 unreachable. I think you meant "restore of s1 will fail, or succeed and make *s2* unreachable". OL> However, we know that s1 is no longer reachable through connect() OL> (or sendto() for dgram) - so during restart we want to manually OL> set the local address, without the side-effect of creating an OL> inode in the filesystem, for those sockets that are 1) bound with OL> pathname address, and 2) the corresponding socket inode isn't OL> unlinked. Okay, done. OL> I meant "cwd at the time of the bind()". To find it, you can OL> "subtract" the sock-address from the total pathname of the inode OL> (assuming, of course, that the address is relative). Right, right. I was thinking that the socket kept an inode which would make it hard to determine the full path, but it's a dentry. I'll work that in. Thanks! -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-07-06 19:06 ` Oren Laadan [not found] ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-06 19:06 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > OL> Because in the kernel: "->peer != NULL" if-and-only-if "h->raddr > OL> is meaningful" so I wonder whether you should detect such input > OL> and complain, in case a malicious user does that ? > > Ah, sorry, I misunderstood. Yep, agreed. I was thinking I was > covered on that by the use of the regular functions but my makeaddr > function would let it slip by, so I've fixed that. > > OL> Well, it should not be more than allowed by ->rcvbuf, or some > OL> other system limits (which I'm sure exists). At least apply the > OL> restrictions put by a regular read() syscall. > > Relying on the stored rcvbuf doesn't help prevent a DoS, right? Yes, of course, I meant a sane ->rcvbuf :) > > OL> One exception is when a socket's rcvbuf fills with large amount of > OL> data, and then the user calls setsockopt() to reduce the buffer to > OL> a lower size... in which case relying on the restored value for > OL> -> rcvbuf limit is clearly wrong. > > Hmm, I don't know why that's the case. Even still, I don't see why we > should rely on the stored version of rcvbuf at all, aside from maybe > just checking that the socket buffers are smaller than the claimed > rcvbuf size of correctness' sake. However, checking against an > existing sysctl or other limit (if there is one, still need to look) > is obvious a good idea. Yes. In fact, if you take a look at sock_setsockopt(), it already has the logic to impose (or not to) a limit on sk_rcvbuf. Maybe do something like: ... sock_setsockopt(sock, ..., saved_rcvbuf_length); sock_read_buffers(..., saved_recvbuf_length) sock_setsockopt(sock, ..., saved_rcvbuf_val); ... So first you set the limit to the actual size you are going to restore, if that succeeds, you restore the buffer (guaranteed to have enough room), then restore the saved sk_rcvbuf value. Permission tests are on the house. (You'll need to slightly refactor sock_setsockopt() for that). > > OL> The connect() syscall will auto-bind only if the socket isn't > OL> already bound. And in the case of ad_unix, even that happens only > OL> when SOCK_PASSCRED bit is set in sock->flags. > > Well, I'm not really sure why that makes sense (not arguing that it's > true). Perhaps a comment about this potential issue will suffice for > now? Comment about what ? I was just describing the behavior ... > > OL> Even an established socket may have a local address that needs to > OL> be saved, to make getsockname() consistent across c/r. > > ...right, but that's already restored properly, no? My test case > shows that established STREAM sockets have the same output of > getsockname() before and after restart. Indeed this test passes. But it is insufficient. Looking at v4 of the patch, for established sockets, for the auto-bind case of connect(), you are mostly correct. But what about -- 1) s = socket(.., SOCK_DGRAM,...); bind(s, any_addr); --> that is, s isn't connected to/from another socket. 2) s = socket(.., SOCK_DGRAM,...); bind(s, any_addr); connect(s, other_addr, ...); --> now s is connected, but after restart you can't connect another socket to it because the address wasn't bind() properly. 3) s = socket(..., SOCK_STREAM,...); bind(s, any_addr); connect(s, other_addr, ...); --> here, too, s is not properly bind(), so if after restart someone tries to: r = socket(..., ..., ...); bind(r, any_addr); the bind() will succeed after restart, but will have failed on the original system where the checkpoint was taken. IOW: when you restore the address of a socket, you need to emulate the operation of bind(), not merely memcpy the address into place. That's the reason why I originally had suggested to imitate the usual sequence of connection sockets to restore them. (And if the address was a pathname, but already unlinked, then also unlink after the bind, FWIW). Oren. > > OL> s1 = socket(); bind(s1, addr); unlink(addr); > OL> s2 = socket(); bind(s1, addr); > OL> ^^^^^^^^ > OL> (s2, addr) <-- should have been > > OL> Now with the fixed example, the problem is if s2 is restored first, > OL> the restore of s1 will fail, or succeed and make s1 unreachable. > > I think you meant "restore of s1 will fail, or succeed and make *s2* > unreachable". > > OL> However, we know that s1 is no longer reachable through connect() > OL> (or sendto() for dgram) - so during restart we want to manually > OL> set the local address, without the side-effect of creating an > OL> inode in the filesystem, for those sockets that are 1) bound with > OL> pathname address, and 2) the corresponding socket inode isn't > OL> unlinked. > > Okay, done. > > OL> I meant "cwd at the time of the bind()". To find it, you can > OL> "subtract" the sock-address from the total pathname of the inode > OL> (assuming, of course, that the address is relative). > > Right, right. I was thinking that the socket kept an inode which > would make it hard to determine the full path, but it's a dentry. > I'll work that in. > > Thanks! > ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-07-06 22:12 ` Dan Smith [not found] ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Dan Smith @ 2009-07-06 22:12 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> (You'll need to slightly refactor sock_setsockopt() for that). It seems much easier and less invasive to just check against sysctl_{r,w}mem_max. I've got that added to the patch now. OL> 2) s = socket(.., SOCK_DGRAM,...); OL> bind(s, any_addr); OL> connect(s, other_addr, ...); --> now s is connected, but after restart you can't connect another OL> socket to it because the address wasn't bind() properly. Okay, I guess that's true. So, since there isn't a "I'm bound, but not listening or connected" flag anywhere, does it suffice to bind() any socket that is not connected but that does have a local address? Sockets that get a local address via connect() should never transition through that state, so I think that should work. At least for INET, any socket that is restored into a connected state is properly hashed such that another socket can't bind() to its local address (tested). OL> (And if the address was a pathname, but already unlinked, then OL> also unlink after the bind, FWIW). For path-based UNIX sockets, we don't care about this exclusion, right? As long as we make the socket owner think everything is as it was, that is. Given that a normal system doesn't fail the bind of b in this case: a = socket(AF_UNIX); b = socket(AF_UNIX); bind(a, addr); unlink(addr); bind(b, addr); -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-07-06 22:46 ` Oren Laadan [not found] ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-06 22:46 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > OL> (You'll need to slightly refactor sock_setsockopt() for that). > > It seems much easier and less invasive to just check against > sysctl_{r,w}mem_max. I've got that added to the patch now. But there are two cases: if you are CAP_NET_ADMIN you are allowed to go beyond that limit. So you need to add that test too. And in general, this helps to keep the checks - be it security, resource limits, or whatever - in one place, instead of having to duplicate code and, more importantly, risk getting out of sync with the original checks (e.g., if sock_setsockopt changes). > > OL> 2) s = socket(.., SOCK_DGRAM,...); > OL> bind(s, any_addr); > OL> connect(s, other_addr, ...); > --> now s is connected, but after restart you can't connect another > OL> socket to it because the address wasn't bind() properly. > > Okay, I guess that's true. So, since there isn't a "I'm bound, but > not listening or connected" flag anywhere, does it suffice to bind() > any socket that is not connected but that does have a local address? > Sockets that get a local address via connect() should never transition > through that state, so I think that should work. At least for INET, > any socket that is restored into a connected state is properly hashed > such that another socket can't bind() to its local address (tested). Yes, with the exception below (for UNIX)... > > OL> (And if the address was a pathname, but already unlinked, then > OL> also unlink after the bind, FWIW). > > For path-based UNIX sockets, we don't care about this exclusion, > right? As long as we make the socket owner think everything is as it You are correct in that you don't need it for the scenario you presented below. But we do care, because it is necessary to do the unlink() after the bind(), like you do for listening sockets, for this scenario: s = socket() bind(s, pathname) unlink(pathname) <---- checkpoint/restart r = socket() bind(r, pathname) The second bind() will succeed on the original system, but will fail on the restarted system, unless you do that. > was, that is. Given that a normal system doesn't fail the bind of b > in this case: > > a = socket(AF_UNIX); > b = socket(AF_UNIX); > > bind(a, addr); > unlink(addr); > bind(b, addr); > BTW, I just looked again at the code, and I'm concerned about: + if (!un->linked) { + struct sockaddr_un *sun = + (struct sockaddr_un *)&h->laddr; + ret = sock_unix_unlink(sun->sun_path); + } You need to verify that the address is not abstract, because I'm not sure what sock_unix_unlink() would do given the empty string. Usually this is filtered higher in the VFS (getname). Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-07-07 14:52 ` Dan Smith [not found] ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Dan Smith @ 2009-07-07 14:52 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed OL> to go beyond that limit. So you need to add that test too. Okay, fair enough. OL> And in general, this helps to keep the checks - be it security, OL> resource limits, or whatever - in one place, instead of having to OL> duplicate code and, more importantly, risk getting out of sync OL> with the original checks (e.g., if sock_setsockopt changes). But sock_setsockopt() will also set the userlocks flag saying that you've specified the buffer size. At the point at which I currently restore the buffers, we've already restored the value specified in the checkpoint stream, so we'd need to re-reset it as a special case after the call to sock_setsockopt(). Further, to get the "override" case, you have to call it with SO_RCVBUFFORCE which fails if you're not CAP_SYS_ADMIN. So, do we try force, then if it fails try SO_RCVBUF, then if it fails actually fail? Since sock_setsockopt() doubles the buffer size we get it, do we cut the value we want in half before passing it in? Doing all that seems like an abuse of sock_setsockopt() to me when the alternative is to check CAP_SYS_ADMIN and set the buffer size. OL> But we do care, because it is necessary to do the unlink() after OL> the bind(), like you do for listening sockets, for this scenario: OL> s = socket() OL> bind(s, pathname) OL> unlink(pathname) OL> <---- checkpoint/restart OL> r = socket() OL> bind(r, pathname) OL> The second bind() will succeed on the original system, but will OL> fail on the restarted system, unless you do that. Not if we don't actually call bind(s) in the unlinked case. What I meant in my previous response is: if we're unlinked, then just fake the bind actions but don't actually do the bind()..unlink(). We already went over the case where we might unlink() a valid socket depending on the order, right? OL> BTW, I just looked again at the code, and I'm concerned about: OL> + if (!un->linked) { OL> + struct sockaddr_un *sun = OL> + (struct sockaddr_un *)&h->laddr; OL> + ret = sock_unix_unlink(sun->sun_path); OL> + } OL> You need to verify that the address is not abstract, because I'm OL> not sure what sock_unix_unlink() would do given the empty OL> string. Usually this is filtered higher in the VFS (getname). Yep, but luckily that's gone with my recent changes to fake the bind() for unlinked sockets instead of actually doing the unlink() :) -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-07-07 15:15 ` Oren Laadan [not found] ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org> 2009-07-07 15:33 ` Oren Laadan 1 sibling, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-07 15:15 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan On Tue, 7 Jul 2009, Dan Smith wrote: > OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed > OL> to go beyond that limit. So you need to add that test too. > > Okay, fair enough. > > OL> And in general, this helps to keep the checks - be it security, > OL> resource limits, or whatever - in one place, instead of having to > OL> duplicate code and, more importantly, risk getting out of sync > OL> with the original checks (e.g., if sock_setsockopt changes). > > But sock_setsockopt() will also set the userlocks flag saying that > you've specified the buffer size. At the point at which I currently > restore the buffers, we've already restored the value specified in the > checkpoint stream, so we'd need to re-reset it as a special case after > the call to sock_setsockopt(). Further, to get the "override" case, > you have to call it with SO_RCVBUFFORCE which fails if you're not > CAP_SYS_ADMIN. So, do we try force, then if it fails try SO_RCVBUF, > then if it fails actually fail? Since sock_setsockopt() doubles the > buffer size we get it, do we cut the value we want in half before > passing it in? What I had in mind is take out the part that checks against the limit and/or CAP_NET_ADMIN so you could call it from sock_setsockopt() and from the restore code. But I don't have a concrete suggestions. > Doing all that seems like an abuse of sock_setsockopt() to me when the > alternative is to check CAP_SYS_ADMIN and set the buffer size. I'm ok with that. My only concern was CAP_NET_ADMIN - so the input should come from network people - is it ok with them to use CAP_SYS_ADMIN there "instead" ? > > OL> But we do care, because it is necessary to do the unlink() after > OL> the bind(), like you do for listening sockets, for this scenario: > > OL> s = socket() > OL> bind(s, pathname) > OL> unlink(pathname) > OL> <---- checkpoint/restart > OL> r = socket() > OL> bind(r, pathname) > > OL> The second bind() will succeed on the original system, but will > OL> fail on the restarted system, unless you do that. > > Not if we don't actually call bind(s) in the unlinked case. What I > meant in my previous response is: if we're unlinked, then just fake > the bind actions but don't actually do the bind()..unlink(). We > already went over the case where we might unlink() a valid socket > depending on the order, right? > Heh .. I wrote that as a precaution after arguing the a bind() need genrally be called for all sockets, except when ... []. I guess we got lost in the thread. > OL> BTW, I just looked again at the code, and I'm concerned about: > > OL> + if (!un->linked) { > OL> + struct sockaddr_un *sun = > OL> + (struct sockaddr_un *)&h->laddr; > OL> + ret = sock_unix_unlink(sun->sun_path); > OL> + } > > OL> You need to verify that the address is not abstract, because I'm > OL> not sure what sock_unix_unlink() would do given the empty > OL> string. Usually this is filtered higher in the VFS (getname). > > Yep, but luckily that's gone with my recent changes to fake the bind() > for unlinked sockets instead of actually doing the unlink() :) :) Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org> @ 2009-07-07 15:24 ` Dan Smith 0 siblings, 0 replies; 15+ messages in thread From: Dan Smith @ 2009-07-07 15:24 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> I'm ok with that. My only concern was CAP_NET_ADMIN - so the input OL> should come from network people - is it ok with them to use OL> CAP_SYS_ADMIN there "instead" ? I meant CAP_NET_ADMIN of course :) I'll merge that check in with the sysctl one and then see what they have to say about the whole thing. My guess would be that this particular part will not be the most controversial bit :) I'll post v5 here in a few and copy netdev. Thanks! -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 2009-07-07 15:15 ` Oren Laadan @ 2009-07-07 15:33 ` Oren Laadan 2009-07-07 15:36 ` Dan Smith 1 sibling, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-07 15:33 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > OL> But there are two cases: if you are CAP_NET_ADMIN you are allowed > OL> to go beyond that limit. So you need to add that test too. > > Okay, fair enough. > > OL> And in general, this helps to keep the checks - be it security, > OL> resource limits, or whatever - in one place, instead of having to > OL> duplicate code and, more importantly, risk getting out of sync > OL> with the original checks (e.g., if sock_setsockopt changes). > > But sock_setsockopt() will also set the userlocks flag saying that > you've specified the buffer size. At the point at which I currently > restore the buffers, we've already restored the value specified in the > checkpoint stream, so we'd need to re-reset it as a special case after > the call to sock_setsockopt(). Further, to get the "override" case, > you have to call it with SO_RCVBUFFORCE which fails if you're not > CAP_SYS_ADMIN. So, do we try force, then if it fails try SO_RCVBUF, > then if it fails actually fail? Since sock_setsockopt() doubles the > buffer size we get it, do we cut the value we want in half before > passing it in? > > Doing all that seems like an abuse of sock_setsockopt() to me when the > alternative is to check CAP_SYS_ADMIN and set the buffer size. > > OL> But we do care, because it is necessary to do the unlink() after > OL> the bind(), like you do for listening sockets, for this scenario: > > OL> s = socket() > OL> bind(s, pathname) > OL> unlink(pathname) > OL> <---- checkpoint/restart > OL> r = socket() > OL> bind(r, pathname) > > OL> The second bind() will succeed on the original system, but will > OL> fail on the restarted system, unless you do that. > > Not if we don't actually call bind(s) in the unlinked case. What I > meant in my previous response is: if we're unlinked, then just fake > the bind actions but don't actually do the bind()..unlink(). We > already went over the case where we might unlink() a valid socket > depending on the order, right? > > OL> BTW, I just looked again at the code, and I'm concerned about: > > OL> + if (!un->linked) { > OL> + struct sockaddr_un *sun = > OL> + (struct sockaddr_un *)&h->laddr; > OL> + ret = sock_unix_unlink(sun->sun_path); > OL> + } > > OL> You need to verify that the address is not abstract, because I'm > OL> not sure what sock_unix_unlink() would do given the empty > OL> string. Usually this is filtered higher in the VFS (getname). > > Yep, but luckily that's gone with my recent changes to fake the bind() > for unlinked sockets instead of actually doing the unlink() :) Ahh.. and forgot to ask/mention: you do need to call sock_unix_unlink() before attempting bind(), for the reasons we had discussed earlier (consider same example as above, checkpoint/restart done before the unlink(), then restart will otherwise fail). So you still need to sanitize the file name for that case, no ? Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] c/r: Add AF_UNIX support (v3) 2009-07-07 15:33 ` Oren Laadan @ 2009-07-07 15:36 ` Dan Smith [not found] ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Dan Smith @ 2009-07-07 15:36 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> Ahh.. and forgot to ask/mention: you do need to call OL> sock_unix_unlink() before attempting bind(), for the reasons we OL> had discussed earlier (consider same example as above, OL> checkpoint/restart done before the unlink(), then restart will OL> otherwise fail). I thought we agreed that was userspace's job? That's why I didn't unlink() before bind() in this version (4) of the patch either. -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-07-07 15:48 ` Oren Laadan [not found] ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 15+ messages in thread From: Oren Laadan @ 2009-07-07 15:48 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan Dan Smith wrote: > OL> Ahh.. and forgot to ask/mention: you do need to call > OL> sock_unix_unlink() before attempting bind(), for the reasons we > OL> had discussed earlier (consider same example as above, > OL> checkpoint/restart done before the unlink(), then restart will > OL> otherwise fail). > > I thought we agreed that was userspace's job? That's why I didn't > unlink() before bind() in this version (4) of the patch either. > I don't recall such a conclusion. I argued that it it's kernel's job. I suppose we agree that such pathnames should always be unlinked - so it isn't policy based. Therefore, I don't see a good reason to have userspace scan the checkpoint image data just to find such sockets and make them disappear ... Oren. ^ permalink raw reply [flat|nested] 15+ messages in thread
[parent not found: <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v3) [not found] ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-07-07 16:04 ` Dan Smith 0 siblings, 0 replies; 15+ messages in thread From: Dan Smith @ 2009-07-07 16:04 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Alexey Dobriyan OL> I don't recall such a conclusion. Maybe this will lead you to a pointer in your IRC logs: Jun 11 13:51:53 * orenl in favor of dansmith OL> I argued that it it's kernel's job. I suppose we agree that such OL> pathnames should always be unlinked - so it isn't policy based. OL> Therefore, I don't see a good reason to have userspace scan the OL> checkpoint image data just to find such sockets and make them OL> disappear ... I'm not really interested in arguing about it again, so I'll just put it in and move on. -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2009-07-07 16:04 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-16 15:55 [PATCH] c/r: Add AF_UNIX support (v3) Dan Smith
[not found] ` <1245167716-28906-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-25 2:30 ` Oren Laadan
2009-06-29 17:29 ` Dan Smith
[not found] ` <871vp3x81s.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-02 9:15 ` Oren Laadan
2009-07-06 18:31 ` Dan Smith
[not found] ` <87y6r1slxz.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-06 19:06 ` Oren Laadan
[not found] ` <4A524B40.4040600-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-06 22:12 ` Dan Smith
[not found] ` <87tz1psbpq.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-06 22:46 ` Oren Laadan
[not found] ` <4A527EB8.4060201-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-07 14:52 ` Dan Smith
[not found] ` <87prccsfyp.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-07 15:15 ` Oren Laadan
[not found] ` <Pine.LNX.4.64.0907071102341.24765-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2009-07-07 15:24 ` Dan Smith
2009-07-07 15:33 ` Oren Laadan
2009-07-07 15:36 ` Dan Smith
[not found] ` <87ab3gsdwx.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-07-07 15:48 ` Oren Laadan
[not found] ` <4A536E44.6050601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-07-07 16:04 ` Dan Smith
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.