* [PATCH] c/r: Add AF_UNIX support (v2)
@ 2009-06-05 18:27 Dan Smith
[not found] ` <1244226434-8167-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Dan Smith @ 2009-06-05 18:27 UTC (permalink / raw)
To: containers-qjLDD68F18O7TbgM5vRIOg
This patch adds basic checkpoint/restart support for AF_UNIX sockets. It
has been tested with a single and multiple processes, and with data inflight
at the time of checkpoint. It supports both socketpair()s and path-based
sockets.
I have an almost-working AF_INET follow-on to this which I can submit after
this is reviewed and tweaked into acceptance.
Changes in v2:
- Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
to be rather common in other uses of skb_copy())
- Move the ckpt_hdr_socket structure definition to linux/socket.h
- Fix whitespace issue
- Move sock_file_checkpoint() to net/socket.c for symmetry
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
checkpoint/files.c | 7 +
checkpoint/objhash.c | 27 +++
include/linux/checkpoint_hdr.h | 12 ++
include/linux/socket.h | 56 +++++++
include/net/sock.h | 9 +
net/Makefile | 2 +
net/socket.c | 84 ++++++++++
net/socket_cr.c | 352 ++++++++++++++++++++++++++++++++++++++++
8 files changed, 549 insertions(+), 0 deletions(-)
create mode 100644 net/socket_cr.c
diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..bb2cca0 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
/**************************************************************************
@@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
.file_type = CKPT_FILE_PIPE,
.restore = pipe_file_restore,
},
+ /* socket */
+ {
+ .file_name = "SOCKET",
+ .file_type = CKPT_FILE_SOCKET,
+ .restore = sock_file_restore,
+ },
};
static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..7819e5e 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
#include <linux/ipc_namespace.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
struct ckpt_obj;
struct ckpt_obj_ops;
@@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
return atomic_read(&((struct ipc_namespace *) ptr)->count);
}
+static int obj_sock_grab(void *ptr)
+{
+ sock_hold((struct sock *) ptr);
+ return 0;
+}
+
+static void obj_sock_drop(void *ptr)
+{
+ sock_put((struct sock *) ptr);
+}
+
+static int obj_sock_users(void *ptr)
+{
+ return atomic_read(&((struct sock *) ptr)->sk_refcnt);
+}
+
static struct ckpt_obj_ops ckpt_obj_ops[] = {
/* ignored object */
{
@@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
.checkpoint = checkpoint_bad,
.restore = restore_bad,
},
+ /* sock object */
+ {
+ .obj_name = "SOCKET",
+ .obj_type = CKPT_OBJ_SOCK,
+ .ref_drop = obj_sock_drop,
+ .ref_grab = obj_sock_grab,
+ .ref_users = obj_sock_users,
+ .checkpoint = sock_file_checkpoint,
+ .restore = sock_file_restore,
+ },
};
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index cd427d8..16f40d1 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,11 @@ enum {
CKPT_HDR_IPC_MSG_MSG,
CKPT_HDR_IPC_SEM,
+ CKPT_HDR_FD_SOCKET = 601,
+ CKPT_HDR_SOCKET,
+ CKPT_HDR_SOCKET_BUFFERS,
+ CKPT_HDR_SOCKET_BUFFER,
+
CKPT_HDR_TAIL = 9001,
CKPT_HDR_ERROR = 9999,
@@ -103,6 +108,7 @@ enum obj_type {
CKPT_OBJ_NS,
CKPT_OBJ_UTS_NS,
CKPT_OBJ_IPC_NS,
+ CKPT_OBJ_SOCK,
CKPT_OBJ_MAX
};
@@ -225,6 +231,7 @@ enum file_type {
CKPT_FILE_IGNORE = 0,
CKPT_FILE_GENERIC,
CKPT_FILE_PIPE,
+ CKPT_FILE_SOCKET,
CKPT_FILE_MAX
};
@@ -248,6 +255,11 @@ struct ckpt_hdr_file_pipe {
__s32 pipe_objref;
} __attribute__((aligned(8)));
+struct ckpt_hdr_file_socket {
+ struct ckpt_hdr_file common;
+ __u16 family;
+} __attribute__((aligned(8)));
+
struct ckpt_hdr_file_pipe_state {
struct ckpt_hdr h;
__s32 pipe_len;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 421afb4..6525ddc 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
#include <linux/uio.h> /* iovec support */
#include <linux/types.h> /* pid_t */
#include <linux/compiler.h> /* __user */
+#include <linux/checkpoint_hdr.h> /* ckpt_hdr */
#ifdef __KERNEL__
# ifdef CONFIG_PROC_FS
@@ -323,5 +324,60 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
#endif
+
+struct ckpt_hdr_socket_un {
+ __u32 this;
+ __u32 peer;
+};
+
+struct ckpt_hdr_socket {
+ struct ckpt_hdr h;
+
+ /* sock_common */
+ __u16 family;
+ __u8 state;
+ __u8 reuse;
+ __u32 bound_dev_if;
+
+ /* sock */
+ __u8 protocol;
+ __u16 type;
+ __u8 sock_state;
+ __u8 shutdown;
+ __u8 userlocks;
+ __u8 no_check;
+ __u32 err;
+ __u32 err_soft;
+ __u32 priority;
+ __u64 rcvlowat;
+ __u64 rcvtimeo;
+ __u64 sndtimeo;
+ __u16 backlog;
+ __s32 rcvbuf;
+ __s32 sndbuf;
+ __u64 flags;
+ __u64 lingertime;
+
+ /* socket */
+ __u64 socket_flags;
+ __u8 socket_state;
+
+ /* common to all supported families */
+ struct sockaddr laddr;
+ struct sockaddr raddr;
+ __u32 laddr_len;
+ __u32 raddr_len;
+
+ union {
+ struct ckpt_hdr_socket_un un;
+ };
+
+} __attribute__ ((aligned(8)));
+
+struct ckpt_hdr_socket_buffer {
+ struct ckpt_hdr h;
+ __u32 skb_count;
+} __attribute__ ((aligned(8)));
+
#endif /* not kernel and not glibc */
#endif /* _LINUX_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..d00598c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max;
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+extern void *sock_file_restore(struct ckpt_ctx *);
+extern struct socket *__sock_file_restore(struct ckpt_ctx *,
+ struct ckpt_hdr_socket *);
+extern int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+
#endif /* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index 9e00a55..1c68a4e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
endif
obj-$(CONFIG_WIMAX) += wimax/
+
+obj-$(CONFIG_CHECKPOINT) += socket_cr.o
diff --git a/net/socket.c b/net/socket.c
index 791d71a..5c36753 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,9 @@
#include <net/sock.h>
#include <linux/netfilter.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
@@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+ .checkpoint = sock_file_checkpoint,
+#endif
};
/*
@@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags)
return fd;
}
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+ struct file *file;
+ int err;
+
+ file = get_empty_filp();
+ if (!file)
+ return ERR_PTR(ENOMEM);
+
+ err = sock_attach_fd(socket, file, 0);
+ if (err < 0) {
+ put_filp(file);
+ file = ERR_PTR(err);
+ }
+
+ return file;
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_socket *h = NULL;
+ struct socket *socket = NULL;
+ struct file *file = NULL;
+ int err;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+ if (IS_ERR(h))
+ return h;
+
+ socket = __sock_file_restore(ctx, h);
+ if (IS_ERR(socket)) {
+ err = PTR_ERR(socket);
+ goto err_put;
+ }
+
+ file = sock_alloc_attach_fd(socket);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_release;
+ }
+
+ ckpt_hdr_put(ctx, h);
+
+ return file;
+
+ err_release:
+ sock_release(socket);
+ err_put:
+ ckpt_hdr_put(ctx, h);
+
+ return ERR_PTR(err);
+}
+
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+ struct ckpt_hdr_file_socket *h;
+ int ret;
+ struct file *file = ptr;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+ if (!h)
+ return -ENOMEM;
+
+ h->common.f_type = CKPT_FILE_SOCKET;
+
+ ret = checkpoint_file_common(ctx, file, &h->common);
+ if (ret < 0)
+ goto out;
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (ret < 0)
+ goto out;
+
+ ret = __sock_file_checkpoint(ctx, file);
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
static struct socket *sock_from_file(struct file *file, int *err)
{
if (file->f_op == &socket_file_ops)
diff --git a/net/socket_cr.c b/net/socket_cr.c
new file mode 100644
index 0000000..448b95f
--- /dev/null
+++ b/net/socket_cr.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2009 IBM Corporation
+ *
+ * Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
+{
+ int count = 0;
+ struct sk_buff *skb;
+
+ spin_lock(&from->lock);
+
+ skb_queue_walk(from, skb) {
+ struct sk_buff *tmp;
+
+ tmp = skb_copy(skb, GFP_ATOMIC);
+ if (!tmp) {
+ count = -ENOMEM;
+ goto out;
+ }
+ skb_queue_tail(to, tmp);
+ count++;
+ }
+ out:
+ spin_unlock(&from->lock);
+
+ return count;
+}
+
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+ struct sk_buff_head *queue)
+{
+ struct sk_buff *skb;
+ int ret = 0;
+
+ skb_queue_walk(queue, skb) {
+ ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
+ CKPT_HDR_SOCKET_BUFFER);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+ struct ckpt_hdr_socket_buffer *h;
+ struct sk_buff_head tmpq;
+ int ret = -ENOMEM;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+ if (!h)
+ goto out;
+
+ skb_queue_head_init(&tmpq);
+
+ h->skb_count = sock_copy_buffers(queue, &tmpq);
+ if (h->skb_count < 0) {
+ ret = h->skb_count;
+ goto out;
+ }
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (!ret)
+ ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+ ckpt_hdr_put(ctx, h);
+ __skb_queue_purge(&tmpq);
+
+ return ret;
+}
+
+static int sock_un_checkpoint(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket *h)
+{
+ struct unix_sock *sk = unix_sk(sock);
+ struct unix_sock *pr = unix_sk(sk->peer);
+ int new;
+ int ret;
+
+ h->un.this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+ if (h->un.this < 0)
+ goto out;
+
+ if (sk->peer)
+ h->un.peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+ else
+ h->un.peer = 0;
+
+ if (h->un.peer < 0) {
+ ret = h->un.peer;
+ goto out;
+ }
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ out:
+ return ret;
+}
+
+static int sock_cptrst(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket *h,
+ int op)
+{
+ if (sock->sk_socket) {
+ CKPT_COPY(op, h->socket_flags, sock->sk_socket->flags);
+ CKPT_COPY(op, h->socket_state, sock->sk_socket->state);
+ }
+
+ CKPT_COPY(op, h->reuse, sock->sk_reuse);
+ CKPT_COPY(op, h->shutdown, sock->sk_shutdown);
+ CKPT_COPY(op, h->userlocks, sock->sk_userlocks);
+ CKPT_COPY(op, h->no_check, sock->sk_no_check);
+ CKPT_COPY(op, h->protocol, sock->sk_protocol);
+ CKPT_COPY(op, h->err, sock->sk_err);
+ CKPT_COPY(op, h->err_soft, sock->sk_err_soft);
+ CKPT_COPY(op, h->priority, sock->sk_priority);
+ CKPT_COPY(op, h->rcvlowat, sock->sk_rcvlowat);
+ CKPT_COPY(op, h->backlog, sock->sk_max_ack_backlog);
+ CKPT_COPY(op, h->rcvtimeo, sock->sk_rcvtimeo);
+ CKPT_COPY(op, h->sndtimeo, sock->sk_sndtimeo);
+ CKPT_COPY(op, h->rcvbuf, sock->sk_rcvbuf);
+ CKPT_COPY(op, h->sndbuf, sock->sk_sndbuf);
+ CKPT_COPY(op, h->bound_dev_if, sock->sk_bound_dev_if);
+ CKPT_COPY(op, h->flags, sock->sk_flags);
+ CKPT_COPY(op, h->lingertime, sock->sk_lingertime);
+
+ return 0;
+}
+
+int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+ struct socket *socket = file->private_data;
+ struct sock *sock = socket->sk;
+ struct ckpt_hdr_socket *h;
+ int ret = 0;
+
+ h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+ if (!h)
+ return -ENOMEM;
+
+ h->family = sock->sk_family;
+ h->state = socket->state;
+ h->sock_state = sock->sk_state;
+ h->reuse = sock->sk_reuse;
+ h->type = sock->sk_type;
+ h->protocol = sock->sk_protocol;
+
+ h->laddr_len = sizeof(h->laddr);
+ h->raddr_len = sizeof(h->raddr);
+
+ if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((h->sock_state != TCP_LISTEN) &&
+ (h->type != SOCK_DGRAM) &&
+ (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sock_cptrst(ctx, sock, h, CKPT_CPT);
+
+ if (h->family == AF_UNIX) {
+ ret = sock_un_checkpoint(ctx, sock, h);
+ if (ret)
+ goto out;
+ } else {
+ ckpt_debug("unsupported socket type %i\n", h->family);
+ ret = EINVAL;
+ goto out;
+ }
+
+ ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+ if (ret)
+ goto out;
+
+ ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+ if (ret)
+ goto out;
+
+ /* FIXME: write out-of-order queue for TCP */
+ out:
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+static int sock_read_buffer(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct sk_buff **skb)
+{
+ struct ckpt_hdr *h;
+ int ret = 0;
+ int len;
+
+ h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ len = h->len - sizeof(*h);
+
+ *skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
+ if (*skb == NULL) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ memcpy(skb_put(*skb, len), (char *)(h + 1), len);
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int sock_read_buffers(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct sk_buff_head *queue)
+{
+ struct ckpt_hdr_socket_buffer *h;
+ int ret = 0;
+ int i;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+ if (IS_ERR(h)) {
+ ret = PTR_ERR(h);
+ goto out;
+ }
+
+ for (i = 0; i < h->skb_count; i++) {
+ struct sk_buff *skb = NULL;
+
+ ret = sock_read_buffer(ctx, sock, &skb);
+ if (ret)
+ break;
+
+ skb_queue_tail(queue, skb);
+ }
+ out:
+ ckpt_hdr_put(ctx, h);
+
+ return ret;
+}
+
+static int sock_un_restart(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_socket *h,
+ struct socket *socket)
+{
+ struct sock *peer;
+ int ret = 0;
+
+ if (h->sock_state == TCP_ESTABLISHED) {
+ peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK);
+ if (peer && !IS_ERR(peer)) {
+ /* We're last, so join with peer */
+ struct sock *this = socket->sk;
+
+ sock_hold(this);
+ sock_hold(peer);
+
+ unix_sk(this)->peer = peer;
+ unix_sk(peer)->peer = this;
+
+ this->sk_peercred.pid = task_tgid_vnr(current);
+ current_euid_egid(&this->sk_peercred.uid,
+ &this->sk_peercred.gid);
+
+ peer->sk_peercred.pid = task_tgid_vnr(current);
+ current_euid_egid(&peer->sk_peercred.uid,
+ &peer->sk_peercred.gid);
+ } else {
+ /* We're first, so add our socket and wait for peer */
+ ckpt_obj_insert(ctx, socket->sk, h->un.this,
+ CKPT_OBJ_SOCK);
+ }
+
+ } else if (h->sock_state == TCP_LISTEN) {
+ ret = socket->ops->bind(socket,
+ (struct sockaddr *)&h->laddr,
+ h->laddr_len);
+ if (ret < 0)
+ goto out;
+
+ ret = socket->ops->listen(socket, h->backlog);
+ if (ret < 0)
+ goto out;
+ } else
+ ckpt_debug("unsupported UNIX socket state %i\n", h->state);
+
+ socket->state = h->state;
+ socket->sk->sk_state = h->sock_state;
+ out:
+ return ret;
+}
+
+struct socket *__sock_file_restore(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_socket *h)
+{
+ struct socket *socket;
+ int ret;
+
+ ret = sock_create(h->family, h->type, 0, &socket);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (h->family == AF_UNIX) {
+ ret = sock_un_restart(ctx, h, socket);
+ ckpt_debug("sock_un_restart: %i\n", ret);
+ } else {
+ ckpt_debug("unsupported family %i\n", h->family);
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ goto out;
+
+ ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
+ if (ret)
+ goto out;
+
+ ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
+ if (ret)
+ goto out;
+ out:
+ if (ret) {
+ sock_release(socket);
+ socket = ERR_PTR(ret);
+ }
+
+ return socket;
+}
+
--
1.6.0.4
^ permalink raw reply related [flat|nested] 5+ messages in thread[parent not found: <1244226434-8167-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v2) [not found] ` <1244226434-8167-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2009-06-08 6:02 ` Oren Laadan [not found] ` <4A2CA97D.1060002-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 5+ messages in thread From: Oren Laadan @ 2009-06-08 6:02 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Hi, Dan Smith wrote: > This patch adds basic checkpoint/restart support for AF_UNIX sockets. It > has been tested with a single and multiple processes, and with data inflight > at the time of checkpoint. It supports both socketpair()s and path-based > sockets. > > I have an almost-working AF_INET follow-on to this which I can submit after > this is reviewed and tweaked into acceptance. > > Changes in v2: > - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems > to be rather common in other uses of skb_copy()) > - Move the ckpt_hdr_socket structure definition to linux/socket.h > - Fix whitespace issue > - Move sock_file_checkpoint() to net/socket.c for symmetry > > Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> [...] > diff --git a/include/linux/socket.h b/include/linux/socket.h > index 421afb4..6525ddc 100644 > --- a/include/linux/socket.h > +++ b/include/linux/socket.h > @@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage { > #include <linux/uio.h> /* iovec support */ > #include <linux/types.h> /* pid_t */ > #include <linux/compiler.h> /* __user */ > +#include <linux/checkpoint_hdr.h> /* ckpt_hdr */ > > #ifdef __KERNEL__ > # ifdef CONFIG_PROC_FS > @@ -323,5 +324,60 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka > extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); > > #endif > + > +struct ckpt_hdr_socket_un { > + __u32 this; > + __u32 peer; > +}; Nit: add __attribute__ ((aligned(8))); > + > +struct ckpt_hdr_socket { > + struct ckpt_hdr h; > + > + /* sock_common */ > + __u16 family; > + __u8 state; > + __u8 reuse; > + __u32 bound_dev_if; > + > + /* sock */ > + __u8 protocol; > + __u16 type; Data needs to be aligned (at least for some archs) > + __u8 sock_state; > + __u8 shutdown; > + __u8 userlocks; > + __u8 no_check; > + __u32 err; Here too ... > + __u32 err_soft; > + __u32 priority; > + __u64 rcvlowat; > + __u64 rcvtimeo; > + __u64 sndtimeo; > + __u16 backlog; > + __s32 rcvbuf; > + __s32 sndbuf; > + __u64 flags; > + __u64 lingertime; > + > + /* socket */ > + __u64 socket_flags; > + __u8 socket_state; Here too ... > + > + /* common to all supported families */ > + struct sockaddr laddr; > + struct sockaddr raddr; > + __u32 laddr_len; > + __u32 raddr_len; > + > + union { > + struct ckpt_hdr_socket_un un; > + }; Perhaps use a separate header/object for each socket type, instead of a union ? Unix domain sockets take 64 bytes, I expect inet sockets to require much more. > + > +} __attribute__ ((aligned(8))); > + > +struct ckpt_hdr_socket_buffer { > + struct ckpt_hdr h; > + __u32 skb_count; > +} __attribute__ ((aligned(8))); > + Do we need the skb_count ? How about using the total data length instead ? > #endif /* not kernel and not glibc */ > #endif /* _LINUX_SOCKET_H */ > diff --git a/include/net/sock.h b/include/net/sock.h > index 4bb1ff9..d00598c 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max; > extern __u32 sysctl_wmem_default; > extern __u32 sysctl_rmem_default; > > +/* Checkpoint/Restart Functions */ > +struct ckpt_ctx; > +struct ckpt_hdr_socket; > +extern int sock_file_checkpoint(struct ckpt_ctx *, void *); > +extern void *sock_file_restore(struct ckpt_ctx *); > +extern struct socket *__sock_file_restore(struct ckpt_ctx *, > + struct ckpt_hdr_socket *); > +extern int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file); > + > #endif /* _SOCK_H */ > diff --git a/net/Makefile b/net/Makefile > index 9e00a55..1c68a4e 100644 > --- a/net/Makefile > +++ b/net/Makefile > @@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y) > obj-$(CONFIG_SYSCTL) += sysctl_net.o > endif > obj-$(CONFIG_WIMAX) += wimax/ > + > +obj-$(CONFIG_CHECKPOINT) += socket_cr.o > diff --git a/net/socket.c b/net/socket.c > index 791d71a..5c36753 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -96,6 +96,9 @@ > #include <net/sock.h> > #include <linux/netfilter.h> > > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > + > static int sock_no_open(struct inode *irrelevant, struct file *dontcare); > static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, > unsigned long nr_segs, loff_t pos); > @@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = { > .sendpage = sock_sendpage, > .splice_write = generic_splice_sendpage, > .splice_read = sock_splice_read, > +#ifdef CONFIG_CHECKPOINT > + .checkpoint = sock_file_checkpoint, > +#endif > }; > > /* > @@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags) > return fd; > } > > +static struct file *sock_alloc_attach_fd(struct socket *socket) > +{ > + struct file *file; > + int err; > + > + file = get_empty_filp(); > + if (!file) > + return ERR_PTR(ENOMEM); > + > + err = sock_attach_fd(socket, file, 0); > + if (err < 0) { > + put_filp(file); > + file = ERR_PTR(err); > + } > + > + return file; > +} > + > +void *sock_file_restore(struct ckpt_ctx *ctx) > +{ > + struct ckpt_hdr_socket *h = NULL; > + struct socket *socket = NULL; > + struct file *file = NULL; > + int err; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET); > + if (IS_ERR(h)) > + return h; > + > + socket = __sock_file_restore(ctx, h); Nit: like elsewhere, how about "do_sock_file_restore" ? > + if (IS_ERR(socket)) { > + err = PTR_ERR(socket); > + goto err_put; > + } > + > + file = sock_alloc_attach_fd(socket); > + if (IS_ERR(file)) { > + err = PTR_ERR(file); > + goto err_release; > + } > + > + ckpt_hdr_put(ctx, h); > + > + return file; > + > + err_release: > + sock_release(socket); > + err_put: > + ckpt_hdr_put(ctx, h); > + > + return ERR_PTR(err); > +} > + > +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr) Heh.. I feel more comfortable with checkpoint code appearing before restart: it helps to already have seen how checkpoint works when going through the restore. > +{ > + struct ckpt_hdr_file_socket *h; > + int ret; > + struct file *file = ptr; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE); > + if (!h) > + return -ENOMEM; > + > + h->common.f_type = CKPT_FILE_SOCKET; > + > + ret = checkpoint_file_common(ctx, file, &h->common); > + if (ret < 0) > + goto out; > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (ret < 0) > + goto out; > + > + ret = __sock_file_checkpoint(ctx, file); Nit: here too, perhaps "do_sock_file_checkpoint" ? > + out: > + ckpt_hdr_put(ctx, h); > + return ret; > +} > + > static struct socket *sock_from_file(struct file *file, int *err) > { > if (file->f_op == &socket_file_ops) > diff --git a/net/socket_cr.c b/net/socket_cr.c > new file mode 100644 > index 0000000..448b95f > --- /dev/null > +++ b/net/socket_cr.c People objected to "cr" prefix/ending in file- and function- names. I suggest "net/checkpoint.c". Also, is there a reason to split checkpoint_... and restore_... code between two files ? I'd put everything in net/checkpoint.c > @@ -0,0 +1,352 @@ > +/* > + * Copyright 2009 IBM Corporation > + * > + * Author: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License as > + * published by the Free Software Foundation, version 2 of the > + * License. > + */ > + > +#include <linux/socket.h> > +#include <linux/mount.h> > +#include <linux/file.h> > + > +#include <net/af_unix.h> > +#include <net/tcp_states.h> > + > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > + > +static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to) > +{ > + int count = 0; > + struct sk_buff *skb; > + > + spin_lock(&from->lock); > + > + skb_queue_walk(from, skb) { > + struct sk_buff *tmp; > + > + tmp = skb_copy(skb, GFP_ATOMIC); I believe you can use skb_clone() here, and avoid the actual memory allocation and data copy. I think the GFP_ATOMIC in this case isn't that bad. But you could get mitigate (at least some of) it by pre-allocating skb's before grabbing the lock and then use skb_morph(). > + if (!tmp) { > + count = -ENOMEM; > + goto out; > + } > + skb_queue_tail(to, tmp); > + count++; > + } > + out: > + spin_unlock(&from->lock); > + > + return count; > +} What about passed file-descriptors ? Either handle them or fail with an error (e.g. -EBUSY) when found. > + > +static int __sock_write_buffers(struct ckpt_ctx *ctx, > + struct sk_buff_head *queue) > +{ > + struct sk_buff *skb; > + int ret = 0; > + > + skb_queue_walk(queue, skb) { > + ret = ckpt_write_obj_type(ctx, skb->data, skb->len, > + CKPT_HDR_SOCKET_BUFFER); Seems like you intend for the code to be generic and useful for not only for unix domain sockets. Skb's may point to fragmented data as well. If not supported, then the code should test for it and report an error in that case. > + if (ret) > + return ret; > + } > + > + return 0; > +} What's the advantage of writing skb's instead of one long stream of data ? In fact, perhaps you can use splice() for that data transfer (similar to pipe implementation) - will also take care of the fragmentation eventually ... > +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue) > +{ > + struct ckpt_hdr_socket_buffer *h; > + struct sk_buff_head tmpq; > + int ret = -ENOMEM; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); > + if (!h) > + goto out; > + > + skb_queue_head_init(&tmpq); > + > + h->skb_count = sock_copy_buffers(queue, &tmpq); > + if (h->skb_count < 0) { > + ret = h->skb_count; > + goto out; > + } > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (!ret) > + ret = __sock_write_buffers(ctx, &tmpq); > + > + out: > + ckpt_hdr_put(ctx, h); > + __skb_queue_purge(&tmpq); > + > + return ret; > +} > + > +static int sock_un_checkpoint(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct ckpt_hdr_socket *h) > +{ Later we'll have sock_inet_... and then others, and this file will grow indefinitely. Perhaps place family specific logic (sock_un_checkpoint, sock_un_restore) in the corresponding subdir (unix/checkpoint.c) ? > + struct unix_sock *sk = unix_sk(sock); > + struct unix_sock *pr = unix_sk(sk->peer); > + int new; > + int ret; > + > + h->un.this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new); > + if (h->un.this < 0) > + goto out; > + > + if (sk->peer) > + h->un.peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new); > + else > + h->un.peer = 0; > + > + if (h->un.peer < 0) { > + ret = h->un.peer; > + goto out; > + } For TCP_LISTEN sockets, there may be sockets connected to it, that have not been accepted yet. In the case of af_unix, they are embedded in skb's ... > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + out: > + return ret; > +} > + > +static int sock_cptrst(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct ckpt_hdr_socket *h, > + int op) > +{ > + if (sock->sk_socket) { > + CKPT_COPY(op, h->socket_flags, sock->sk_socket->flags); > + CKPT_COPY(op, h->socket_state, sock->sk_socket->state); > + } > + > + CKPT_COPY(op, h->reuse, sock->sk_reuse); > + CKPT_COPY(op, h->shutdown, sock->sk_shutdown); > + CKPT_COPY(op, h->userlocks, sock->sk_userlocks); > + CKPT_COPY(op, h->no_check, sock->sk_no_check); > + CKPT_COPY(op, h->protocol, sock->sk_protocol); > + CKPT_COPY(op, h->err, sock->sk_err); > + CKPT_COPY(op, h->err_soft, sock->sk_err_soft); > + CKPT_COPY(op, h->priority, sock->sk_priority); > + CKPT_COPY(op, h->rcvlowat, sock->sk_rcvlowat); > + CKPT_COPY(op, h->backlog, sock->sk_max_ack_backlog); > + CKPT_COPY(op, h->rcvtimeo, sock->sk_rcvtimeo); > + CKPT_COPY(op, h->sndtimeo, sock->sk_sndtimeo); > + CKPT_COPY(op, h->rcvbuf, sock->sk_rcvbuf); > + CKPT_COPY(op, h->sndbuf, sock->sk_sndbuf); > + CKPT_COPY(op, h->bound_dev_if, sock->sk_bound_dev_if); > + CKPT_COPY(op, h->flags, sock->sk_flags); > + CKPT_COPY(op, h->lingertime, sock->sk_lingertime); > + > + return 0; > +} > + > +int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file) > +{ > + struct socket *socket = file->private_data; > + struct sock *sock = socket->sk; > + struct ckpt_hdr_socket *h; > + int ret = 0; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET); > + if (!h) > + return -ENOMEM; > + > + h->family = sock->sk_family; > + h->state = socket->state; > + h->sock_state = sock->sk_state; > + h->reuse = sock->sk_reuse; > + h->type = sock->sk_type; > + h->protocol = sock->sk_protocol; > + > + h->laddr_len = sizeof(h->laddr); > + h->raddr_len = sizeof(h->raddr); > + > + if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) { > + ret = -EINVAL; > + goto out; > + } > + > + if ((h->sock_state != TCP_LISTEN) && > + (h->type != SOCK_DGRAM) && > + (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) { > + ret = -EINVAL; > + goto out; > + } This deserves a comment about SOCK_STREAM and SOCK_SEQPACKET being connection based. (For unix domain sockets you don't really need the remote address). SOCK_DGRAM may also be "connected" in a way that requires saving the remote address. > + > + sock_cptrst(ctx, sock, h, CKPT_CPT); > + > + if (h->family == AF_UNIX) { > + ret = sock_un_checkpoint(ctx, sock, h); > + if (ret) > + goto out; > + } else { > + ckpt_debug("unsupported socket type %i\n", h->family); > + ret = EINVAL; > + goto out; > + } > + > + ret = sock_write_buffers(ctx, &sock->sk_receive_queue); > + if (ret) > + goto out; > + > + ret = sock_write_buffers(ctx, &sock->sk_write_queue); > + if (ret) > + goto out; > + > + /* FIXME: write out-of-order queue for TCP */ > + out: > + ckpt_hdr_put(ctx, h); > + > + return ret; > +} > + > +static int sock_read_buffer(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct sk_buff **skb) > +{ > + struct ckpt_hdr *h; > + int ret = 0; > + int len; > + > + h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER); > + if (IS_ERR(h)) > + return PTR_ERR(h); > + > + len = h->len - sizeof(*h); > + > + *skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret); > + if (*skb == NULL) { > + ret = ENOMEM; > + goto out; > + } > + > + memcpy(skb_put(*skb, len), (char *)(h + 1), len); > + out: > + ckpt_hdr_put(ctx, h); > + return ret; > +} > + > +static int sock_read_buffers(struct ckpt_ctx *ctx, > + struct sock *sock, > + struct sk_buff_head *queue) > +{ > + struct ckpt_hdr_socket_buffer *h; > + int ret = 0; > + int i; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS); > + if (IS_ERR(h)) { > + ret = PTR_ERR(h); > + goto out; > + } > + > + for (i = 0; i < h->skb_count; i++) { > + struct sk_buff *skb = NULL; > + > + ret = sock_read_buffer(ctx, sock, &skb); > + if (ret) > + break; > + > + skb_queue_tail(queue, skb); > + } > + out: > + ckpt_hdr_put(ctx, h); > + > + return ret; > +} > + > +static int sock_un_restart(struct ckpt_ctx *ctx, > + struct ckpt_hdr_socket *h, > + struct socket *socket) > +{ > + struct sock *peer; > + int ret = 0; if (h->un.peer <= 0) return -EINVAL; > + > + if (h->sock_state == TCP_ESTABLISHED) { > + peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK); > + if (peer && !IS_ERR(peer)) { ckpt_obj_fetch() returns either a valid pointer or an error: if (!IS_ERR(peer)) > + /* We're last, so join with peer */ > + struct sock *this = socket->sk; > + > + sock_hold(this); > + sock_hold(peer); > + > + unix_sk(this)->peer = peer; > + unix_sk(peer)->peer = this; > + > + this->sk_peercred.pid = task_tgid_vnr(current); > + current_euid_egid(&this->sk_peercred.uid, > + &this->sk_peercred.gid); > + > + peer->sk_peercred.pid = task_tgid_vnr(current); > + current_euid_egid(&peer->sk_peercred.uid, > + &peer->sk_peercred.gid); > + } else { else if (PTR_ERR(peer) == -EINVAL) { > + /* We're first, so add our socket and wait for peer */ > + ckpt_obj_insert(ctx, socket->sk, h->un.this, > + CKPT_OBJ_SOCK); > + } else { ret = PTE_ERR(peer); goto out; } > + > + } else if (h->sock_state == TCP_LISTEN) { > + ret = socket->ops->bind(socket, > + (struct sockaddr *)&h->laddr, > + h->laddr_len); bind() may also be required for TCP_ESTABLISHED or TCP_CLOSED (and of course, to SOCK_DRGRAM). Also, usually when you checkpoint a listening socket with a standard (non-abstract) address, the socket inode will exist in the file system. So it will be there on restart (assuming the file system view was restored too, e.g. from snapshot). So bind() will fail with -EADDRINUSE. Therefore you need to first unlink() the desired pathname. And if it wasn't there, you need to unlink() after the bind()... > + if (ret < 0) > + goto out; > + > + ret = socket->ops->listen(socket, h->backlog); > + if (ret < 0) > + goto out; > + } else > + ckpt_debug("unsupported UNIX socket state %i\n", h->state); > + > + socket->state = h->state; h->state may hold arbitrary value, and in particular not necessarily in agreement with h->sock_state :( > + socket->sk->sk_state = h->sock_state; > + out: > + return ret; > +} > + > +struct socket *__sock_file_restore(struct ckpt_ctx *ctx, > + struct ckpt_hdr_socket *h) > +{ > + struct socket *socket; > + int ret; > + > + ret = sock_create(h->family, h->type, 0, &socket); > + if (ret < 0) > + return ERR_PTR(ret); > + > + if (h->family == AF_UNIX) { > + ret = sock_un_restart(ctx, h, socket); > + ckpt_debug("sock_un_restart: %i\n", ret); > + } else { > + ckpt_debug("unsupported family %i\n", h->family); > + ret = -EINVAL; > + } > + > + if (ret) > + goto out; > + Missing call to sock_cptrst() ? And in that case, will need to sanitize the data. > + ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue); > + if (ret) > + goto out; > + > + ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue); > + if (ret) > + goto out; > + out: > + if (ret) { > + sock_release(socket); > + socket = ERR_PTR(ret); > + } > + > + return socket; > +} > + The local addresses of sockets are not restored. This means that getpeername() will not give the expected result. I also wonder how this scheme will end up working when we add support to INET sockets - at first, locally connected. I like the simplicity of your approach that creates two separate sockets and manually connect them. Unfortunately, this "manually" is going to be very complicated with INET sockets. Here's an alternative approach for SOCK_STEAM sockets (SOCK_DGRAM later): A socket may be either listening, connect or accept (dgram: only connect). If a socket is not connected, then c/r is relatively easy. -- Checkpoint: If it is connected, then locate the correspondig other sockets: listening (L), connect (C) and accept (A). Checkpoint them in this order (regardless of which socket you hit first). If you first find the (L), then checkpoint it alone If it has not-yet-accepted sockets pending, say (a1, a2), then locate their corresponding (C1), (C2), and checkpoint (C1), (a1) and then (C2), (a2) etc. If for a pair (C)-(A), the (L) was already checkpointed, then save the objref only. If for a pair (C)-(A), the (L) does not exist anymore then save objref=0 (objrefs are otherwise always > 0). -- Restart: When seeing an (L), create a listening socket. (If need to later unlink the pathname, defer this to end of checkpoint). When seeing a (C), read it in, connect to the corresponding (L). If there was no (L), create a temporary one. If the peer was accepted, then accept to create (A), else leave as is to create (a). Add both (C) and (A)/(a) to the objhash. When seeing an (A) or (a), ckpt_obj_fetch() must succeed and there is little work to do. --- For SOCK_DGRAM Here, each socket may be "connected" (in the DGRAM sense) to another, or even to itself. To checkpoint, first save the socket, then the peer, then connect - similar to what you do now, but with a real "connect()" call, to get the full behind-the-scenes effect. For INET sockets, it's even simpler. --- The advantages of this approach are: - You care not about the internals of socket connection, or not-yet-accepted sockets. - The local- and peer-addresses are all set automagically. - It should work for INET sockets (locally connected, e.g. to localhost), without caring about e.g. TCP seq-numbers). Comments ? Oren. ^ permalink raw reply [flat|nested] 5+ messages in thread
[parent not found: <4A2CA97D.1060002-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v2) [not found] ` <4A2CA97D.1060002-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-06-08 15:10 ` Dan Smith [not found] ` <87iqj6sqvs.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> 0 siblings, 1 reply; 5+ messages in thread From: Dan Smith @ 2009-06-08 15:10 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg OL> People objected to "cr" prefix/ending in file- and function- OL> names. I suggest "net/checkpoint.c". Sure. I actually had this patch started before the big rename thing, so I changed the functions and not the file. net/checkpoint.c seems good to me. OL> Also, is there a reason to split checkpoint_... and restore_... OL> code between two files ? I'd put everything in net/checkpoint.c Personally, I don't like that separation, and judging by the amount of code I currently have for UNIX and INET sockets, I think having everything in one file is reasonable. OL> I believe you can use skb_clone() here, and avoid the actual OL> memory allocation and data copy. Okay, I'll check that out. OL> I think the GFP_ATOMIC in this case isn't that bad. But you could OL> get mitigate (at least some of) it by pre-allocating skb's before OL> grabbing the lock and then use skb_morph(). Okay. OL> What about passed file-descriptors ? Either handle them or fail OL> with an error (e.g. -EBUSY) when found. Ah, yeah, I'll add a test and failure for now and address it later. OL> Seems like you intend for the code to be generic and useful for OL> not only for unix domain sockets. Skb's may point to fragmented OL> data as well. If not supported, then the code should test for it OL> and report an error in that case. Okay. OL> What's the advantage of writing skb's instead of one long stream OL> of data ? If the receiver does a read() on the socket of (for example) 1024 long and the next skb in the stack is only 512 bytes long, they'll only get 512 bytes back until the next read(), right? If so, then restructuring the stream of data so that it behaves a little differently after restart seems less desirable to me. OL> Later we'll have sock_inet_... and then others, and this file will OL> grow indefinitely. Perhaps place family specific logic OL> (sock_un_checkpoint, sock_un_restore) in the corresponding subdir OL> (unix/checkpoint.c) ? If you like, sure. It doesn't seem like enough code to justify all the interfaces needed between them, but if it will help to make it easier to read then I can do that. OL> bind() may also be required for TCP_ESTABLISHED or TCP_CLOSED (and OL> of course, to SOCK_DRGRAM). Hmm, and how do I tell? OL> Also, usually when you checkpoint a listening socket with a OL> standard (non-abstract) address, the socket inode will exist in OL> the file system. So it will be there on restart (assuming the file OL> system view was restored too, e.g. from snapshot). So bind() will OL> fail with -EADDRINUSE. OL> Therefore you need to first unlink() the desired pathname. OL> And if it wasn't there, you need to unlink() after the bind()... I had a conversation about this with someone privately at one point and the thought was that userspace should handle removing the path before restart. It seems to make sense to me, to avoid having the kernel unlink() something as a side effect of the restart when there should be a userspace component managing more of the high-level stuff. h-> state may hold arbitrary value, and in particular not OL> necessarily in agreement with h->sock_state :( ...I'm not sure I follow :) OL> Missing call to sock_cptrst() ? And in that case, will need to OL> sanitize the data. Whoops, that must have been a casualty of splitting out the UNIX and INET bits :) OL> The local addresses of sockets are not restored. This means that OL> getpeername() will not give the expected result. Heh, this too. OL> I also wonder how this scheme will end up working when we add OL> support to INET sockets - at first, locally connected. I like the OL> simplicity of your approach that creates two separate sockets and OL> manually connect them. Unfortunately, this "manually" is going to OL> be very complicated with INET sockets. Well, it doesn't seem too bad thus far, FWIW. OL> Here's an alternative approach for SOCK_STEAM sockets (SOCK_DGRAM OL> later): OL> A socket may be either listening, connect or accept (dgram: only OL> connect). If a socket is not connected, then c/r is relatively OL> easy. OL> -- Checkpoint: OL> If it is connected, then locate the correspondig other sockets: OL> listening (L), connect (C) and accept (A). Checkpoint them in OL> this order (regardless of which socket you hit first). OL> If you first find the (L), then checkpoint it alone If it has OL> not-yet-accepted sockets pending, say (a1, a2), then locate OL> their corresponding (C1), (C2), and checkpoint (C1), (a1) and OL> then (C2), (a2) etc. OL> If for a pair (C)-(A), the (L) was already checkpointed, then OL> save the objref only. OL> If for a pair (C)-(A), the (L) does not exist anymore then save OL> objref=0 (objrefs are otherwise always > 0). OL> -- Restart: OL> When seeing an (L), create a listening socket. (If need to OL> later unlink the pathname, defer this to end of checkpoint). OL> When seeing a (C), read it in, connect to the corresponding OL> (L). If there was no (L), create a temporary one. If the OL> peer was accepted, then accept to create (A), else leave as OL> is to create (a). Add both (C) and (A)/(a) to the objhash. I assume you mean using connect() and accept() to simulate this interaction. How do you handle doing this within one thread when connect() will block you before you get to accept()? I must be missing some subtlety about the ordering. OL> When seeing an (A) or (a), ckpt_obj_fetch() must succeed and OL> there is little work to do. OL> --- For SOCK_DGRAM OL> Here, each socket may be "connected" (in the DGRAM sense) to OL> another, or even to itself. To checkpoint, first save the OL> socket, then the peer, then connect - similar to what you OL> do now, but with a real "connect()" call, to get the full OL> behind-the-scenes effect. For INET sockets, it's even OL> simpler. OL> --- OL> The advantages of this approach are: OL> - You care not about the internals of socket connection, or OL> not-yet-accepted sockets. With respect to the state of things, I suppose that's true. However, we'll still need to restore a fair bit of information about the socket settings (state, counters, etc) if we want to support anything other than trivial locally-connected sockets. From memory, restoring the state of the sockets is a fraction of the rest of it. OL> - The local- and peer-addresses are all set automagically. OL> - It should work for INET sockets (locally connected, e.g. OL> to localhost), without caring about e.g. TCP seq-numbers). ...and what about non-local sockets? I'm not sure I see how the above will work for INET sockets connected to remote machines, which is an important thing to support for migration. Can you elaborate? -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 5+ messages in thread
[parent not found: <87iqj6sqvs.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v2) [not found] ` <87iqj6sqvs.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org> @ 2009-06-08 18:39 ` Oren Laadan [not found] ` <4A2D5AD2.40805-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> 0 siblings, 1 reply; 5+ messages in thread From: Oren Laadan @ 2009-06-08 18:39 UTC (permalink / raw) To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg Dan Smith wrote: > OL> People objected to "cr" prefix/ending in file- and function- > OL> names. I suggest "net/checkpoint.c". > > Sure. I actually had this patch started before the big rename thing, > so I changed the functions and not the file. net/checkpoint.c seems > good to me. > > OL> Also, is there a reason to split checkpoint_... and restore_... > OL> code between two files ? I'd put everything in net/checkpoint.c > > Personally, I don't like that separation, and judging by the amount of > code I currently have for UNIX and INET sockets, I think having > everything in one file is reasonable. I meant that it's better to have it in one file/place. So we agree. (Because right now sock_file_restore() and __sock_file_restore() are separated into two files). > > OL> I believe you can use skb_clone() here, and avoid the actual > OL> memory allocation and data copy. > > Okay, I'll check that out. > > OL> I think the GFP_ATOMIC in this case isn't that bad. But you could > OL> get mitigate (at least some of) it by pre-allocating skb's before > OL> grabbing the lock and then use skb_morph(). > > Okay. > > OL> What about passed file-descriptors ? Either handle them or fail > OL> with an error (e.g. -EBUSY) when found. > > Ah, yeah, I'll add a test and failure for now and address it later. > > OL> Seems like you intend for the code to be generic and useful for > OL> not only for unix domain sockets. Skb's may point to fragmented > OL> data as well. If not supported, then the code should test for it > OL> and report an error in that case. > > Okay. > > OL> What's the advantage of writing skb's instead of one long stream > OL> of data ? > > If the receiver does a read() on the socket of (for example) 1024 long > and the next skb in the stack is only 512 bytes long, they'll only get > 512 bytes back until the next read(), right? If so, then > restructuring the stream of data so that it behaves a little > differently after restart seems less desirable to me. Is that for stream or dgram socket ? For stream sockets that shouldn't really matter for the application. There is nothing in posix that mandates such behavior, and all apps should be ready to retry the read. For dgram sockets of course we should maintain dgram boundaries. For seq_packet too. > > OL> Later we'll have sock_inet_... and then others, and this file will > OL> grow indefinitely. Perhaps place family specific logic > OL> (sock_un_checkpoint, sock_un_restore) in the corresponding subdir > OL> (unix/checkpoint.c) ? > > If you like, sure. It doesn't seem like enough code to justify all > the interfaces needed between them, but if it will help to make it > easier to read then I can do that. > I expect sock_inet_.... code to aggregate code with time. On the other hand, we can leave it as is and split when it actually grows or if someone complains ? > OL> bind() may also be required for TCP_ESTABLISHED or TCP_CLOSED (and > OL> of course, to SOCK_DRGRAM). > > Hmm, and how do I tell? For unix domain sockets, if unix_sk(sk)->addr is set, then there is a chance that it had a bind() call. A bind() can be by-inode (via filesystem) or abstract (first character is '\0'). You can test like this (code may be outdated) - int is_unixsock_bind_byinode(struct sock *sk) { struct sock *skk; if (unix_sk(sk)->dentry) { skk = unix_find_socket_byinode(unix_sk(sk)->dentry->d_inode); if (skk) { sock_put(skk); return (sk == skk); } } return 0; } int is_unixsock_bind_byname(struct sock *sk) { struct unix_address *addr = unix_sk(sk)->addr; unsigned short type = sk_memb(sk, type); /* see net/unix/af_unix.c to explain this xor: */ unsigned hash = addr->hash ^ type; struct sock *skk; read_lock(&unix_table_lock); skk = __unix_find_socket_byname(addr->name, addr->len, type, hash); read_unlock(&unix_table_lock); return (sk == skk); } If a test succeeds, you need to bind it explicitly. If it fails, (and unix_sk(sk)->addr is not NULL), it means that the address was inherited from the listening socket. This is also how you can tell between a connect and an accept side of a connection. > > OL> Also, usually when you checkpoint a listening socket with a > OL> standard (non-abstract) address, the socket inode will exist in > OL> the file system. So it will be there on restart (assuming the file > OL> system view was restored too, e.g. from snapshot). So bind() will > OL> fail with -EADDRINUSE. > > OL> Therefore you need to first unlink() the desired pathname. > > OL> And if it wasn't there, you need to unlink() after the bind()... > > I had a conversation about this with someone privately at one point > and the thought was that userspace should handle removing the path > before restart. It seems to make sense to me, to avoid having the > kernel unlink() something as a side effect of the restart when there > should be a userspace component managing more of the high-level > stuff. To me the opposite makes sense. I don't see anything wrong with the kernel unlink()ing it (temporarily). At checkpoint time, the pathname may, or may not, exist (could have been removed, and even re-created). If you unlink it from userspace, how would you ensure that the file is gone after the restart succeeds ? > > h-> state may hold arbitrary value, and in particular not > OL> necessarily in agreement with h->sock_state :( > > ...I'm not sure I follow :) You need to check what the user provides in the header for that field before putting it inside a kernel data structure. In this particular case, it must "agree" with sk_state as well: the set of valid values depends on the value already in (and validate for) h->sock_state. > > OL> Missing call to sock_cptrst() ? And in that case, will need to > OL> sanitize the data. > > Whoops, that must have been a casualty of splitting out the UNIX and > INET bits :) > > OL> The local addresses of sockets are not restored. This means that > OL> getpeername() will not give the expected result. > > Heh, this too. > > OL> I also wonder how this scheme will end up working when we add > OL> support to INET sockets - at first, locally connected. I like the > OL> simplicity of your approach that creates two separate sockets and > OL> manually connect them. Unfortunately, this "manually" is going to > OL> be very complicated with INET sockets. > > Well, it doesn't seem too bad thus far, FWIW. Maybe because I haven't seen your INET code, so I can't comment. Personally I tend to stay away from unneeded tinkering of complex kernel data structures. Needing to rebuilding the tcp control block by hand, which is what I assume you'll be doing, is something I'd like to avoid. > > OL> Here's an alternative approach for SOCK_STEAM sockets (SOCK_DGRAM > OL> later): > > OL> A socket may be either listening, connect or accept (dgram: only > OL> connect). If a socket is not connected, then c/r is relatively > OL> easy. > > OL> -- Checkpoint: > > OL> If it is connected, then locate the correspondig other sockets: > OL> listening (L), connect (C) and accept (A). Checkpoint them in > OL> this order (regardless of which socket you hit first). > > OL> If you first find the (L), then checkpoint it alone If it has > OL> not-yet-accepted sockets pending, say (a1, a2), then locate > OL> their corresponding (C1), (C2), and checkpoint (C1), (a1) and > OL> then (C2), (a2) etc. > > OL> If for a pair (C)-(A), the (L) was already checkpointed, then > OL> save the objref only. > > OL> If for a pair (C)-(A), the (L) does not exist anymore then save > OL> objref=0 (objrefs are otherwise always > 0). > > OL> -- Restart: > > OL> When seeing an (L), create a listening socket. (If need to > OL> later unlink the pathname, defer this to end of checkpoint). > > OL> When seeing a (C), read it in, connect to the corresponding > OL> (L). If there was no (L), create a temporary one. If the > OL> peer was accepted, then accept to create (A), else leave as > OL> is to create (a). Add both (C) and (A)/(a) to the objhash. > > I assume you mean using connect() and accept() to simulate this > interaction. How do you handle doing this within one thread when > connect() will block you before you get to accept()? I must be > missing some subtlety about the ordering. Yes, of course: using connect() and accept(). One thread is enough, provided that the connect() is done first: Connect() succeeds as soon as the listening socket can ack the connection (both INET and UNIX). The to-be-accepted - aka 'pending' sock (not yet socket) is kept with the listening socket in a (backlog) queue. In UNIX sockets it is put in a skb. In INET it's a real queue. A subsequent accept() will succeed immediately by accepting that pending sock and attaching it to a proper socket. > > OL> When seeing an (A) or (a), ckpt_obj_fetch() must succeed and > OL> there is little work to do. > > OL> --- For SOCK_DGRAM > > OL> Here, each socket may be "connected" (in the DGRAM sense) to > OL> another, or even to itself. To checkpoint, first save the > OL> socket, then the peer, then connect - similar to what you > OL> do now, but with a real "connect()" call, to get the full > OL> behind-the-scenes effect. For INET sockets, it's even > OL> simpler. > > OL> --- > > OL> The advantages of this approach are: > > OL> - You care not about the internals of socket connection, or > OL> not-yet-accepted sockets. > > With respect to the state of things, I suppose that's true. However, > we'll still need to restore a fair bit of information about the socket > settings (state, counters, etc) if we want to support anything other > than trivial locally-connected sockets. From memory, restoring the > state of the sockets is a fraction of the rest of it. Non locally connected sockets are different - because you only restore _one_ side of the connection, not both. And it requires more tinkering. But for locally connected sockets, what's the point in saving the protocol specific state (which isn't socket-state), like sequence numbers, acks, retransmits etc ? > > OL> - The local- and peer-addresses are all set automagically. > > OL> - It should work for INET sockets (locally connected, e.g. > OL> to localhost), without caring about e.g. TCP seq-numbers). > > ...and what about non-local sockets? I'm not sure I see how the above > will work for INET sockets connected to remote machines, which is an > important thing to support for migration. Can you elaborate? INET sockets connected to remote machines is a complex case. One way is to manually restore all the network stack for a connection. Another is to use connect/accept and fool the kernel to build what we want. I suggest that we start by handling listening sockets (easy), closing previously connected sockets (as if connection broke, like after a suspend/resume), and get some feedback from the networking people. Oren. ^ permalink raw reply [flat|nested] 5+ messages in thread
[parent not found: <4A2D5AD2.40805-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>]
* Re: [PATCH] c/r: Add AF_UNIX support (v2) [not found] ` <4A2D5AD2.40805-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> @ 2009-06-08 21:15 ` Dan Smith 0 siblings, 0 replies; 5+ messages in thread From: Dan Smith @ 2009-06-08 21:15 UTC (permalink / raw) To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg OL> I meant that it's better to have it in one file/place. So we OL> agree. (Because right now sock_file_restore() and OL> __sock_file_restore() are separated into two files). Ah, I see. The reason for that was because I needed a static function in socket.c for the restart function. However, I can make that cleaner. OL> Is that for stream or dgram socket ? Both, AFAIK. OL> For stream sockets that shouldn't really matter for the OL> application. There is nothing in posix that mandates such OL> behavior, and all apps should be ready to retry the read. It shouldn't matter, I agree. However, it may change the way the app behaves depending on whether it's just been restored or not. Plus, OL> For dgram sockets of course we should maintain dgram boundaries. Why special-case stream? h-> state may hold arbitrary value, and in particular not OL> necessarily in agreement with h->sock_state :( >> >> ...I'm not sure I follow :) OL> You need to check what the user provides in the header for that OL> field before putting it inside a kernel data structure. In this OL> particular case, it must "agree" with sk_state as well: the set of OL> valid values depends on the value already in (and validate for) OL> h->sock_state. Oh, I missed your point entirely before. Thanks. OL> Maybe because I haven't seen your INET code, so I can't comment. Not that this helps, but: checkpoint/sys.c | 3 include/linux/checkpoint_hdr.h | 94 ++++++++++++ include/linux/checkpoint_types.h | 2 net/ipv4/inet_connection_sock.c | 3 net/socket_cr.c | 288 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 390 insertions(+) That's about as much as should see the light of day yet. It needs a whole bunch of cleanup :) OL> Personally I tend to stay away from unneeded tinkering of complex OL> kernel data structures. Needing to rebuilding the tcp control OL> block by hand, which is what I assume you'll be doing, is OL> something I'd like to avoid. It works effectively the same way the unix patch does, yes. OL> Connect() succeeds as soon as the listening socket can ack the OL> connection (both INET and UNIX). OL> The to-be-accepted - aka 'pending' sock (not yet socket) is kept OL> with the listening socket in a (backlog) queue. In UNIX sockets OL> it is put in a skb. In INET it's a real queue. OL> A subsequent accept() will succeed immediately by accepting that OL> pending sock and attaching it to a proper socket. Okay, I'll switch it to this and see what it looks like. OL> Non locally connected sockets are different - because you only OL> restore _one_ side of the connection, not both. And it requires OL> more tinkering. Well, not in my approach, of course. The way I have the inet stuff done, I yank the socket into being. If you have two halves that happen to point at each other, then they will be connected after restart. If not, and you only restore one half, it's still connected to whatever is on the other end (provided it didn't go away). OL> But for locally connected sockets, what's the point in saving the OL> protocol specific state (which isn't socket-state), like sequence OL> numbers, acks, retransmits etc ? Symmetry with the non-special case? OL> INET sockets connected to remote machines is a complex case. One OL> way is to manually restore all the network stack for a OL> connection. Another is to use connect/accept and fool the kernel OL> to build what we want. Well, approaching this from the container point of view, I'd expect to have a veth device that can be shut down ahead of time to stop arping for the destination address, avoid a RST while monkeying with the connection, etc. OL> I suggest that we start by handling listening sockets (easy), OL> closing previously connected sockets (as if connection broke, OL> like after a suspend/resume), and get some feedback from the OL> networking people. Sounds good. I'll make some of the above changes to the unix patch and re-post, and then follow it by a patch to enable listen-only INET sockets. Thanks! -- Dan Smith IBM Linux Technology Center email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-06-08 21:15 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-05 18:27 [PATCH] c/r: Add AF_UNIX support (v2) Dan Smith
[not found] ` <1244226434-8167-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-08 6:02 ` Oren Laadan
[not found] ` <4A2CA97D.1060002-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-08 15:10 ` Dan Smith
[not found] ` <87iqj6sqvs.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-06-08 18:39 ` Oren Laadan
[not found] ` <4A2D5AD2.40805-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-08 21:15 ` Dan Smith
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox