linux-nfs.vger.kernel.org archive mirror
* [PATCH v3 5/6] net: add paged frag destructor support to kernel_sendpage.
       [not found] <1327494389.24561.316.camel@zakaz.uk.xensource.com>
@ 2012-01-25 12:27 ` Ian Campbell
  2012-01-25 12:27 ` [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack Ian Campbell
  1 sibling, 0 replies; 7+ messages in thread
From: Ian Campbell @ 2012-01-25 12:27 UTC (permalink / raw)
  To: netdev
  Cc: Ian Campbell, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Trond Myklebust, Greg Kroah-Hartman, drbd-user,
	devel, cluster-devel, ocfs2-devel, ceph-devel, rds-devel,
	linux-nfs

Supporting paged fragment destructors requires adding a new argument to the
various sendpage hooks up and down the stack. For now the parameter is always
NULL; the following patch introduces the first real user.
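
For reference, this is the overall shape of the change collected in one
place (a sketch assembled from the hunks below; the skb_frag_destructor
layout is inferred from how the series uses it, not quoted from a header,
so treat the field names as assumptions):

	struct skb_frag_destructor {
		atomic_t ref;		/* dropped via skb_frag_destructor_unref() */
		int (*destroy)(struct skb_frag_destructor *destructor);
	};

	extern int kernel_sendpage(struct socket *sock, struct page *page,
				   struct skb_frag_destructor *destroy,
				   int offset,
				   size_t size, int flags);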

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: "Pekka Savola (ipv6)" <pekkas@netcore.fi>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: drbd-user@lists.linbit.com
Cc: devel@driverdev.osuosl.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
Cc: netdev@vger.kernel.org
Cc: ceph-devel@vger.kernel.org
Cc: rds-devel@oss.oracle.com
Cc: linux-nfs@vger.kernel.org
---
 drivers/block/drbd/drbd_main.c           |    1 +
 drivers/scsi/iscsi_tcp.c                 |    4 ++--
 drivers/scsi/iscsi_tcp.h                 |    3 ++-
 drivers/staging/pohmelfs/trans.c         |    3 ++-
 drivers/target/iscsi/iscsi_target_util.c |    3 ++-
 fs/dlm/lowcomms.c                        |    4 ++--
 fs/ocfs2/cluster/tcp.c                   |    1 +
 include/linux/net.h                      |    6 +++++-
 include/net/inet_common.h                |    4 +++-
 include/net/ip.h                         |    4 +++-
 include/net/sock.h                       |    8 +++++---
 include/net/tcp.h                        |    4 +++-
 net/ceph/messenger.c                     |    2 +-
 net/core/sock.c                          |    6 +++++-
 net/ipv4/af_inet.c                       |    9 ++++++---
 net/ipv4/ip_output.c                     |    6 ++++--
 net/ipv4/tcp.c                           |   24 +++++++++++++++---------
 net/ipv4/udp.c                           |   11 ++++++-----
 net/ipv4/udp_impl.h                      |    5 +++--
 net/rds/tcp_send.c                       |    1 +
 net/socket.c                             |   11 +++++++----
 net/sunrpc/svcsock.c                     |    6 +++---
 net/sunrpc/xprtsock.c                    |    2 +-
 23 files changed, 83 insertions(+), 45 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55..49c7346 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2584,6 +2584,7 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
 	set_fs(KERNEL_DS);
 	do {
 		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+							NULL,
 							offset, len,
 							msg_flags);
 		if (sent == -EAGAIN) {
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 7c34d8e..3884ae1 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -284,8 +284,8 @@ static int iscsi_sw_tcp_xmit_segment(struct iscsi_tcp_conn *tcp_conn,
 		if (!segment->data) {
 			sg = segment->sg;
 			offset += segment->sg_offset + sg->offset;
-			r = tcp_sw_conn->sendpage(sk, sg_page(sg), offset,
-						  copy, flags);
+			r = tcp_sw_conn->sendpage(sk, sg_page(sg), NULL,
+						  offset, copy, flags);
 		} else {
 			struct msghdr msg = { .msg_flags = flags };
 			struct kvec iov = {
diff --git a/drivers/scsi/iscsi_tcp.h b/drivers/scsi/iscsi_tcp.h
index 666fe09..1e23265 100644
--- a/drivers/scsi/iscsi_tcp.h
+++ b/drivers/scsi/iscsi_tcp.h
@@ -52,7 +52,8 @@ struct iscsi_sw_tcp_conn {
 	uint32_t		sendpage_failures_cnt;
 	uint32_t		discontiguous_hdr_cnt;
 
-	ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
+	ssize_t (*sendpage)(struct socket *, struct page *,
+			    struct skb_frag_destructor *, int, size_t, int);
 };
 
 struct iscsi_sw_tcp_host {
diff --git a/drivers/staging/pohmelfs/trans.c b/drivers/staging/pohmelfs/trans.c
index 06c1a74..96a7921 100644
--- a/drivers/staging/pohmelfs/trans.c
+++ b/drivers/staging/pohmelfs/trans.c
@@ -104,7 +104,8 @@ static int netfs_trans_send_pages(struct netfs_trans *t, struct netfs_state *st)
 		msg.msg_flags = MSG_WAITALL | (attached_pages == 1 ? 0 :
 				MSG_MORE);
 
-		err = kernel_sendpage(st->socket, page, 0, size, msg.msg_flags);
+		err = kernel_sendpage(st->socket, page, NULL,
+				      0, size, msg.msg_flags);
 		if (err <= 0) {
 			printk("%s: %d/%d failed to send transaction page: t: %p, gen: %u, size: %u, err: %d.\n",
 					__func__, i, t->page_num, t, t->gen, size, err);
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 02348f7..d0c984b 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1315,7 +1315,8 @@ send_hdr:
 		u32 sub_len = min_t(u32, data_len, space);
 send_pg:
 		tx_sent = conn->sock->ops->sendpage(conn->sock,
-					sg_page(sg), sg->offset + offset, sub_len, 0);
+					sg_page(sg), NULL,
+					sg->offset + offset, sub_len, 0);
 		if (tx_sent != sub_len) {
 			if (tx_sent == -EAGAIN) {
 				pr_err("tcp_sendpage() returned"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0b3109e..622107a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1342,8 +1342,8 @@ static void send_to_sock(struct connection *con)
 
 		ret = 0;
 		if (len) {
-			ret = kernel_sendpage(con->sock, e->page, offset, len,
-					      msg_flags);
+			ret = kernel_sendpage(con->sock, e->page, NULL,
+					      offset, len, msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
 				if (ret == -EAGAIN &&
 				    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 044e7b5..e13851e 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -983,6 +983,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 		mutex_lock(&sc->sc_send_lock);
 		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
 						 virt_to_page(kmalloced_virt),
+						 NULL,
 						 (long)kmalloced_virt & ~PAGE_MASK,
 						 size, MSG_DONTWAIT);
 		mutex_unlock(&sc->sc_send_lock);
diff --git a/include/linux/net.h b/include/linux/net.h
index b299230..db562ba 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -157,6 +157,7 @@ struct kiocb;
 struct sockaddr;
 struct msghdr;
 struct module;
+struct skb_frag_destructor;
 
 struct proto_ops {
 	int		family;
@@ -203,6 +204,7 @@ struct proto_ops {
 	int		(*mmap)	     (struct file *file, struct socket *sock,
 				      struct vm_area_struct * vma);
 	ssize_t		(*sendpage)  (struct socket *sock, struct page *page,
+				      struct skb_frag_destructor *destroy,
 				      int offset, size_t size, int flags);
 	ssize_t 	(*splice_read)(struct socket *sock,  loff_t *ppos,
 				       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
@@ -273,7 +275,9 @@ extern int kernel_getsockopt(struct socket *sock, int level, int optname,
 			     char *optval, int *optlen);
 extern int kernel_setsockopt(struct socket *sock, int level, int optname,
 			     char *optval, unsigned int optlen);
-extern int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+extern int kernel_sendpage(struct socket *sock, struct page *page,
+			   struct skb_frag_destructor *destroy,
+			   int offset,
 			   size_t size, int flags);
 extern int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
 extern int kernel_sock_shutdown(struct socket *sock,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac98..91cd8d0 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -21,7 +21,9 @@ extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
 extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
 			struct msghdr *msg, size_t size);
-extern ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+extern ssize_t inet_sendpage(struct socket *sock, struct page *page,
+			     struct skb_frag_destructor *frag,
+			     int offset,
 			     size_t size, int flags);
 extern int inet_recvmsg(struct kiocb *iocb, struct socket *sock,
 			struct msghdr *msg, size_t size, int flags);
diff --git a/include/net/ip.h b/include/net/ip.h
index 775009f..33ea0da 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -114,7 +114,9 @@ extern int		ip_append_data(struct sock *sk, struct flowi4 *fl4,
 				struct rtable **rt,
 				unsigned int flags);
 extern int		ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
-extern ssize_t		ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+extern ssize_t		ip_append_page(struct sock *sk, struct flowi4 *fl4,
+				struct page *page,
+				struct skb_frag_destructor *destroy,
 				int offset, size_t size, int flags);
 extern struct sk_buff  *__ip_make_skb(struct sock *sk,
 				      struct flowi4 *fl4,
diff --git a/include/net/sock.h b/include/net/sock.h
index bb972d2..e13a512 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -801,6 +801,7 @@ struct proto {
 					size_t len, int noblock, int flags, 
 					int *addr_len);
 	int			(*sendpage)(struct sock *sk, struct page *page,
+					struct skb_frag_destructor *destroy,
 					int offset, size_t size, int flags);
 	int			(*bind)(struct sock *sk, 
 					struct sockaddr *uaddr, int addr_len);
@@ -1422,9 +1423,10 @@ extern int			sock_no_mmap(struct file *file,
 					     struct socket *sock,
 					     struct vm_area_struct *vma);
 extern ssize_t			sock_no_sendpage(struct socket *sock,
-						struct page *page,
-						int offset, size_t size, 
-						int flags);
+					struct page *page,
+					struct skb_frag_destructor *destroy,
+					int offset, size_t size,
+					int flags);
 
 /*
  * Functions to fill in entries in struct proto_ops when a protocol
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0118ea9..147a268 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -322,7 +322,9 @@ extern void *tcp_v4_tw_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
-extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+extern int tcp_sendpage(struct sock *sk, struct page *page,
+			struct skb_frag_destructor *destroy,
+			int offset,
 			size_t size, int flags);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ad5b708..69f049b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -851,7 +851,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 				cpu_to_le32(crc32c(tmpcrc, base, len));
 			con->out_msg_pos.did_page_crc = 1;
 		}
-		ret = kernel_sendpage(con->sock, page,
+		ret = kernel_sendpage(con->sock, page, NULL,
 				      con->out_msg_pos.page_pos + page_shift,
 				      len,
 				      MSG_DONTWAIT | MSG_NOSIGNAL |
diff --git a/net/core/sock.c b/net/core/sock.c
index 002939c..563401b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1939,7 +1939,9 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
 }
 EXPORT_SYMBOL(sock_no_mmap);
 
-ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page,
+			 struct skb_frag_destructor *destroy,
+			 int offset, size_t size, int flags)
 {
 	ssize_t res;
 	struct msghdr msg = {.msg_flags = flags};
@@ -1949,6 +1951,8 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
 	iov.iov_len = size;
 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
 	kunmap(page);
+	/* kernel_sendmsg copies so we can destroy immediately */
+	skb_frag_destructor_unref(destroy);
 	return res;
 }
 EXPORT_SYMBOL(sock_no_sendpage);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f7b5670..591665d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -745,7 +745,9 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ssize_t inet_sendpage(struct socket *sock, struct page *page,
+		      struct skb_frag_destructor *destroy,
+		      int offset,
 		      size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
@@ -758,8 +760,9 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
-		return sk->sk_prot->sendpage(sk, page, offset, size, flags);
-	return sock_no_sendpage(sock, page, offset, size, flags);
+		return sk->sk_prot->sendpage(sk, page, destroy,
+					     offset, size, flags);
+	return sock_no_sendpage(sock, page, destroy, offset, size, flags);
 }
 EXPORT_SYMBOL(inet_sendpage);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9e4eca6..2ce0b8e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1130,6 +1130,7 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
 }
 
 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+		       struct skb_frag_destructor *destroy,
 		       int offset, size_t size, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -1243,11 +1244,12 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 		i = skb_shinfo(skb)->nr_frags;
 		if (len > size)
 			len = size;
-		if (skb_can_coalesce(skb, i, page, NULL, offset)) {
+		if (skb_can_coalesce(skb, i, page, destroy, offset)) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
 		} else if (i < MAX_SKB_FRAGS) {
-			get_page(page);
 			skb_fill_page_desc(skb, i, page, offset, len);
+			skb_frag_set_destructor(skb, i, destroy);
+			skb_frag_ref(skb, i);
 		} else {
 			err = -EMSGSIZE;
 			goto error;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a928aec..17dd307 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -755,8 +755,11 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 	return mss_now;
 }
 
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
-			 size_t psize, int flags)
+static ssize_t do_tcp_sendpages(struct sock *sk,
+				struct page **pages,
+				struct skb_frag_destructor *destroy,
+				int poffset,
+				size_t psize, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mss_now, size_goal;
@@ -802,7 +805,7 @@ new_segment:
 			copy = size;
 
 		i = skb_shinfo(skb)->nr_frags;
-		can_coalesce = skb_can_coalesce(skb, i, page, NULL, offset);
+		can_coalesce = skb_can_coalesce(skb, i, page, destroy, offset);
 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
@@ -813,8 +816,9 @@ new_segment:
 		if (can_coalesce) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 		} else {
-			get_page(page);
 			skb_fill_page_desc(skb, i, page, offset, copy);
+			skb_frag_set_destructor(skb, i, destroy);
+			skb_frag_ref(skb, i);
 		}
 
 		skb->len += copy;
@@ -869,18 +873,20 @@ out_err:
 	return sk_stream_error(sk, flags, err);
 }
 
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
-		 size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page,
+		 struct skb_frag_destructor *destroy,
+		 int offset, size_t size, int flags)
 {
 	ssize_t res;
 
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
 	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
-		return sock_no_sendpage(sk->sk_socket, page, offset, size,
-					flags);
+		return sock_no_sendpage(sk->sk_socket, page, destroy,
+					offset, size, flags);
 
 	lock_sock(sk);
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &page, destroy,
+			       offset, size, flags);
 	release_sock(sk);
 	return res;
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5d075b5..c596d36 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1029,8 +1029,9 @@ do_confirm:
 }
 EXPORT_SYMBOL(udp_sendmsg);
 
-int udp_sendpage(struct sock *sk, struct page *page, int offset,
-		 size_t size, int flags)
+int udp_sendpage(struct sock *sk, struct page *page,
+		 struct skb_frag_destructor *destroy,
+		 int offset, size_t size, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct udp_sock *up = udp_sk(sk);
@@ -1058,11 +1059,11 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	}
 
 	ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
-			     page, offset, size, flags);
+			     page, destroy, offset, size, flags);
 	if (ret == -EOPNOTSUPP) {
 		release_sock(sk);
-		return sock_no_sendpage(sk->sk_socket, page, offset,
-					size, flags);
+		return sock_no_sendpage(sk->sk_socket, page, destroy,
+					offset, size, flags);
 	}
 	if (ret < 0) {
 		udp_flush_pending_frames(sk);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650..4923d82 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -23,8 +23,9 @@ extern int	compat_udp_getsockopt(struct sock *sk, int level, int optname,
 #endif
 extern int	udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			    size_t len, int noblock, int flags, int *addr_len);
-extern int	udp_sendpage(struct sock *sk, struct page *page, int offset,
-			     size_t size, int flags);
+extern int	udp_sendpage(struct sock *sk, struct page *page,
+			     struct skb_frag_destructor *destroy,
+			     int offset, size_t size, int flags);
 extern int	udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
 extern void	udp_destroy_sock(struct sock *sk);
 
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 1b4fd68..71503ad 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -119,6 +119,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 	while (sg < rm->data.op_nents) {
 		ret = tc->t_sock->ops->sendpage(tc->t_sock,
 						sg_page(&rm->data.op_sg[sg]),
+						NULL,
 						rm->data.op_sg[sg].offset + off,
 						rm->data.op_sg[sg].length - off,
 						MSG_DONTWAIT|MSG_NOSIGNAL);
diff --git a/net/socket.c b/net/socket.c
index e56162c..1812660 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -815,7 +815,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 	if (more)
 		flags |= MSG_MORE;
 
-	return kernel_sendpage(sock, page, offset, size, flags);
+	return kernel_sendpage(sock, page, NULL, offset, size, flags);
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
@@ -3358,15 +3358,18 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
 }
 EXPORT_SYMBOL(kernel_setsockopt);
 
-int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+int kernel_sendpage(struct socket *sock, struct page *page,
+		    struct skb_frag_destructor *destroy,
+		    int offset,
 		    size_t size, int flags)
 {
 	sock_update_classid(sock->sk);
 
 	if (sock->ops->sendpage)
-		return sock->ops->sendpage(sock, page, offset, size, flags);
+		return sock->ops->sendpage(sock, page, destroy,
+					   offset, size, flags);
 
-	return sock_no_sendpage(sock, page, offset, size, flags);
+	return sock_no_sendpage(sock, page, destroy, offset, size, flags);
 }
 EXPORT_SYMBOL(kernel_sendpage);
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 4653286..0e6b919 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -185,7 +185,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
 	/* send head */
 	if (slen == xdr->head[0].iov_len)
 		flags = 0;
-	len = kernel_sendpage(sock, headpage, headoffset,
+	len = kernel_sendpage(sock, headpage, NULL, headoffset,
 				  xdr->head[0].iov_len, flags);
 	if (len != xdr->head[0].iov_len)
 		goto out;
@@ -198,7 +198,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
 	while (pglen > 0) {
 		if (slen == size)
 			flags = 0;
-		result = kernel_sendpage(sock, *ppage, base, size, flags);
+		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
 		if (result > 0)
 			len += result;
 		if (result != size)
@@ -212,7 +212,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
 
 	/* send tail */
 	if (xdr->tail[0].iov_len) {
-		result = kernel_sendpage(sock, tailpage, tailoffset,
+		result = kernel_sendpage(sock, tailpage, NULL, tailoffset,
 				   xdr->tail[0].iov_len, 0);
 		if (result > 0)
 			len += result;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 55472c4..38b2fec 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -408,7 +408,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
 		remainder -= len;
 		if (remainder != 0 || more)
 			flags |= MSG_MORE;
-		err = sock->ops->sendpage(sock, *ppage, base, len, flags);
+		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
 		if (remainder == 0 || err != len)
 			break;
 		sent += err;
-- 
1.7.2.5



* [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
       [not found] <1327494389.24561.316.camel@zakaz.uk.xensource.com>
  2012-01-25 12:27 ` [PATCH v3 5/6] net: add paged frag destructor support to kernel_sendpage Ian Campbell
@ 2012-01-25 12:27 ` Ian Campbell
  2012-01-26 13:11   ` Michael S. Tsirkin
  1 sibling, 1 reply; 7+ messages in thread
From: Ian Campbell @ 2012-01-25 12:27 UTC (permalink / raw)
  To: netdev
  Cc: Ian Campbell, David S. Miller, Neil Brown, J. Bruce Fields,
	linux-nfs

This prevents an issue where an ACK is delayed, a retransmit is queued (either
at the RPC or TCP level) and the ACK arrives before the retransmission hits the
wire. If this happens to an NFS WRITE RPC then the write() system call
completes and the userspace process can continue, potentially modifying data
referenced by the retransmission before the retransmission occurs.
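
In outline, the deferred completion works like this (an illustrative
condensation of the hunks below, not new code):

	/* request setup (xprt_request_init): the RPC layer holds one
	 * reference and registers its completion callback */
	atomic_set(&req->destructor.ref, 1);
	req->destructor.destroy = &xprt_complete_skb_pages;

	/* transmit (xs_send_pagedata): each page handed to the stack
	 * takes a fragment reference on the destructor */
	err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
				  base, len, flags);

	/* reply decoded (call_decode): drop the RPC layer's reference
	 * and sleep; ->destroy() fires, and wakes the task, only once
	 * the network stack has released the last fragment */
	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
	skb_frag_destructor_unref(&req->destructor);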

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: linux-nfs@vger.kernel.org
Cc: netdev@vger.kernel.org
---
 include/linux/sunrpc/xdr.h  |    2 ++
 include/linux/sunrpc/xprt.h |    5 ++++-
 net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
 net/sunrpc/svcsock.c        |    3 ++-
 net/sunrpc/xprt.c           |   12 ++++++++++++
 net/sunrpc/xprtsock.c       |    3 ++-
 6 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index a20970e..172f81e 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -16,6 +16,7 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 #include <linux/scatterlist.h>
+#include <linux/skbuff.h>
 
 /*
  * Buffer adjustment
@@ -57,6 +58,7 @@ struct xdr_buf {
 			tail[1];	/* Appended after page data */
 
 	struct page **	pages;		/* Array of contiguous pages */
+	struct skb_frag_destructor *destructor;
 	unsigned int	page_base,	/* Start of page data */
 			page_len,	/* Length of page data */
 			flags;		/* Flags for data disposition */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 15518a1..75131eb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -92,7 +92,10 @@ struct rpc_rqst {
 						/* A cookie used to track the
 						   state of the transport
 						   connection */
-	
+	struct skb_frag_destructor destructor;	/* SKB paged fragment
+						 * destructor for
+						 * transmitted pages*/
+
 	/*
 	 * Partial send handling
 	 */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f0268ea..06f363f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
 static void	call_reserveresult(struct rpc_task *task);
 static void	call_allocate(struct rpc_task *task);
 static void	call_decode(struct rpc_task *task);
+static void	call_complete(struct rpc_task *task);
 static void	call_bind(struct rpc_task *task);
 static void	call_bind_status(struct rpc_task *task);
 static void	call_transmit(struct rpc_task *task);
@@ -1115,6 +1116,8 @@ rpc_xdr_encode(struct rpc_task *task)
 			 (char *)req->rq_buffer + req->rq_callsize,
 			 req->rq_rcvsize);
 
+	req->rq_snd_buf.destructor = &req->destructor;
+
 	p = rpc_encode_header(task);
 	if (p == NULL) {
 		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
@@ -1278,6 +1281,7 @@ call_connect_status(struct rpc_task *task)
 static void
 call_transmit(struct rpc_task *task)
 {
+	struct rpc_rqst *req = task->tk_rqstp;
 	dprint_status(task);
 
 	task->tk_action = call_status;
@@ -1311,8 +1315,8 @@ call_transmit(struct rpc_task *task)
 	call_transmit_status(task);
 	if (rpc_reply_expected(task))
 		return;
-	task->tk_action = rpc_exit_task;
-	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
+	task->tk_action = call_complete;
+	skb_frag_destructor_unref(&req->destructor);
 }
 
 /*
@@ -1385,7 +1389,8 @@ call_bc_transmit(struct rpc_task *task)
 		return;
 	}
 
-	task->tk_action = rpc_exit_task;
+	task->tk_action = call_complete;
+	skb_frag_destructor_unref(&req->destructor);
 	if (task->tk_status < 0) {
 		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
 			"error: %d\n", task->tk_status);
@@ -1425,7 +1430,6 @@ call_bc_transmit(struct rpc_task *task)
 			"error: %d\n", task->tk_status);
 		break;
 	}
-	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
@@ -1591,12 +1595,14 @@ call_decode(struct rpc_task *task)
 		return;
 	}
 
-	task->tk_action = rpc_exit_task;
+	task->tk_action = call_complete;
 
 	if (decode) {
 		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
 						      task->tk_msg.rpc_resp);
 	}
+	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
+	skb_frag_destructor_unref(&req->destructor);
 	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
 			task->tk_status);
 	return;
@@ -1611,6 +1617,17 @@ out_retry:
 	}
 }
 
+/*
+ * 8.	Wait for pages to be released by the network stack.
+ */
+static void
+call_complete(struct rpc_task *task)
+{
+	dprintk("RPC: %5u call_complete result %d\n",
+		task->tk_pid, task->tk_status);
+	task->tk_action = rpc_exit_task;
+}
+
 static __be32 *
 rpc_encode_header(struct rpc_task *task)
 {
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0e6b919..aa7f108 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
 	while (pglen > 0) {
 		if (slen == size)
 			flags = 0;
-		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
+		result = kernel_sendpage(sock, *ppage, xdr->destructor,
+					 base, size, flags);
 		if (result > 0)
 			len += result;
 		if (result != size)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c64c0ef..2137eb6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1101,6 +1101,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
 	xprt->xid = net_random();
 }
 
+static int xprt_complete_skb_pages(struct skb_frag_destructor *destroy)
+{
+	struct rpc_rqst	*req =
+		container_of(destroy, struct rpc_rqst, destructor);
+
+	dprintk("RPC: %5u completing skb pages\n", req->rq_task->tk_pid);
+	rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);
+	return 0;
+}
+
 static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 {
 	struct rpc_rqst	*req = task->tk_rqstp;
@@ -1113,6 +1123,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 	req->rq_xid     = xprt_alloc_xid(xprt);
 	req->rq_release_snd_buf = NULL;
 	xprt_reset_majortimeo(req);
+	atomic_set(&req->destructor.ref, 1);
+	req->destructor.destroy = &xprt_complete_skb_pages;
 	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
 			req, ntohl(req->rq_xid));
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 38b2fec..5406977 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
 		remainder -= len;
 		if (remainder != 0 || more)
 			flags |= MSG_MORE;
-		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
+		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
+					  base, len, flags);
 		if (remainder == 0 || err != len)
 			break;
 		sent += err;
-- 
1.7.2.5



* Re: [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
  2012-01-25 12:27 ` [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack Ian Campbell
@ 2012-01-26 13:11   ` Michael S. Tsirkin
  2012-01-30 15:51     ` Ian Campbell
  0 siblings, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2012-01-26 13:11 UTC (permalink / raw)
  To: Ian Campbell
  Cc: netdev, David S. Miller, Neil Brown, J. Bruce Fields, linux-nfs

On Wed, Jan 25, 2012 at 12:27:14PM +0000, Ian Campbell wrote:
> This prevents an issue where an ACK is delayed, a retransmit is queued (either
> at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> wire. If this happens to an NFS WRITE RPC then the write() system call
> completes and the userspace process can continue, potentially modifying data
> referenced by the retransmission before the retransmission occurs.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Neil Brown <neilb@suse.de>
> Cc: "J. Bruce Fields" <bfields@fieldses.org>
> Cc: linux-nfs@vger.kernel.org
> Cc: netdev@vger.kernel.org

This doesn't include either of the two options you proposed to address
the issue of the sender being blocked forever by the receiver in bridged
setups with endpoints such as a tap device or a socket on the same box,
does it?

I mean this discussion:
http://article.gmane.org/gmane.linux.nfs/44846/match=use+skb+fragment+destructors

It would be nice to have this addressed in some way.

How about patching __skb_queue_tail to do a deep copy?
That would seem to handle both tap and socket cases -
any other ones left?

If we do this, I think it would be beneficial to pass a flag
to the destructor indicating that a deep copy was done:
this would allow senders to detect that and adapt.
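
Roughly, the __skb_queue_tail change I have in mind would look like this
(an untested sketch; skb_has_frag_destructor() is a made-up helper for
illustration):

	static inline void __skb_queue_tail(struct sk_buff_head *list,
					    struct sk_buff *newsk)
	{
		/* If the skb still references sender-owned pages, deep-copy
		 * it so a receiver that sits on its queue (tap, same-box
		 * socket) can never block the sender indefinitely. If the
		 * copy fails we fall back to queueing the original. */
		if (skb_has_frag_destructor(newsk)) {	/* made-up helper */
			struct sk_buff *copy = skb_copy(newsk, GFP_ATOMIC);
			if (copy) {
				consume_skb(newsk);	/* releases the pages,
							 * firing the destructor */
				newsk = copy;
			}
		}
		__skb_queue_before(list, (struct sk_buff *)list, newsk);
	}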


> ---
>  include/linux/sunrpc/xdr.h  |    2 ++
>  include/linux/sunrpc/xprt.h |    5 ++++-
>  net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
>  net/sunrpc/svcsock.c        |    3 ++-
>  net/sunrpc/xprt.c           |   12 ++++++++++++
>  net/sunrpc/xprtsock.c       |    3 ++-
>  6 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
> index a20970e..172f81e 100644
> --- a/include/linux/sunrpc/xdr.h
> +++ b/include/linux/sunrpc/xdr.h
> @@ -16,6 +16,7 @@
>  #include <asm/byteorder.h>
>  #include <asm/unaligned.h>
>  #include <linux/scatterlist.h>
> +#include <linux/skbuff.h>
>  
>  /*
>   * Buffer adjustment
> @@ -57,6 +58,7 @@ struct xdr_buf {
>  			tail[1];	/* Appended after page data */
>  
>  	struct page **	pages;		/* Array of contiguous pages */
> +	struct skb_frag_destructor *destructor;
>  	unsigned int	page_base,	/* Start of page data */
>  			page_len,	/* Length of page data */
>  			flags;		/* Flags for data disposition */
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index 15518a1..75131eb 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -92,7 +92,10 @@ struct rpc_rqst {
>  						/* A cookie used to track the
>  						   state of the transport
>  						   connection */
> -	
> +	struct skb_frag_destructor destructor;	/* SKB paged fragment
> +						 * destructor for
> +						 * transmitted pages*/
> +
>  	/*
>  	 * Partial send handling
>  	 */
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index f0268ea..06f363f 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
>  static void	call_reserveresult(struct rpc_task *task);
>  static void	call_allocate(struct rpc_task *task);
>  static void	call_decode(struct rpc_task *task);
> +static void	call_complete(struct rpc_task *task);
>  static void	call_bind(struct rpc_task *task);
>  static void	call_bind_status(struct rpc_task *task);
>  static void	call_transmit(struct rpc_task *task);
> @@ -1115,6 +1116,8 @@ rpc_xdr_encode(struct rpc_task *task)
>  			 (char *)req->rq_buffer + req->rq_callsize,
>  			 req->rq_rcvsize);
>  
> +	req->rq_snd_buf.destructor = &req->destructor;
> +
>  	p = rpc_encode_header(task);
>  	if (p == NULL) {
>  		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
> @@ -1278,6 +1281,7 @@ call_connect_status(struct rpc_task *task)
>  static void
>  call_transmit(struct rpc_task *task)
>  {
> +	struct rpc_rqst *req = task->tk_rqstp;
>  	dprint_status(task);
>  
>  	task->tk_action = call_status;
> @@ -1311,8 +1315,8 @@ call_transmit(struct rpc_task *task)
>  	call_transmit_status(task);
>  	if (rpc_reply_expected(task))
>  		return;
> -	task->tk_action = rpc_exit_task;
> -	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
> +	task->tk_action = call_complete;
> +	skb_frag_destructor_unref(&req->destructor);
>  }
>  
>  /*
> @@ -1385,7 +1389,8 @@ call_bc_transmit(struct rpc_task *task)
>  		return;
>  	}
>  
> -	task->tk_action = rpc_exit_task;
> +	task->tk_action = call_complete;
> +	skb_frag_destructor_unref(&req->destructor);
>  	if (task->tk_status < 0) {
>  		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
>  			"error: %d\n", task->tk_status);
> @@ -1425,7 +1430,6 @@ call_bc_transmit(struct rpc_task *task)
>  			"error: %d\n", task->tk_status);
>  		break;
>  	}
> -	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
>  }
>  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
>  
> @@ -1591,12 +1595,14 @@ call_decode(struct rpc_task *task)
>  		return;
>  	}
>  
> -	task->tk_action = rpc_exit_task;
> +	task->tk_action = call_complete;
>  
>  	if (decode) {
>  		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
>  						      task->tk_msg.rpc_resp);
>  	}
> +	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
> +	skb_frag_destructor_unref(&req->destructor);
>  	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
>  			task->tk_status);
>  	return;
> @@ -1611,6 +1617,17 @@ out_retry:
>  	}
>  }
>  
> +/*
> + * 8.	Wait for pages to be released by the network stack.
> + */
> +static void
> +call_complete(struct rpc_task *task)
> +{
> +	dprintk("RPC: %5u call_complete result %d\n",
> +		task->tk_pid, task->tk_status);
> +	task->tk_action = rpc_exit_task;
> +}
> +
>  static __be32 *
>  rpc_encode_header(struct rpc_task *task)
>  {
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 0e6b919..aa7f108 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
>  	while (pglen > 0) {
>  		if (slen == size)
>  			flags = 0;
> -		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> +		result = kernel_sendpage(sock, *ppage, xdr->destructor,
> +					 base, size, flags);
>  		if (result > 0)
>  			len += result;
>  		if (result != size)
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index c64c0ef..2137eb6 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -1101,6 +1101,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
>  	xprt->xid = net_random();
>  }
>  
> +static int xprt_complete_skb_pages(struct skb_frag_destructor *destroy)
> +{
> +	struct rpc_rqst	*req =
> +		container_of(destroy, struct rpc_rqst, destructor);
> +
> +	dprintk("RPC: %5u completing skb pages\n", req->rq_task->tk_pid);
> +	rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);

It seems that the sunrpc module can get unloaded
at this point. The code for 'return 0' can then get
overwritten. Right?

Not sure how important making module unloading robust
is - if we wanted to fix this we'd need to move
the callback code into core (hopefully generalizing
it somewhat).

> +	return 0;
> +}
> +
>  static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>  {
>  	struct rpc_rqst	*req = task->tk_rqstp;
> @@ -1113,6 +1123,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>  	req->rq_xid     = xprt_alloc_xid(xprt);
>  	req->rq_release_snd_buf = NULL;
>  	xprt_reset_majortimeo(req);
> +	atomic_set(&req->destructor.ref, 1);
> +	req->destructor.destroy = &xprt_complete_skb_pages;
>  	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
>  			req, ntohl(req->rq_xid));
>  }
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 38b2fec..5406977 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>  		remainder -= len;
>  		if (remainder != 0 || more)
>  			flags |= MSG_MORE;
> -		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
> +		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
> +					  base, len, flags);
>  		if (remainder == 0 || err != len)
>  			break;
>  		sent += err;
> -- 
> 1.7.2.5
> 


* Re: [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
  2012-01-26 13:11   ` Michael S. Tsirkin
@ 2012-01-30 15:51     ` Ian Campbell
  2012-01-30 16:23       ` Michael S. Tsirkin
  0 siblings, 1 reply; 7+ messages in thread
From: Ian Campbell @ 2012-01-30 15:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev@vger.kernel.org, David S. Miller, Neil Brown,
	J. Bruce Fields, linux-nfs@vger.kernel.org

On Thu, 2012-01-26 at 13:11 +0000, Michael S. Tsirkin wrote:
> On Wed, Jan 25, 2012 at 12:27:14PM +0000, Ian Campbell wrote:
> > This prevents an issue where an ACK is delayed, a retransmit is queued (either
> > at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> > wire. If this happens to an NFS WRITE RPC then the write() system call
> > completes and the userspace process can continue, potentially modifying data
> > referenced by the retransmission before the retransmission occurs.
> > 
> > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > Cc: Neil Brown <neilb@suse.de>
> > Cc: "J. Bruce Fields" <bfields@fieldses.org>
> > Cc: linux-nfs@vger.kernel.org
> > Cc: netdev@vger.kernel.org
> 
> This doesn't include either of the two options you proposed to address
> the issue of the sender being blocked forever by the receiver in bridged
> setups with endpoints such as a tap device or a socket on the same box,
> does it?

There was never any response to Bruce's question:
http://thread.gmane.org/gmane.linux.network/210873/focus=44849

        Stupid question: Is it a requirement that you be safe against DOS by a
        rogue process with a tap device?  (And if so, does current code satisfy
        that requirement?)

IMHO the answer to both questions is no; there are plenty of ways a
rogue process with a tap device can wreak havoc.

> How about patching __skb_queue_tail to do a deep copy?
> That would seem to handle both tap and socket cases -
> any other ones left?

Wouldn't that mean we were frequently (almost always) copying for lots
of other cases too? That would rather defeat the purpose of being able
to hand pages off to the network stack in a zero copy fashion.

> If we do this, I think it would be beneficial to pass a flag
> to the destructor indicating that a deep copy was done:
> this would allow senders to detect that and adapt.

If you were doing a deep copy anyway you might as well create a
completely new skb and release the old one, thereby causing the
destructors to fire in the normal way for that skb. The copy wouldn't have
destructors because the pages would no longer be owned by the sender.

Ian.

> 
> > ---
> >  include/linux/sunrpc/xdr.h  |    2 ++
> >  include/linux/sunrpc/xprt.h |    5 ++++-
> >  net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
> >  net/sunrpc/svcsock.c        |    3 ++-
> >  net/sunrpc/xprt.c           |   12 ++++++++++++
> >  net/sunrpc/xprtsock.c       |    3 ++-
> >  6 files changed, 44 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
> > index a20970e..172f81e 100644
> > --- a/include/linux/sunrpc/xdr.h
> > +++ b/include/linux/sunrpc/xdr.h
> > @@ -16,6 +16,7 @@
> >  #include <asm/byteorder.h>
> >  #include <asm/unaligned.h>
> >  #include <linux/scatterlist.h>
> > +#include <linux/skbuff.h>
> >  
> >  /*
> >   * Buffer adjustment
> > @@ -57,6 +58,7 @@ struct xdr_buf {
> >  			tail[1];	/* Appended after page data */
> >  
> >  	struct page **	pages;		/* Array of contiguous pages */
> > +	struct skb_frag_destructor *destructor;
> >  	unsigned int	page_base,	/* Start of page data */
> >  			page_len,	/* Length of page data */
> >  			flags;		/* Flags for data disposition */
> > diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> > index 15518a1..75131eb 100644
> > --- a/include/linux/sunrpc/xprt.h
> > +++ b/include/linux/sunrpc/xprt.h
> > @@ -92,7 +92,10 @@ struct rpc_rqst {
> >  						/* A cookie used to track the
> >  						   state of the transport
> >  						   connection */
> > -	
> > +	struct skb_frag_destructor destructor;	/* SKB paged fragment
> > +						 * destructor for
> > +						 * transmitted pages*/
> > +
> >  	/*
> >  	 * Partial send handling
> >  	 */
> > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > index f0268ea..06f363f 100644
> > --- a/net/sunrpc/clnt.c
> > +++ b/net/sunrpc/clnt.c
> > @@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
> >  static void	call_reserveresult(struct rpc_task *task);
> >  static void	call_allocate(struct rpc_task *task);
> >  static void	call_decode(struct rpc_task *task);
> > +static void	call_complete(struct rpc_task *task);
> >  static void	call_bind(struct rpc_task *task);
> >  static void	call_bind_status(struct rpc_task *task);
> >  static void	call_transmit(struct rpc_task *task);
> > @@ -1115,6 +1116,8 @@ rpc_xdr_encode(struct rpc_task *task)
> >  			 (char *)req->rq_buffer + req->rq_callsize,
> >  			 req->rq_rcvsize);
> >  
> > +	req->rq_snd_buf.destructor = &req->destructor;
> > +
> >  	p = rpc_encode_header(task);
> >  	if (p == NULL) {
> >  		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
> > @@ -1278,6 +1281,7 @@ call_connect_status(struct rpc_task *task)
> >  static void
> >  call_transmit(struct rpc_task *task)
> >  {
> > +	struct rpc_rqst *req = task->tk_rqstp;
> >  	dprint_status(task);
> >  
> >  	task->tk_action = call_status;
> > @@ -1311,8 +1315,8 @@ call_transmit(struct rpc_task *task)
> >  	call_transmit_status(task);
> >  	if (rpc_reply_expected(task))
> >  		return;
> > -	task->tk_action = rpc_exit_task;
> > -	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
> > +	task->tk_action = call_complete;
> > +	skb_frag_destructor_unref(&req->destructor);
> >  }
> >  
> >  /*
> > @@ -1385,7 +1389,8 @@ call_bc_transmit(struct rpc_task *task)
> >  		return;
> >  	}
> >  
> > -	task->tk_action = rpc_exit_task;
> > +	task->tk_action = call_complete;
> > +	skb_frag_destructor_unref(&req->destructor);
> >  	if (task->tk_status < 0) {
> >  		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
> >  			"error: %d\n", task->tk_status);
> > @@ -1425,7 +1430,6 @@ call_bc_transmit(struct rpc_task *task)
> >  			"error: %d\n", task->tk_status);
> >  		break;
> >  	}
> > -	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
> >  }
> >  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
> >  
> > @@ -1591,12 +1595,14 @@ call_decode(struct rpc_task *task)
> >  		return;
> >  	}
> >  
> > -	task->tk_action = rpc_exit_task;
> > +	task->tk_action = call_complete;
> >  
> >  	if (decode) {
> >  		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
> >  						      task->tk_msg.rpc_resp);
> >  	}
> > +	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
> > +	skb_frag_destructor_unref(&req->destructor);
> >  	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
> >  			task->tk_status);
> >  	return;
> > @@ -1611,6 +1617,17 @@ out_retry:
> >  	}
> >  }
> >  
> > +/*
> > + * 8.	Wait for pages to be released by the network stack.
> > + */
> > +static void
> > +call_complete(struct rpc_task *task)
> > +{
> > +	dprintk("RPC: %5u call_complete result %d\n",
> > +		task->tk_pid, task->tk_status);
> > +	task->tk_action = rpc_exit_task;
> > +}
> > +
> >  static __be32 *
> >  rpc_encode_header(struct rpc_task *task)
> >  {
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 0e6b919..aa7f108 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
> >  	while (pglen > 0) {
> >  		if (slen == size)
> >  			flags = 0;
> > -		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> > +		result = kernel_sendpage(sock, *ppage, xdr->destructor,
> > +					 base, size, flags);
> >  		if (result > 0)
> >  			len += result;
> >  		if (result != size)
> > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > index c64c0ef..2137eb6 100644
> > --- a/net/sunrpc/xprt.c
> > +++ b/net/sunrpc/xprt.c
> > @@ -1101,6 +1101,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
> >  	xprt->xid = net_random();
> >  }
> >  
> > +static int xprt_complete_skb_pages(struct skb_frag_destructor *destroy)
> > +{
> > +	struct rpc_rqst	*req =
> > +		container_of(destroy, struct rpc_rqst, destructor);
> > +
> > +	dprintk("RPC: %5u completing skb pages\n", req->rq_task->tk_pid);
> > +	rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);
> 
> It seems that the sunrpc module can get unloaded
> at this point. The code for 'return 0' can then get
> overwritten. Right?
> 
> Not sure how important making module unloading robust
> is - if we wanted to fix this we'd need to move
> the callback code into core (hopefully generalizing
> it somewhat).
> 
> > +	return 0;
> > +}
> > +
> >  static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> >  {
> >  	struct rpc_rqst	*req = task->tk_rqstp;
> > @@ -1113,6 +1123,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> >  	req->rq_xid     = xprt_alloc_xid(xprt);
> >  	req->rq_release_snd_buf = NULL;
> >  	xprt_reset_majortimeo(req);
> > +	atomic_set(&req->destructor.ref, 1);
> > +	req->destructor.destroy = &xprt_complete_skb_pages;
> >  	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
> >  			req, ntohl(req->rq_xid));
> >  }
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index 38b2fec..5406977 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> >  		remainder -= len;
> >  		if (remainder != 0 || more)
> >  			flags |= MSG_MORE;
> > -		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
> > +		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
> > +					  base, len, flags);
> >  		if (remainder == 0 || err != len)
> >  			break;
> >  		sent += err;
> > -- 
> > 1.7.2.5
> > 




* Re: [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
  2012-01-30 15:51     ` Ian Campbell
@ 2012-01-30 16:23       ` Michael S. Tsirkin
  2012-01-30 16:43         ` Ian Campbell
  0 siblings, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2012-01-30 16:23 UTC (permalink / raw)
  To: Ian Campbell
  Cc: netdev@vger.kernel.org, David S. Miller, Neil Brown,
	J. Bruce Fields, linux-nfs@vger.kernel.org

On Mon, Jan 30, 2012 at 03:51:53PM +0000, Ian Campbell wrote:
> On Thu, 2012-01-26 at 13:11 +0000, Michael S. Tsirkin wrote:
> > On Wed, Jan 25, 2012 at 12:27:14PM +0000, Ian Campbell wrote:
> > > This prevents an issue where an ACK is delayed, a retransmit is queued (either
> > > at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> > > wire. If this happens to an NFS WRITE RPC then the write() system call
> > > completes and the userspace process can continue, potentially modifying data
> > > referenced by the retransmission before the retransmission occurs.
> > > 
> > > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > > Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> > > Cc: "David S. Miller" <davem@davemloft.net>
> > > Cc: Neil Brown <neilb@suse.de>
> > > Cc: "J. Bruce Fields" <bfields@fieldses.org>
> > > Cc: linux-nfs@vger.kernel.org
> > > Cc: netdev@vger.kernel.org
> > 
> > This doesn't include either of the two options you proposed to address
> > the issue of the sender being blocked forever by the receiver in bridged
> > setups with endpoints such as a tap device or a socket on the same box,
> > does it?
> 
> There was never any response to Bruce's question:
> http://thread.gmane.org/gmane.linux.network/210873/focus=44849
> 
>         Stupid question: Is it a requirement that you be safe against DOS by a
>         rogue process with a tap device?  (And if so, does current code satisfy
>         that requirement?)
> 
> IMHO the answer to both questions is no; there are plenty of ways a
> rogue process with a tap device can wreak havoc.

I thought the answer was an obvious yes :(
What are these ways a tap device can wreak havoc?

> > How about patching __skb_queue_tail to do a deep copy?
> > That would seem to handle both tap and socket cases -
> > any other ones left?
> 
> Wouldn't that mean we were frequently (almost always) copying for lots
> of other cases too? That would rather defeat the purpose of being able
> to hand pages off to the network stack in a zero copy fashion.

Yes, but the case of an RPC connection to the same box
is very rare, I think; not worth optimizing for.

> > If we do this, I think it would be beneficial to pass a flag
> > to the destructor indicating that a deep copy was done:
> > this would allow senders to detect that and adapt.
> 
> If you were doing a deep copy anyway you might as well create a
> completely new skb and release the old one, thereby causing the
> destructors to fire in the normal way for that skb. The copy wouldn't have
> destructors because the pages would no longer be owned by the sender.
> 
> Ian.

What I mean is that page pin + deep copy might be more expensive
than copying directly, so the owner of the original skb cares whether
we did a deep copy or the zero-copy transmit worked.
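
Concretely, the callback could grow a flag along these lines (a sketch
only; the flag is hypothetical, the rest reuses names from the patch):

	struct skb_frag_destructor {
		atomic_t ref;
		/* deep_copied tells the owner whether its pages went out
		 * zero-copy or were copied somewhere along the path, so
		 * it can adapt (e.g. copy up front next time). */
		int (*destroy)(struct skb_frag_destructor *destructor,
			       bool deep_copied);
	};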

> > 
> > > ---
> > >  include/linux/sunrpc/xdr.h  |    2 ++
> > >  include/linux/sunrpc/xprt.h |    5 ++++-
> > >  net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
> > >  net/sunrpc/svcsock.c        |    3 ++-
> > >  net/sunrpc/xprt.c           |   12 ++++++++++++
> > >  net/sunrpc/xprtsock.c       |    3 ++-
> > >  6 files changed, 44 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
> > > index a20970e..172f81e 100644
> > > --- a/include/linux/sunrpc/xdr.h
> > > +++ b/include/linux/sunrpc/xdr.h
> > > @@ -16,6 +16,7 @@
> > >  #include <asm/byteorder.h>
> > >  #include <asm/unaligned.h>
> > >  #include <linux/scatterlist.h>
> > > +#include <linux/skbuff.h>
> > >  
> > >  /*
> > >   * Buffer adjustment
> > > @@ -57,6 +58,7 @@ struct xdr_buf {
> > >  			tail[1];	/* Appended after page data */
> > >  
> > >  	struct page **	pages;		/* Array of contiguous pages */
> > > +	struct skb_frag_destructor *destructor;
> > >  	unsigned int	page_base,	/* Start of page data */
> > >  			page_len,	/* Length of page data */
> > >  			flags;		/* Flags for data disposition */
> > > diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> > > index 15518a1..75131eb 100644
> > > --- a/include/linux/sunrpc/xprt.h
> > > +++ b/include/linux/sunrpc/xprt.h
> > > @@ -92,7 +92,10 @@ struct rpc_rqst {
> > >  						/* A cookie used to track the
> > >  						   state of the transport
> > >  						   connection */
> > > -	
> > > +	struct skb_frag_destructor destructor;	/* SKB paged fragment
> > > +						 * destructor for
> > > +						 * transmitted pages*/
> > > +
> > >  	/*
> > >  	 * Partial send handling
> > >  	 */
> > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > > index f0268ea..06f363f 100644
> > > --- a/net/sunrpc/clnt.c
> > > +++ b/net/sunrpc/clnt.c
> > > @@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
> > >  static void	call_reserveresult(struct rpc_task *task);
> > >  static void	call_allocate(struct rpc_task *task);
> > >  static void	call_decode(struct rpc_task *task);
> > > +static void	call_complete(struct rpc_task *task);
> > >  static void	call_bind(struct rpc_task *task);
> > >  static void	call_bind_status(struct rpc_task *task);
> > >  static void	call_transmit(struct rpc_task *task);
> > > @@ -1115,6 +1116,8 @@ rpc_xdr_encode(struct rpc_task *task)
> > >  			 (char *)req->rq_buffer + req->rq_callsize,
> > >  			 req->rq_rcvsize);
> > >  
> > > +	req->rq_snd_buf.destructor = &req->destructor;
> > > +
> > >  	p = rpc_encode_header(task);
> > >  	if (p == NULL) {
> > >  		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
> > > @@ -1278,6 +1281,7 @@ call_connect_status(struct rpc_task *task)
> > >  static void
> > >  call_transmit(struct rpc_task *task)
> > >  {
> > > +	struct rpc_rqst *req = task->tk_rqstp;
> > >  	dprint_status(task);
> > >  
> > >  	task->tk_action = call_status;
> > > @@ -1311,8 +1315,8 @@ call_transmit(struct rpc_task *task)
> > >  	call_transmit_status(task);
> > >  	if (rpc_reply_expected(task))
> > >  		return;
> > > -	task->tk_action = rpc_exit_task;
> > > -	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
> > > +	task->tk_action = call_complete;
> > > +	skb_frag_destructor_unref(&req->destructor);
> > >  }
> > >  
> > >  /*
> > > @@ -1385,7 +1389,8 @@ call_bc_transmit(struct rpc_task *task)
> > >  		return;
> > >  	}
> > >  
> > > -	task->tk_action = rpc_exit_task;
> > > +	task->tk_action = call_complete;
> > > +	skb_frag_destructor_unref(&req->destructor);
> > >  	if (task->tk_status < 0) {
> > >  		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
> > >  			"error: %d\n", task->tk_status);
> > > @@ -1425,7 +1430,6 @@ call_bc_transmit(struct rpc_task *task)
> > >  			"error: %d\n", task->tk_status);
> > >  		break;
> > >  	}
> > > -	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
> > >  }
> > >  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
> > >  
> > > @@ -1591,12 +1595,14 @@ call_decode(struct rpc_task *task)
> > >  		return;
> > >  	}
> > >  
> > > -	task->tk_action = rpc_exit_task;
> > > +	task->tk_action = call_complete;
> > >  
> > >  	if (decode) {
> > >  		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
> > >  						      task->tk_msg.rpc_resp);
> > >  	}
> > > +	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
> > > +	skb_frag_destructor_unref(&req->destructor);
> > >  	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
> > >  			task->tk_status);
> > >  	return;
> > > @@ -1611,6 +1617,17 @@ out_retry:
> > >  	}
> > >  }
> > >  
> > > +/*
> > > + * 8.	Wait for pages to be released by the network stack.
> > > + */
> > > +static void
> > > +call_complete(struct rpc_task *task)
> > > +{
> > > +	dprintk("RPC: %5u call_complete result %d\n",
> > > +		task->tk_pid, task->tk_status);
> > > +	task->tk_action = rpc_exit_task;
> > > +}
> > > +
> > >  static __be32 *
> > >  rpc_encode_header(struct rpc_task *task)
> > >  {
> > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > > index 0e6b919..aa7f108 100644
> > > --- a/net/sunrpc/svcsock.c
> > > +++ b/net/sunrpc/svcsock.c
> > > @@ -198,7 +198,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
> > >  	while (pglen > 0) {
> > >  		if (slen == size)
> > >  			flags = 0;
> > > -		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> > > +		result = kernel_sendpage(sock, *ppage, xdr->destructor,
> > > +					 base, size, flags);
> > >  		if (result > 0)
> > >  			len += result;
> > >  		if (result != size)
> > > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > > index c64c0ef..2137eb6 100644
> > > --- a/net/sunrpc/xprt.c
> > > +++ b/net/sunrpc/xprt.c
> > > @@ -1101,6 +1101,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
> > >  	xprt->xid = net_random();
> > >  }
> > >  
> > > +static int xprt_complete_skb_pages(struct skb_frag_destructor *destroy)
> > > +{
> > > +	struct rpc_rqst	*req =
> > > +		container_of(destroy, struct rpc_rqst, destructor);
> > > +
> > > +	dprintk("RPC: %5u completing skb pages\n", req->rq_task->tk_pid);
> > > +	rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);
> > 
> > It seems that the sunrpc module can get unloaded
> > at this point. The code for 'return 0' can then get
> > overwritten. Right?
> > 
> > Not sure how important making module unloading robust
> > is - if we wanted to fix this we'd need to move
> > the callback code into core (hopefully generalizing
> > it somewhat). (A sketch of one possible shape for this
> > follows the patch below.)
> > 
> > > +	return 0;
> > > +}
> > > +
> > >  static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> > >  {
> > >  	struct rpc_rqst	*req = task->tk_rqstp;
> > > @@ -1113,6 +1123,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> > >  	req->rq_xid     = xprt_alloc_xid(xprt);
> > >  	req->rq_release_snd_buf = NULL;
> > >  	xprt_reset_majortimeo(req);
> > > +	atomic_set(&req->destructor.ref, 1);
> > > +	req->destructor.destroy = &xprt_complete_skb_pages;
> > >  	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
> > >  			req, ntohl(req->rq_xid));
> > >  }
> > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > index 38b2fec..5406977 100644
> > > --- a/net/sunrpc/xprtsock.c
> > > +++ b/net/sunrpc/xprtsock.c
> > > @@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> > >  		remainder -= len;
> > >  		if (remainder != 0 || more)
> > >  			flags |= MSG_MORE;
> > > -		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
> > > +		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
> > > +					  base, len, flags);
> > >  		if (remainder == 0 || err != len)
> > >  			break;
> > >  		sent += err;
> > > -- 
> > > 1.7.2.5
> > > 

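To make the unload race concrete, here is a minimal sketch of the
"move the callback into core" idea; the 'owner' field and this variant
of skb_frag_destructor_unref() are invented for illustration and are
not part of the posted series:

struct skb_frag_destructor {
	atomic_t ref;
	struct module *owner;	/* assumed new field */
	int (*destroy)(struct skb_frag_destructor *destroy);
};

/* Lives in net/core/skbuff.c, so everything that executes after the
 * module's ->destroy() callback returns is core text, not module text
 * that could have been unloaded. */
int skb_frag_destructor_unref(struct skb_frag_destructor *destroy)
{
	struct module *owner;

	if (destroy == NULL)
		return 0;
	if (!atomic_dec_and_test(&destroy->ref))
		return 0;
	owner = destroy->owner;		/* ->destroy() may free 'destroy' */
	destroy->destroy(destroy);
	module_put(owner);		/* drops the pin taken when armed */
	return 1;
}

sunrpc would then set req->destructor.owner = THIS_MODULE and pin
itself with __module_get() when it arms the destructor in
xprt_request_init().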

* Re: [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
  2012-01-30 16:23       ` Michael S. Tsirkin
@ 2012-01-30 16:43         ` Ian Campbell
  2012-01-30 18:06           ` Michael S. Tsirkin
  0 siblings, 1 reply; 7+ messages in thread
From: Ian Campbell @ 2012-01-30 16:43 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev@vger.kernel.org, David S. Miller, Neil Brown,
	J. Bruce Fields, linux-nfs@vger.kernel.org

On Mon, 2012-01-30 at 16:23 +0000, Michael S. Tsirkin wrote:
> On Mon, Jan 30, 2012 at 03:51:53PM +0000, Ian Campbell wrote:
> > On Thu, 2012-01-26 at 13:11 +0000, Michael S. Tsirkin wrote:
> > > On Wed, Jan 25, 2012 at 12:27:14PM +0000, Ian Campbell wrote:
> > > > This prevents an issue where an ACK is delayed, a retransmit is queued (either
> > > > at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> > > > wire. If this happens to an NFS WRITE RPC then the write() system call
> > > > completes and the userspace process can continue, potentially modifying data
> > > > referenced by the retransmission before the retransmission occurs.
> > > > 
> > > > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > > > Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> > > > Cc: "David S. Miller" <davem@davemloft.net>
> > > > Cc: Neil Brown <neilb@suse.de>
> > > > Cc: "J. Bruce Fields" <bfields@fieldses.org>
> > > > Cc: linux-nfs@vger.kernel.org
> > > > Cc: netdev@vger.kernel.org
> > > 
> > > This doesn't include either of the two options you proposed to address
> > > the sender-blocked-forever-by-receiver issue with bridged setups and
> > > endpoints such as a tap device or a socket on the same box,
> > > does it?
> > 
> > There was never any response to Bruce's question:
> > http://thread.gmane.org/gmane.linux.network/210873/focus=44849
> > 
> >         Stupid question: Is it a requirement that you be safe against DOS by a
> >         rogue process with a tap device?  (And if so, does current code satisfy
> >         that requirement?)
> > 
> > IMHO the answer to both questions is no, there are plenty of ways a
> > rogue process with a tap device can wreak havoc.
> 
> > I thought the answer is an obvious yes :(
> What are these ways tap can wreak havoc?

Can't they spoof traffic and all sorts of things like that? Hrm. I
suppose that's the same as any peer on the network, so we already
handle that sort of thing. Maybe that's a red herring then.

> > > How about patching __skb_queue_tail to do a deep copy?
> > > That would seem to handle both tap and socket cases -
> > > any other ones left?
> > 
> > Wouldn't that mean we were frequently (almost always) copying for lots
> > of other cases too? That would rather defeat the purpose of being able
> > to hand pages off to the network stack in a zero copy fashion.
> 
> Yes but the case of an rpc connection to the same box
> is very rare I think, not worth optimizing for.

But changing __skb_queue_tail doesn't only impact rpc connections to the
same box, does it? At least I can see plenty of callers of
__skb_queue_tail which don't look like they would want a copy to occur
-- plenty of drivers for one thing.

Perhaps in combination with a per-queue or per-socket flag to
enable it, it might work though?
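
A rough sketch of that opt-in; the flag name and the two skb helpers
are invented for illustration:

/* Only sockets that can sit on destructor-backed skbs indefinitely
 * (e.g. a tap endpoint) opt in to the deep copy; everyone else keeps
 * the zero-copy path. */
static inline void skb_queue_tail_owned(struct sock *sk,
					struct sk_buff_head *list,
					struct sk_buff *skb)
{
	if (sock_flag(sk, SOCK_FRAG_DEEPCOPY) &&	/* hypothetical flag */
	    skb_has_frag_destructors(skb))		/* hypothetical helper */
		skb_copy_frags_to_owner(skb);	/* copy, drop destructor refs */
	__skb_queue_tail(list, skb);
}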

> > > If we do this, I think it would be beneficial to pass a flag
> > > to the destructor indicating that a deep copy was done:
> > > this would allow senders to detect that and adapt.
> > 
> > If you were doing a deep copy anyway you might as well create a
> > completely new skb and release the old one, thereby causing the
> > destructors to fire in the normal way for that SKB. The copy wouldn't have
> > destructors because the pages would no longer be owned by the sender.
> > 
> > Ian.
> 
> What I mean is that page pin + deep copy might be more expensive
> than directly copying. So the owner of the original skb
> cares whether we did a deep copy or whether zero-copy transmit worked.

You mean so they can adaptively do a copy directly next time?

I think that would add a fair bit more complexity to what, as you point
out, is a fairly rare occurrence.

Ian.




* Re: [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
  2012-01-30 16:43         ` Ian Campbell
@ 2012-01-30 18:06           ` Michael S. Tsirkin
  0 siblings, 0 replies; 7+ messages in thread
From: Michael S. Tsirkin @ 2012-01-30 18:06 UTC (permalink / raw)
  To: Ian Campbell
  Cc: netdev@vger.kernel.org, David S. Miller, Neil Brown,
	J. Bruce Fields, linux-nfs@vger.kernel.org

On Mon, Jan 30, 2012 at 04:43:33PM +0000, Ian Campbell wrote:
> On Mon, 2012-01-30 at 16:23 +0000, Michael S. Tsirkin wrote:
> > On Mon, Jan 30, 2012 at 03:51:53PM +0000, Ian Campbell wrote:
> > > On Thu, 2012-01-26 at 13:11 +0000, Michael S. Tsirkin wrote:
> > > > On Wed, Jan 25, 2012 at 12:27:14PM +0000, Ian Campbell wrote:
> > > > > This prevents an issue where an ACK is delayed, a retransmit is queued (either
> > > > > at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> > > > > wire. If this happens to an NFS WRITE RPC then the write() system call
> > > > > completes and the userspace process can continue, potentially modifying data
> > > > > referenced by the retransmission before the retransmission occurs.
> > > > > 
> > > > > Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> > > > > Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> > > > > Cc: "David S. Miller" <davem@davemloft.net>
> > > > > Cc: Neil Brown <neilb@suse.de>
> > > > > Cc: "J. Bruce Fields" <bfields@fieldses.org>
> > > > > Cc: linux-nfs@vger.kernel.org
> > > > > Cc: netdev@vger.kernel.org
> > > > 
> > > > This doesn't include either of the two options you proposed to address
> > > > the sender-blocked-forever-by-receiver issue with bridged setups and
> > > > endpoints such as a tap device or a socket on the same box,
> > > > does it?
> > > 
> > > There was never any response to Bruce's question:
> > > http://thread.gmane.org/gmane.linux.network/210873/focus=44849
> > > 
> > >         Stupid question: Is it a requirement that you be safe against DOS by a
> > >         rogue process with a tap device?  (And if so, does current code satisfy
> > >         that requirement?)
> > > 
> > > IMHO the answer to both questions is no, there are plenty of ways a
> > > rogue process with a tap device can wreak havoc.
> > 
> > I thought the answer is an obvious yes :(
> > What are these ways tap can wreak havoc?
> 
> Can't they spoof traffic
> and all sorts of things like that?
> Hrm. I
> suppose that's the same as any peer on the network, so we already handle
> that sort of thing. Maybe that's a red herring then.

Right. It typically does not include DOS on the sender :)

> 
> > > > How about patching __skb_queue_tail to do a deep copy?
> > > > That would seem to handle both tap and socket cases -
> > > > any other ones left?
> > > 
> > > Wouldn't that mean we were frequently (almost always) copying for lots
> > > of other cases too? That would rather defeat the purpose of being able
> > > to hand pages off to the network stack in a zero copy fashion.
> > 
> > Yes but the case of an rpc connection to the same box
> > is very rare I think, not worth optimizing for.
> 
> But changing __skb_queue_tail doesn't only impact rpc connections to the
> same box, does it? At least I can see plenty of callers of
> __skb_queue_tail which don't look like they would want a copy to occur
> -- plenty of drivers for one thing.
> Perhaps in combination with a per-queue or per-socket flag to
> enable it, it might work though?

Right. I missed that. I'm guessing drivers don't hang on to skbs
indefinitely.  Still, copying is always safe - maybe the right thing to
do is to add an __skb_queue_tail variant that does not copy, and
gradually convert drivers that care to that API?
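
One way that split could look, with the helper names invented for
illustration (__skb_queue_tail() becomes safe by default and audited
callers are converted to the non-copying variant over time):

/* Queue without copying: only for callers audited not to hold the
 * skb (and the sender's pages behind it) indefinitely. */
static inline void __skb_queue_tail_nocopy(struct sk_buff_head *list,
					   struct sk_buff *skb)
{
	__skb_insert(skb, list->prev, (struct sk_buff *)list, list);
}

/* Safe default: deep-copy destructor-backed frags before queueing so
 * the sender's pages are released immediately. */
static inline void __skb_queue_tail(struct sk_buff_head *list,
				    struct sk_buff *skb)
{
	if (skb_has_frag_destructors(skb))	/* hypothetical helper */
		skb_copy_frags_to_owner(skb);	/* hypothetical helper */
	__skb_queue_tail_nocopy(list, skb);
}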

> > > > If we do this, I think it would be beneficial to pass a flag
> > > > to the destructor indicating that a deep copy was done:
> > > > this would allow senders to detect that and adapt.
> > > 
> > > If you were doing a deep copy anyway you might as well create a
> > > completely new skb and release the old one, thereby causing the
> > > destructors to fire in the normal way for that SKB. The copy wouldn't have
> > > destructors because the pages would no longer be owned by the sender.
> > > 
> > > Ian.
> > 
> > What I mean is that page pin + deep copy might be more expensive
> > than directly copying. So the owner of the original skb
> > cares whether we did a deep copy or whether zero-copy transmit worked.
> 
> You mean so they can adaptively do a copy directly next time?
> I think that would add a fair bit more complexity to what, as you point
> out, is a fairly rare occurrence.
> 
> Ian.

For sunrpc, yes, but I was thinking about utilizing this
mechanism for e.g. kvm in the future. It might be more common
there. I agree this might be a future extension.
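
As a strawman for the adaptive case, the destructor callback could
carry that one bit; the extra 'copied' parameter and the counter are
invented for illustration:

/* Strawman signature: report whether the stack deep-copied the pages
 * (so the sender paid for a copy anyway) or a genuine zero-copy
 * transmit completed. */
int (*destroy)(struct skb_frag_destructor *destroy, bool copied);

static int xprt_complete_skb_pages(struct skb_frag_destructor *d,
				   bool copied)
{
	struct rpc_rqst *req = container_of(d, struct rpc_rqst, destructor);

	if (copied)
		req->rq_xprt->deep_copies++;	/* hypothetical counter */
	rpc_wake_up_queued_task(&req->rq_xprt->pending, req->rq_task);
	return 0;
}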

-- 
MSt


end of thread

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
     [not found] <1327494389.24561.316.camel@zakaz.uk.xensource.com>
2012-01-25 12:27 ` [PATCH v3 5/6] net: add paged frag destructor support to kernel_sendpage Ian Campbell
2012-01-25 12:27 ` [PATCH v3 6/6] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack Ian Campbell
2012-01-26 13:11   ` Michael S. Tsirkin
2012-01-30 15:51     ` Ian Campbell
2012-01-30 16:23       ` Michael S. Tsirkin
2012-01-30 16:43         ` Ian Campbell
2012-01-30 18:06           ` Michael S. Tsirkin
